x264-snapshot-20120103-2245-stable/x264cli.h:

/*****************************************************************************
 * x264cli.h: x264cli common
 *****************************************************************************
 * Copyright (C) 2003-2011 x264 project
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#ifndef X264_CLI_H
#define X264_CLI_H

#include "common/common.h"

/* In microseconds */
#define UPDATE_INTERVAL 250000

typedef void *hnd_t;

static inline uint64_t gcd( uint64_t a, uint64_t b )
{
    while( 1 )
    {
        uint64_t c = a % b;
        if( !c )
            return b;
        a = b;
        b = c;
    }
}

static inline uint64_t lcm( uint64_t a, uint64_t b )
{
    return ( a / gcd( a, b ) ) * b;
}

static inline char *get_filename_extension( char *filename )
{
    char *ext = filename + strlen( filename );
    while( *ext != '.' && ext > filename )
        ext--;
    ext += *ext == '.';
    return ext;
}

void x264_cli_log( const char *name, int i_level, const char *fmt, ... );
void x264_cli_printf( int i_level, const char *fmt, ... );

#define RETURN_IF_ERR( cond, name, ret, ... )\
if( cond )\
{\
    x264_cli_log( name, X264_LOG_ERROR, __VA_ARGS__ );\
    return ret;\
}

#define FAIL_IF_ERR( cond, name, ... ) RETURN_IF_ERR( cond, name, -1, __VA_ARGS__ )
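
/* Example (illustrative sketch, not part of the original header): typical
 * use of FAIL_IF_ERR in a CLI module.  "raw" is the module name used as the
 * log prefix; the file-handle check is a hypothetical scenario.
 *
 *     FILE *fh = fopen( psz_filename, "rb" );
 *     FAIL_IF_ERR( !fh, "raw", "could not open input file `%s'\n", psz_filename )
 */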

#endif

x264-snapshot-20120103-2245-stable/x264.h:

/*****************************************************************************
 * x264.h: x264 public header
 *****************************************************************************
 * Copyright (C) 2003-2011 x264 project
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
 *          Jason Garrett-Glaser <darkshikari@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#ifndef X264_X264_H
#define X264_X264_H

#if !defined(_STDINT_H) && !defined(_STDINT_H_) && \
    !defined(_INTTYPES_H) && !defined(_INTTYPES_H_)
# ifdef _MSC_VER
#  pragma message("You must include stdint.h or inttypes.h before x264.h")
# else
#  warning You must include stdint.h or inttypes.h before x264.h
# endif
#endif

#include <stdarg.h>

#include "x264_config.h"

#define X264_BUILD 119

/* x264_t:
 *      opaque handle for an encoder */
typedef struct x264_t x264_t;

/****************************************************************************
 * NAL structure and functions
 ****************************************************************************/

enum nal_unit_type_e
{
    NAL_UNKNOWN     = 0,
    NAL_SLICE       = 1,
    NAL_SLICE_DPA   = 2,
    NAL_SLICE_DPB   = 3,
    NAL_SLICE_DPC   = 4,
    NAL_SLICE_IDR   = 5,    /* ref_idc != 0 */
    NAL_SEI         = 6,    /* ref_idc == 0 */
    NAL_SPS         = 7,
    NAL_PPS         = 8,
    NAL_AUD         = 9,
    NAL_FILLER      = 12,
    /* ref_idc == 0 for 6,9,10,11,12 */
};
enum nal_priority_e
{
    NAL_PRIORITY_DISPOSABLE = 0,
    NAL_PRIORITY_LOW        = 1,
    NAL_PRIORITY_HIGH       = 2,
    NAL_PRIORITY_HIGHEST    = 3,
};

/* The data within the payload is already NAL-encapsulated; the ref_idc and type
 * are merely in the struct for easy access by the calling application.
 * All data returned in an x264_nal_t, including the data in p_payload, is no longer
 * valid after the next call to x264_encoder_encode.  Thus it must be used or copied
 * before calling x264_encoder_encode or x264_encoder_headers again. */
typedef struct
{
    int i_ref_idc;  /* nal_priority_e */
    int i_type;     /* nal_unit_type_e */
    int b_long_startcode;
    int i_first_mb; /* If this NAL is a slice, the index of the first MB in the slice. */
    int i_last_mb;  /* If this NAL is a slice, the index of the last MB in the slice. */

    /* Size of payload in bytes. */
    int     i_payload;
    /* If param->b_annexb is set, Annex-B bytestream with startcode.
     * Otherwise, startcode is replaced with a 4-byte size.
     * This size is the size used in mp4/similar muxing; it is equal to i_payload-4 */
    uint8_t *p_payload;
} x264_nal_t;
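
/* Example (illustrative sketch): keeping encoded data across encode calls.
 * Because p_payload is invalidated by the next x264_encoder_encode, a caller
 * that queues output must copy it first.  Since the payloads of all NALs
 * returned by one call are sequential in memory, one copy of the call's
 * return value's worth of bytes suffices; queue_write is a hypothetical
 * function of the calling application.
 *
 *     int frame_size = x264_encoder_encode( h, &nal, &i_nal, &pic_in, &pic_out );
 *     if( frame_size > 0 )
 *         queue_write( nal[0].p_payload, frame_size );
 */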

/****************************************************************************
 * Encoder parameters
 ****************************************************************************/
/* CPU flags
 */
#define X264_CPU_CACHELINE_32    0x0000001  /* avoid memory loads that span the border between two cachelines */
#define X264_CPU_CACHELINE_64    0x0000002  /* 32/64 is the size of a cacheline in bytes */
#define X264_CPU_ALTIVEC         0x0000004
#define X264_CPU_MMX             0x0000008
#define X264_CPU_MMX2            0x0000010  /* MMX2 aka MMXEXT aka ISSE */
#define X264_CPU_MMXEXT          X264_CPU_MMX2
#define X264_CPU_SSE             0x0000020
#define X264_CPU_SSE2            0x0000040
#define X264_CPU_SSE2_IS_SLOW    0x0000080  /* avoid most SSE2 functions on Athlon64 */
#define X264_CPU_SSE2_IS_FAST    0x0000100  /* a few functions are only faster on Core2 and Phenom */
#define X264_CPU_SSE3            0x0000200
#define X264_CPU_SSSE3           0x0000400
#define X264_CPU_SHUFFLE_IS_FAST 0x0000800  /* Penryn, Nehalem, and Phenom have fast shuffle units */
#define X264_CPU_STACK_MOD4      0x0001000  /* if stack is only mod4 and not mod16 */
#define X264_CPU_SSE4            0x0002000  /* SSE4.1 */
#define X264_CPU_SSE42           0x0004000  /* SSE4.2 */
#define X264_CPU_SSE_MISALIGN    0x0008000  /* Phenom support for misaligned SSE instruction arguments */
#define X264_CPU_LZCNT           0x0010000  /* Phenom support for "leading zero count" instruction. */
#define X264_CPU_ARMV6           0x0020000
#define X264_CPU_NEON            0x0040000  /* ARM NEON */
#define X264_CPU_FAST_NEON_MRC   0x0080000  /* Transfer from NEON to ARM register is fast (Cortex-A9) */
#define X264_CPU_SLOW_CTZ        0x0100000  /* BSR/BSF x86 instructions are really slow on some CPUs */
#define X264_CPU_SLOW_ATOM       0x0200000  /* The Atom just sucks */
#define X264_CPU_AVX             0x0400000  /* AVX support: requires OS support even if YMM registers
                                             * aren't used. */
#define X264_CPU_XOP             0x0800000  /* AMD XOP */
#define X264_CPU_FMA4            0x1000000  /* AMD FMA4 */

/* Analyse flags
 */
#define X264_ANALYSE_I4x4       0x0001  /* Analyse i4x4 */
#define X264_ANALYSE_I8x8       0x0002  /* Analyse i8x8 (requires 8x8 transform) */
#define X264_ANALYSE_PSUB16x16  0x0010  /* Analyse p16x8, p8x16 and p8x8 */
#define X264_ANALYSE_PSUB8x8    0x0020  /* Analyse p8x4, p4x8, p4x4 */
#define X264_ANALYSE_BSUB16x16  0x0100  /* Analyse b16x8, b8x16 and b8x8 */
#define X264_DIRECT_PRED_NONE        0
#define X264_DIRECT_PRED_SPATIAL     1
#define X264_DIRECT_PRED_TEMPORAL    2
#define X264_DIRECT_PRED_AUTO        3
#define X264_ME_DIA                  0
#define X264_ME_HEX                  1
#define X264_ME_UMH                  2
#define X264_ME_ESA                  3
#define X264_ME_TESA                 4
#define X264_CQM_FLAT                0
#define X264_CQM_JVT                 1
#define X264_CQM_CUSTOM              2
#define X264_RC_CQP                  0
#define X264_RC_CRF                  1
#define X264_RC_ABR                  2
#define X264_QP_AUTO                 0
#define X264_AQ_NONE                 0
#define X264_AQ_VARIANCE             1
#define X264_AQ_AUTOVARIANCE         2
#define X264_B_ADAPT_NONE            0
#define X264_B_ADAPT_FAST            1
#define X264_B_ADAPT_TRELLIS         2
#define X264_WEIGHTP_NONE            0
#define X264_WEIGHTP_SIMPLE          1
#define X264_WEIGHTP_SMART           2
#define X264_B_PYRAMID_NONE          0
#define X264_B_PYRAMID_STRICT        1
#define X264_B_PYRAMID_NORMAL        2
#define X264_KEYINT_MIN_AUTO         0
#define X264_KEYINT_MAX_INFINITE     (1<<30)

static const char * const x264_direct_pred_names[] = { "none", "spatial", "temporal", "auto", 0 };
static const char * const x264_motion_est_names[] = { "dia", "hex", "umh", "esa", "tesa", 0 };
static const char * const x264_b_pyramid_names[] = { "none", "strict", "normal", 0 };
static const char * const x264_overscan_names[] = { "undef", "show", "crop", 0 };
static const char * const x264_vidformat_names[] = { "component", "pal", "ntsc", "secam", "mac", "undef", 0 };
static const char * const x264_fullrange_names[] = { "off", "on", 0 };
static const char * const x264_colorprim_names[] = { "", "bt709", "undef", "", "bt470m", "bt470bg", "smpte170m", "smpte240m", "film", 0 };
static const char * const x264_transfer_names[] = { "", "bt709", "undef", "", "bt470m", "bt470bg", "smpte170m", "smpte240m", "linear", "log100", "log316", 0 };
static const char * const x264_colmatrix_names[] = { "GBR", "bt709", "undef", "", "fcc", "bt470bg", "smpte170m", "smpte240m", "YCgCo", 0 };
static const char * const x264_nal_hrd_names[] = { "none", "vbr", "cbr", 0 };

/* Colorspace type */
#define X264_CSP_MASK           0x00ff  /* */
#define X264_CSP_NONE           0x0000  /* Invalid mode     */
#define X264_CSP_I420           0x0001  /* yuv 4:2:0 planar */
#define X264_CSP_YV12           0x0002  /* yvu 4:2:0 planar */
#define X264_CSP_NV12           0x0003  /* yuv 4:2:0, with one y plane and one packed u+v */
#define X264_CSP_I422           0x0004  /* yuv 4:2:2 planar */
#define X264_CSP_YV16           0x0005  /* yvu 4:2:2 planar */
#define X264_CSP_NV16           0x0006  /* yuv 4:2:2, with one y plane and one packed u+v */
#define X264_CSP_I444           0x0007  /* yuv 4:4:4 planar */
#define X264_CSP_YV24           0x0008  /* yvu 4:4:4 planar */
#define X264_CSP_BGR            0x0009  /* packed bgr 24bits   */
#define X264_CSP_BGRA           0x000a  /* packed bgr 32bits   */
#define X264_CSP_RGB            0x000b  /* packed rgb 24bits   */
#define X264_CSP_MAX            0x000c  /* end of list */
#define X264_CSP_VFLIP          0x1000  /* the csp is vertically flipped */
#define X264_CSP_HIGH_DEPTH     0x2000  /* the csp has a depth of 16 bits per pixel component */

/* Slice type */
#define X264_TYPE_AUTO          0x0000  /* Let x264 choose the right type */
#define X264_TYPE_IDR           0x0001
#define X264_TYPE_I             0x0002
#define X264_TYPE_P             0x0003
#define X264_TYPE_BREF          0x0004  /* Non-disposable B-frame */
#define X264_TYPE_B             0x0005
#define X264_TYPE_KEYFRAME      0x0006  /* IDR or I depending on b_open_gop option */
#define IS_X264_TYPE_I(x) ((x)==X264_TYPE_I || (x)==X264_TYPE_IDR)
#define IS_X264_TYPE_B(x) ((x)==X264_TYPE_B || (x)==X264_TYPE_BREF)

/* Log level */
#define X264_LOG_NONE          (-1)
#define X264_LOG_ERROR          0
#define X264_LOG_WARNING        1
#define X264_LOG_INFO           2
#define X264_LOG_DEBUG          3

/* Threading */
#define X264_THREADS_AUTO 0 /* Automatically select optimal number of threads */
#define X264_SYNC_LOOKAHEAD_AUTO (-1) /* Automatically select optimal lookahead thread buffer size */

/* HRD */
#define X264_NAL_HRD_NONE            0
#define X264_NAL_HRD_VBR             1
#define X264_NAL_HRD_CBR             2

/* Zones: override ratecontrol or other options for specific sections of the video.
 * See x264_encoder_reconfig() for which options can be changed.
 * If zones overlap, whichever comes later in the list takes precedence. */
typedef struct
{
    int i_start, i_end; /* range of frame numbers */
    int b_force_qp; /* whether to use qp vs bitrate factor */
    int i_qp;
    float f_bitrate_factor;
    struct x264_param_t *param;
} x264_zone_t;
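
/* Example (illustrative sketch): two zones set through the string interface,
 * forcing QP 20 on frames 0-99 and halving the bitrate factor on frames
 * 500-999.  The zones array could be filled directly for the same effect.
 *
 *     param.rc.psz_zones = "0,99,q=20/500,999,b=0.5";
 */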

typedef struct x264_param_t
{
    /* CPU flags */
    unsigned int cpu;
    int         i_threads;       /* encode multiple frames in parallel */
    int         b_sliced_threads;  /* Whether to use slice-based threading. */
    int         b_deterministic; /* whether to allow non-deterministic optimizations when threaded */
    int         b_cpu_independent; /* force canonical behavior rather than cpu-dependent optimal algorithms */
    int         i_sync_lookahead; /* threaded lookahead buffer */

    /* Video Properties */
    int         i_width;
    int         i_height;
    int         i_csp;         /* CSP of encoded bitstream */
    int         i_level_idc;
    int         i_frame_total; /* number of frames to encode if known, else 0 */

    /* NAL HRD
     * Uses Buffering and Picture Timing SEIs to signal HRD
     * The HRD in H.264 was not designed with VFR in mind.
     * It is therefore not recommended to use NAL HRD with VFR.
     * Furthermore, reconfiguring the VBV (via x264_encoder_reconfig)
     * will currently generate invalid HRD. */
    int         i_nal_hrd;

    struct
    {
        /* they will be reduced so that 0 < x <= 65535 and coprime */
        int         i_sar_height;
        int         i_sar_width;

        int         i_overscan;    /* 0=undef, 1=no overscan, 2=overscan */

        /* see h264 annex E for the values of the following */
        int         i_vidformat;
        int         b_fullrange;
        int         i_colorprim;
        int         i_transfer;
        int         i_colmatrix;
        int         i_chroma_loc;    /* both top & bottom */
    } vui;

    /* Bitstream parameters */
    int         i_frame_reference;  /* Maximum number of reference frames */
    int         i_dpb_size;         /* Force a DPB size larger than that implied by B-frames and reference frames.
                                     * Useful in combination with interactive error resilience. */
    int         i_keyint_max;       /* Force an IDR keyframe at this interval */
    int         i_keyint_min;       /* Scenecuts closer together than this are coded as I, not IDR. */
    int         i_scenecut_threshold; /* how aggressively to insert extra I frames */
    int         b_intra_refresh;    /* Whether or not to use periodic intra refresh instead of IDR frames. */

    int         i_bframe;   /* how many B-frames between two reference pictures */
    int         i_bframe_adaptive;
    int         i_bframe_bias;
    int         i_bframe_pyramid;   /* Keep some B-frames as references: 0=off, 1=strict hierarchical, 2=normal */
    int         b_open_gop;
    int         b_bluray_compat;

    int         b_deblocking_filter;
    int         i_deblocking_filter_alphac0;    /* [-6, 6] -6 light filter, 6 strong */
    int         i_deblocking_filter_beta;       /* [-6, 6]  idem */

    int         b_cabac;
    int         i_cabac_init_idc;

    int         b_interlaced;
    int         b_constrained_intra;

    int         i_cqm_preset;
    char        *psz_cqm_file;      /* JM format */
    uint8_t     cqm_4iy[16];        /* used only if i_cqm_preset == X264_CQM_CUSTOM */
    uint8_t     cqm_4py[16];
    uint8_t     cqm_4ic[16];
    uint8_t     cqm_4pc[16];
    uint8_t     cqm_8iy[64];
    uint8_t     cqm_8py[64];
    uint8_t     cqm_8ic[64];
    uint8_t     cqm_8pc[64];

    /* Log */
    void        (*pf_log)( void *, int i_level, const char *psz, va_list );
    void        *p_log_private;
    int         i_log_level;
    int         b_visualize;
    char        *psz_dump_yuv;  /* filename for reconstructed frames */

    /* Encoder analyser parameters */
    struct
    {
        unsigned int intra;     /* intra partitions */
        unsigned int inter;     /* inter partitions */

        int          b_transform_8x8;
        int          i_weighted_pred; /* weighting for P-frames */
        int          b_weighted_bipred; /* implicit weighting for B-frames */
        int          i_direct_mv_pred; /* spatial vs temporal mv prediction */
        int          i_chroma_qp_offset;

        int          i_me_method; /* motion estimation algorithm to use (X264_ME_*) */
        int          i_me_range; /* integer pixel motion estimation search range (from predicted mv) */
        int          i_mv_range; /* maximum length of a mv (in pixels). -1 = auto, based on level */
        int          i_mv_range_thread; /* minimum space between threads. -1 = auto, based on number of threads. */
        int          i_subpel_refine; /* subpixel motion estimation quality */
        int          b_chroma_me; /* chroma ME for subpel and mode decision in P-frames */
        int          b_mixed_references; /* allow each mb partition to have its own reference number */
        int          i_trellis;  /* trellis RD quantization */
        int          b_fast_pskip; /* early SKIP detection on P-frames */
        int          b_dct_decimate; /* transform coefficient thresholding on P-frames */
        int          i_noise_reduction; /* adaptive pseudo-deadzone */
        float        f_psy_rd; /* Psy RD strength */
        float        f_psy_trellis; /* Psy trellis strength */
        int          b_psy; /* Toggle all psy optimizations */

        /* the deadzone size that will be used in luma quantization */
        int          i_luma_deadzone[2]; /* {inter, intra} */

        int          b_psnr;    /* compute and print PSNR stats */
        int          b_ssim;    /* compute and print SSIM stats */
    } analyse;

    /* Rate control parameters */
    struct
    {
        int         i_rc_method;    /* X264_RC_* */

        int         i_qp_constant;  /* 0 to (51 + 6*(x264_bit_depth-8)). 0=lossless */
        int         i_qp_min;       /* min allowed QP value */
        int         i_qp_max;       /* max allowed QP value */
        int         i_qp_step;      /* max QP step between frames */

        int         i_bitrate;
        float       f_rf_constant;  /* 1pass VBR, nominal QP */
        float       f_rf_constant_max;  /* In CRF mode, maximum CRF as caused by VBV */
        float       f_rate_tolerance;
        int         i_vbv_max_bitrate;
        int         i_vbv_buffer_size;
        float       f_vbv_buffer_init; /* <=1: fraction of buffer_size. >1: kbit */
        float       f_ip_factor;
        float       f_pb_factor;

        int         i_aq_mode;      /* psy adaptive QP. (X264_AQ_*) */
        float       f_aq_strength;
        int         b_mb_tree;      /* Macroblock-tree ratecontrol. */
        int         i_lookahead;

        /* 2pass */
        int         b_stat_write;   /* Enable stat writing in psz_stat_out */
        char        *psz_stat_out;
        int         b_stat_read;    /* Read stat from psz_stat_in and use it */
        char        *psz_stat_in;

        /* 2pass params (same as ffmpeg ones) */
        float       f_qcompress;    /* 0.0 => cbr, 1.0 => constant qp */
        float       f_qblur;        /* temporally blur quants */
        float       f_complexity_blur; /* temporally blur complexity */
        x264_zone_t *zones;         /* ratecontrol overrides */
        int         i_zones;        /* number of zone_t's */
        char        *psz_zones;     /* alternate method of specifying zones */
    } rc;
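
    /* Example (illustrative sketch): single-pass CRF 23, capped for a
     * hypothetical streaming target with a 2500 kbit VBV.
     *
     *     param.rc.i_rc_method       = X264_RC_CRF;
     *     param.rc.f_rf_constant     = 23;
     *     param.rc.i_vbv_max_bitrate = 2500;
     *     param.rc.i_vbv_buffer_size = 2500;
     */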

    /* Cropping Rectangle parameters: added to those implicitly defined by
       non-mod16 video resolutions. */
    struct
    {
        unsigned int i_left;
        unsigned int i_top;
        unsigned int i_right;
        unsigned int i_bottom;
    } crop_rect;

    /* frame packing arrangement flag */
    int i_frame_packing;

    /* Muxing parameters */
    int b_aud;                  /* generate access unit delimiters */
    int b_repeat_headers;       /* put SPS/PPS before each keyframe */
    int b_annexb;               /* if set, place start codes (4 bytes) before NAL units,
                                 * otherwise place size (4 bytes) before NAL units. */
    int i_sps_id;               /* SPS and PPS id number */
    int b_vfr_input;            /* VFR input.  If 1, use timebase and timestamps for ratecontrol purposes.
                                 * If 0, use fps only. */
    int b_pulldown;             /* use explicitly set timebase for CFR */
    uint32_t i_fps_num;
    uint32_t i_fps_den;
    uint32_t i_timebase_num;    /* Timebase numerator */
    uint32_t i_timebase_den;    /* Timebase denominator */

    int b_tff;

    /* Pulldown:
     * The correct pic_struct must be passed with each input frame.
     * The input timebase should be the timebase corresponding to the output framerate. This should be constant.
     * e.g. for 3:2 pulldown timebase should be 1001/30000
     * The PTS passed with each frame must be the PTS of the frame after pulldown is applied.
     * Frame doubling and tripling require b_vfr_input set to zero (see H.264 Table D-1)
     *
     * Pulldown changes are not clearly defined in H.264. Therefore, it is the calling app's responsibility to manage this.
     */

    int b_pic_struct;
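
    /* Example (illustrative sketch): parameter setup for soft 3:2 pulldown of
     * 24000/1001 content signalled at a 30000/1001 output frame rate, per the
     * description above.  Each input picture must then carry the proper
     * pic_struct_e value in i_pic_struct.
     *
     *     param.b_vfr_input    = 0;
     *     param.b_pic_struct   = 1;
     *     param.i_timebase_num = 1001;
     *     param.i_timebase_den = 30000;
     */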

    /* Fake Interlaced.
     *
     * Used only when b_interlaced=0. Setting this flag makes it possible to flag the stream as PAFF interlaced yet
     * encode all frames progressively. It is useful for encoding 25p and 30p Blu-Ray streams.
     */

    int b_fake_interlaced;

    /* Slicing parameters */
    int i_slice_max_size;    /* Max size per slice in bytes; includes estimated NAL overhead. */
    int i_slice_max_mbs;     /* Max number of MBs per slice; overrides i_slice_count. */
    int i_slice_count;       /* Number of slices per frame: forces rectangular slices. */

    /* Optional callback for freeing this x264_param_t when it is done being used.
     * Only used when the x264_param_t sits in memory for an indefinite period of time,
     * i.e. when an x264_param_t is passed to x264_t in an x264_picture_t or in zones.
     * Not used when x264_encoder_reconfig is called directly. */
    void (*param_free)( void* );

    /* Optional low-level callback for low-latency encoding.  Called for each output NAL unit
     * immediately after the NAL unit is finished encoding.  This allows the calling application
     * to begin processing video data (e.g. by sending packets over a network) before the frame
     * is done encoding.
     *
     * This callback MUST do the following in order to work correctly:
     * 1) Have available an output buffer of at least size nal->i_payload*3/2 + 5 + 16.
     * 2) Call x264_nal_encode( h, dst, nal ), where dst is the output buffer.
     * After these steps, the content of nal is valid and can be used in the same way as if
     * the NAL unit were output by x264_encoder_encode.
     *
     * This does not need to be synchronous with the encoding process: the data pointed to
     * by nal (both before and after x264_nal_encode) will remain valid until the next
     * x264_encoder_encode call.  The callback must be re-entrant.
     *
     * This callback does not work with frame-based threads; threads must be disabled
     * or sliced-threads enabled.  This callback also does not work as one would expect
     * with HRD -- since the buffering period SEI cannot be calculated until the frame
     * is finished encoding, it will not be sent via this callback.
     *
     * Note also that the NALs are not necessarily returned in order when sliced threads is
     *      enabled.  Accordingly, the variables i_first_mb and i_last_mb are available in
     * x264_nal_t to help the calling application reorder the slices if necessary.
     *
     * When this callback is enabled, x264_encoder_encode does not return valid NALs;
     * the calling application is expected to acquire all output NALs through the callback.
     *
     * It is generally sensible to combine this callback with a use of slice-max-mbs or
     * slice-max-size. */
    void (*nalu_process) ( x264_t *h, x264_nal_t *nal );
} x264_param_t;

void x264_nal_encode( x264_t *h, uint8_t *dst, x264_nal_t *nal );
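
/* Example (illustrative sketch): a minimal nalu_process callback as described
 * above.  The VLA is sized per the documented minimum output buffer;
 * send_packet is a hypothetical function of the calling application.
 *
 *     static void on_nalu( x264_t *h, x264_nal_t *nal )
 *     {
 *         uint8_t buf[nal->i_payload*3/2 + 5 + 16];
 *         x264_nal_encode( h, buf, nal );
 *         send_packet( nal->p_payload, nal->i_payload );
 *     }
 *
 *     param.nalu_process = on_nalu;
 */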

/****************************************************************************
 * H.264 level restriction information
 ****************************************************************************/

typedef struct
{
    int level_idc;
    int mbps;        /* max macroblock processing rate (macroblocks/sec) */
    int frame_size;  /* max frame size (macroblocks) */
    int dpb;         /* max decoded picture buffer (bytes) */
    int bitrate;     /* max bitrate (kbit/sec) */
    int cpb;         /* max vbv buffer (kbit) */
    int mv_range;    /* max vertical mv component range (pixels) */
    int mvs_per_2mb; /* max mvs per 2 consecutive mbs. */
    int slice_rate;  /* ?? */
    int mincr;       /* min compression ratio */
    int bipred8x8;   /* limit bipred to >=8x8 */
    int direct8x8;   /* limit b_direct to >=8x8 */
    int frame_only;  /* forbid interlacing */
} x264_level_t;

/* all of the levels defined in the standard, terminated by .level_idc=0 */
extern const x264_level_t x264_levels[];

/****************************************************************************
 * Basic parameter handling functions
 ****************************************************************************/

/* x264_param_default:
 *      fill x264_param_t with default values and do CPU detection */
void    x264_param_default( x264_param_t * );

/* x264_param_parse:
 *  set one parameter by name.
 *  returns 0 on success, or returns one of the following errors.
 *  note: BAD_VALUE occurs only if it can't even parse the value,
 *  numerical range is not checked until x264_encoder_open() or
 *  x264_encoder_reconfig().
 *  value=NULL means "true" for boolean options, but is a BAD_VALUE for non-booleans. */
#define X264_PARAM_BAD_NAME  (-1)
#define X264_PARAM_BAD_VALUE (-2)
int x264_param_parse( x264_param_t *, const char *name, const char *value );
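
/* Example (illustrative sketch): setting an option by name and distinguishing
 * the two error codes.
 *
 *     int ret = x264_param_parse( &param, "subme", "9" );
 *     if( ret == X264_PARAM_BAD_NAME )
 *         fprintf( stderr, "unknown option\n" );
 *     else if( ret == X264_PARAM_BAD_VALUE )
 *         fprintf( stderr, "bad value\n" );
 */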

/****************************************************************************
 * Advanced parameter handling functions
 ****************************************************************************/

/* These functions expose the full power of x264's preset-tune-profile system for
 * easy adjustment of large numbers of internal parameters.
 *
 * In order to replicate x264CLI's option handling, these functions MUST be called
 * in the following order:
 * 1) x264_param_default_preset
 * 2) Custom user options (via param_parse or directly assigned variables)
 * 3) x264_param_apply_fastfirstpass
 * 4) x264_param_apply_profile
 *
 * Additionally, x264CLI does not apply step 3 if the preset chosen is "placebo"
 * or --slow-firstpass is set. */

/* x264_param_default_preset:
 *      The same as x264_param_default, but also use the passed preset and tune
 *      to modify the default settings.
 *      (either can be NULL, which implies no preset or no tune, respectively)
 *
 *      Currently available presets are, ordered from fastest to slowest: */
static const char * const x264_preset_names[] = { "ultrafast", "superfast", "veryfast", "faster", "fast", "medium", "slow", "slower", "veryslow", "placebo", 0 };

/*      The presets can also be indexed numerically, as in:
 *      x264_param_default_preset( &param, "3", ... )
 *      with ultrafast mapping to "0" and placebo mapping to "9".  This mapping may
 *      of course change if new presets are added in between, but will always be
 *      ordered from fastest to slowest.
 *
 *      Warning: the speed of these presets scales dramatically.  Ultrafast is a full
 *      100 times faster than placebo!
 *
 *      Currently available tunings are: */
static const char * const x264_tune_names[] = { "film", "animation", "grain", "stillimage", "psnr", "ssim", "fastdecode", "zerolatency", 0 };

/*      Multiple tunings can be used if separated by a delimiter in ",./-+",
 *      however multiple psy tunings cannot be used.
 *      film, animation, grain, stillimage, psnr, and ssim are psy tunings.
 *
 *      returns 0 on success, negative on failure (e.g. invalid preset/tune name). */
int     x264_param_default_preset( x264_param_t *, const char *preset, const char *tune );

/* x264_param_apply_fastfirstpass:
 *      If first-pass mode is set (rc.b_stat_read == 0, rc.b_stat_write == 1),
 *      modify the encoder settings to disable options generally not useful on
 *      the first pass. */
void    x264_param_apply_fastfirstpass( x264_param_t * );

/* x264_param_apply_profile:
 *      Applies the restrictions of the given profile.
 *      Currently available profiles are, from most to least restrictive: */
static const char * const x264_profile_names[] = { "baseline", "main", "high", "high10", "high422", "high444", 0 };

/*      (can be NULL, in which case the function will do nothing)
 *
 *      Does NOT guarantee that the given profile will be used: if the restrictions
 *      of "High" are applied to settings that are already Baseline-compatible, the
 *      stream will remain baseline.  In short, it does not increase settings, only
 *      decrease them.
 *
 *      returns 0 on success, negative on failure (e.g. invalid profile name). */
int     x264_param_apply_profile( x264_param_t *, const char *profile );
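
/* Example (illustrative sketch): the call order mandated above, shown for a
 * hypothetical low-latency configuration.
 *
 *     x264_param_t param;
 *     if( x264_param_default_preset( &param, "veryfast", "zerolatency" ) < 0 )
 *         return -1;
 *     param.i_width  = 1280;                      (step 2: custom user options)
 *     param.i_height = 720;
 *     x264_param_apply_fastfirstpass( &param );   (step 3: no-op unless pass 1)
 *     if( x264_param_apply_profile( &param, "main" ) < 0 )
 *         return -1;                              (step 4)
 */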

/****************************************************************************
 * Picture structures and functions
 ****************************************************************************/

/* x264_bit_depth:
 *      Specifies the number of bits per pixel that x264 uses. This is also the
 *      bit depth that x264 encodes in. If this value is > 8, x264 will read
 *      two bytes of input data for each pixel sample, and expect the upper
 *      (16-x264_bit_depth) bits to be zero.
 *      Note: The flag X264_CSP_HIGH_DEPTH must be used to specify the
 *      colorspace depth as well. */
extern const int x264_bit_depth;
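
/* Example (illustrative sketch): choosing an input colorspace that matches
 * the compiled bit depth.
 *
 *     int csp = X264_CSP_I420;
 *     if( x264_bit_depth > 8 )
 *         csp |= X264_CSP_HIGH_DEPTH;   (input becomes 16 bits per sample)
 */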

/* x264_chroma_format:
 *      Specifies the chroma formats that x264 supports encoding. When this
 *      value is non-zero, then it represents a X264_CSP_* that is the only
 *      chroma format that x264 supports encoding. If the value is 0 then
 *      there are no restrictions. */
extern const int x264_chroma_format;

enum pic_struct_e
{
    PIC_STRUCT_AUTO              = 0, // automatically decide (default)
    PIC_STRUCT_PROGRESSIVE       = 1, // progressive frame
    // "TOP" and "BOTTOM" are not supported in x264 (PAFF only)
    PIC_STRUCT_TOP_BOTTOM        = 4, // top field followed by bottom
    PIC_STRUCT_BOTTOM_TOP        = 5, // bottom field followed by top
    PIC_STRUCT_TOP_BOTTOM_TOP    = 6, // top field, bottom field, top field repeated
    PIC_STRUCT_BOTTOM_TOP_BOTTOM = 7, // bottom field, top field, bottom field repeated
    PIC_STRUCT_DOUBLE            = 8, // double frame
    PIC_STRUCT_TRIPLE            = 9, // triple frame
};

typedef struct
{
    double cpb_initial_arrival_time;
    double cpb_final_arrival_time;
    double cpb_removal_time;

    double dpb_output_time;
} x264_hrd_t;

/* Arbitrary user SEI:
 * Payload size is in bytes and the payload pointer must be valid.
 * Payload types and syntax can be found in Annex D of the H.264 Specification.
 * SEI payload alignment bits as described in Annex D must be included at the
 * end of the payload if needed.
 * The payload should not be NAL-encapsulated.
 * Payloads are written first in order of input, apart from in the case when HRD
 * is enabled where payloads are written after the Buffering Period SEI. */

typedef struct
{
    int payload_size;
    int payload_type;
    uint8_t *payload;
} x264_sei_payload_t;

typedef struct
{
    int num_payloads;
    x264_sei_payload_t *payloads;
    /* In: optional callback to free each payload AND x264_sei_payload_t when used. */
    void (*sei_free)( void* );
} x264_sei_t;
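
/* Example (illustrative sketch): attaching one user SEI payload to a picture.
 * udata and udata_size are the application's own buffer and size; the payload
 * must follow the Annex D syntax for the chosen type (type 5 is
 * user_data_unregistered).
 *
 *     x264_sei_payload_t sei_payload;
 *     sei_payload.payload_size = udata_size;
 *     sei_payload.payload_type = 5;
 *     sei_payload.payload      = udata;
 *     pic_in.extra_sei.num_payloads = 1;
 *     pic_in.extra_sei.payloads     = &sei_payload;
 *     pic_in.extra_sei.sei_free     = NULL;   (caller keeps ownership)
 */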

typedef struct
{
    int     i_csp;       /* Colorspace */
    int     i_plane;     /* Number of image planes */
    int     i_stride[4]; /* Strides for each plane */
    uint8_t *plane[4];   /* Pointers to each plane */
} x264_image_t;

typedef struct
{
    /* In: an array of quantizer offsets to be applied to this image during encoding.
     *     These are added on top of the decisions made by x264.
     *     Offsets can be fractional; they are added before QPs are rounded to integer.
     *     Adaptive quantization must be enabled to use this feature.  Behavior if quant
     *     offsets differ between encoding passes is undefined.
     *
     *     Array contains one offset per macroblock, in raster scan order.  In interlaced
     *     mode, top-field MBs and bottom-field MBs are interleaved at the row level. */
    float *quant_offsets;
    /* In: optional callback to free quant_offsets when used.
     *     Useful if one wants to use a different quant_offset array for each frame. */
    void (*quant_offsets_free)( void* );
} x264_image_properties_t;
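
/* Example (illustrative sketch): lowering the QP of the first macroblock row
 * by 2, one float per MB in raster order.  An AQ mode must be enabled; here
 * x264 frees the array through the callback when it is done.
 *
 *     int mb_width  = (param.i_width  + 15) / 16;
 *     int mb_height = (param.i_height + 15) / 16;
 *     float *offsets = calloc( mb_width * mb_height, sizeof(float) );
 *     for( int i = 0; i < mb_width; i++ )
 *         offsets[i] = -2.0f;
 *     pic_in.prop.quant_offsets      = offsets;
 *     pic_in.prop.quant_offsets_free = free;
 */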

typedef struct
{
    /* In: force picture type (if not auto)
     *     If x264 encoding parameters are violated in the forcing of picture types,
     *     x264 will correct the input picture type and log a warning.
     *     The quality of frametype decisions may suffer if a great deal of fine-grained
     *     mixing of auto and forced frametypes is done.
     * Out: type of the picture encoded */
    int     i_type;
    /* In: force quantizer for != X264_QP_AUTO */
    int     i_qpplus1;
    /* In: pic_struct, for pulldown/doubling/etc.  Used only if b_pic_struct=1.
     *     use pic_struct_e for pic_struct inputs
     * Out: pic_struct element associated with frame */
    int     i_pic_struct;
    /* Out: whether this frame is a keyframe.  Important when using modes that result in
     * SEI recovery points being used instead of IDR frames. */
    int     b_keyframe;
    /* In: user pts, Out: pts of encoded picture (user)*/
    int64_t i_pts;
    /* Out: frame dts. When the pts of the first frame is close to zero,
     *      initial frames may have a negative dts which must be dealt with by any muxer */
    int64_t i_dts;
    /* In: custom encoding parameters to be set from this frame forwards
           (in coded order, not display order). If NULL, continue using
           parameters from the previous frame.  Some parameters, such as
           aspect ratio, can only be changed per-GOP due to the limitations
           of H.264 itself; in this case, the caller must force an IDR frame
           if it needs the changed parameter to apply immediately. */
    x264_param_t *param;
    /* In: raw data */
    x264_image_t img;
    /* In: optional information to modify encoder decisions for this frame */
    x264_image_properties_t prop;
    /* Out: HRD timing information. Output only when i_nal_hrd is set. */
    x264_hrd_t hrd_timing;
    /* In: arbitrary user SEI (e.g subtitles, AFDs) */
    x264_sei_t extra_sei;
    /* private user data. libx264 doesn't touch this,
       not even to copy it from input to output frames. */
    void *opaque;
} x264_picture_t;

/* x264_picture_init:
 *  initialize an x264_picture_t.  Needs to be done if the calling application
 *  allocates its own x264_picture_t as opposed to using x264_picture_alloc. */
void x264_picture_init( x264_picture_t *pic );

/* x264_picture_alloc:
 *  alloc data for a picture. You must call x264_picture_clean on it.
 *  returns 0 on success, or -1 on malloc failure or invalid colorspace. */
int x264_picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_height );

/* x264_picture_clean:
 *  free associated resource for a x264_picture_t allocated with
 *  x264_picture_alloc ONLY */
void x264_picture_clean( x264_picture_t *pic );
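
/* Example (illustrative sketch): allocating a picture, filling its planes
 * from an application-side I420 frame (the y/u/v pointers are hypothetical;
 * width and height are assumed even), then releasing it.
 *
 *     x264_picture_t pic;
 *     if( x264_picture_alloc( &pic, X264_CSP_I420, width, height ) < 0 )
 *         return -1;
 *     memcpy( pic.img.plane[0], y, width * height );
 *     memcpy( pic.img.plane[1], u, width * height / 4 );
 *     memcpy( pic.img.plane[2], v, width * height / 4 );
 *     pic.i_pts = pts;
 *     ... encode ...
 *     x264_picture_clean( &pic );
 */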

/****************************************************************************
 * Encoder functions
 ****************************************************************************/

/* Force a link error in the case of linking against an incompatible API version.
 * Glue #defines exist to force correct macro expansion; the final output of the macro
 * is x264_encoder_open_##X264_BUILD (for purposes of dlopen). */
#define x264_encoder_glue1(x,y) x##y
#define x264_encoder_glue2(x,y) x264_encoder_glue1(x,y)
#define x264_encoder_open x264_encoder_glue2(x264_encoder_open_,X264_BUILD)

/* x264_encoder_open:
 *      create a new encoder handle; all parameters from x264_param_t are copied */
x264_t *x264_encoder_open( x264_param_t * );

/* x264_encoder_reconfig:
 *      various parameters from x264_param_t are copied.
 *      this takes effect immediately, on whichever frame is encoded next;
 *      due to delay, this may not be the next frame passed to encoder_encode.
 *      if the change should apply to some particular frame, use x264_picture_t->param instead.
 *      returns 0 on success, negative on parameter validation error.
 *      not all parameters can be changed; see the actual function for a detailed breakdown. */
int     x264_encoder_reconfig( x264_t *, x264_param_t * );
/* x264_encoder_parameters:
 *      copies the current internal set of parameters to the pointer provided
 *      by the caller.  useful when the calling application needs to know
 *      how x264_encoder_open has changed the parameters, or the current state
 *      of the encoder after multiple x264_encoder_reconfig calls.
 *      note that the data accessible through pointers in the returned param struct
 *      (e.g. filenames) should not be modified by the calling application. */
void    x264_encoder_parameters( x264_t *, x264_param_t * );
/* x264_encoder_headers:
 *      return the SPS and PPS that will be used for the whole stream.
 *      *pi_nal is the number of NAL units output in pp_nal.
 *      returns negative on error.
 *      the payloads of all output NALs are guaranteed to be sequential in memory. */
int     x264_encoder_headers( x264_t *, x264_nal_t **pp_nal, int *pi_nal );
/* x264_encoder_encode:
 *      encode one picture.
 *      *pi_nal is the number of NAL units output in pp_nal.
 *      returns negative on error, zero if no NAL units returned.
 *      the payloads of all output NALs are guaranteed to be sequential in memory. */
int     x264_encoder_encode( x264_t *, x264_nal_t **pp_nal, int *pi_nal, x264_picture_t *pic_in, x264_picture_t *pic_out );
/* x264_encoder_close:
 *      close an encoder handle */
void    x264_encoder_close  ( x264_t * );
/* x264_encoder_delayed_frames:
 *      return the number of currently delayed (buffered) frames
 *      this should be used at the end of the stream, to know when you have all the encoded frames. */
int     x264_encoder_delayed_frames( x264_t * );
/* x264_encoder_maximum_delayed_frames( x264_t *h ):
 *      return the maximum number of delayed (buffered) frames that can occur with the current
 *      parameters. */
int     x264_encoder_maximum_delayed_frames( x264_t *h );
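
/* Example (illustrative sketch): the canonical encode loop, including the
 * end-of-stream flush of delayed frames.  read_frame and write_nals are
 * hypothetical functions of the calling application.
 *
 *     x264_t *h = x264_encoder_open( &param );
 *     if( !h )
 *         return -1;
 *     x264_nal_t *nal;
 *     int i_nal;
 *     while( read_frame( &pic_in ) )
 *     {
 *         int sz = x264_encoder_encode( h, &nal, &i_nal, &pic_in, &pic_out );
 *         if( sz < 0 )
 *             break;
 *         if( sz > 0 )
 *             write_nals( nal, i_nal, sz );
 *     }
 *     while( x264_encoder_delayed_frames( h ) )   (flush)
 *     {
 *         int sz = x264_encoder_encode( h, &nal, &i_nal, NULL, &pic_out );
 *         if( sz < 0 )
 *             break;
 *         if( sz > 0 )
 *             write_nals( nal, i_nal, sz );
 *     }
 *     x264_encoder_close( h );
 */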
/* x264_encoder_intra_refresh:
 *      If an intra refresh is not in progress, begin one with the next P-frame.
 *      If an intra refresh is in progress, begin one as soon as the current one finishes.
 *      Requires that b_intra_refresh be set.
 *
 *      Useful for interactive streaming where the client can tell the server that packet loss has
 *      occurred.  In this case, keyint can be set to an extremely high value so that intra refreshes
 *      only occur when calling x264_encoder_intra_refresh.
 *
 *      In multi-pass encoding, if x264_encoder_intra_refresh is called differently in each pass,
 *      behavior is undefined.
 *
 *      Should not be called during an x264_encoder_encode. */
void    x264_encoder_intra_refresh( x264_t * );
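
/* Example (illustrative sketch): with b_intra_refresh set and a very large
 * keyint, recover from reported packet loss by starting a refresh.
 * client_reported_loss is a hypothetical back-channel poll.
 *
 *     if( client_reported_loss( NULL ) )
 *         x264_encoder_intra_refresh( h );
 */
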
/* x264_encoder_invalidate_reference:
 *      An interactive error resilience tool, designed for use in a low-latency one-encoder-few-clients
 *      system.  When the client has packet loss or otherwise incorrectly decodes a frame, the encoder
 *      can be told with this command to "forget" the frame and all frames that depend on it, referencing
 *      only frames that occurred before the loss.  This will force a keyframe if no frames are left to
 *      reference after the aforementioned "forgetting".
 *
 *      It is strongly recommended to use a large i_dpb_size in this case, which allows the encoder to
 *      keep around extra, older frames to fall back on in case more recent frames are all invalidated.
 *      Unlike increasing i_frame_reference, this does not increase the number of frames used for motion
 *      estimation and thus has no speed impact.  It is also recommended to set a very large keyframe
 *      interval, so that keyframes are not used except as necessary for error recovery.
 *
 *      x264_encoder_invalidate_reference is not currently compatible with the use of B-frames or intra
 *      refresh.
 *
 *      In multi-pass encoding, if x264_encoder_invalidate_reference is called differently in each pass,
 *      behavior is undefined.
 *
 *      Should not be called during an x264_encoder_encode, but multiple calls can be made simultaneously.
 *
 *      Returns 0 on success, negative on failure. */
int x264_encoder_invalidate_reference( x264_t *, int64_t pts );
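
/* Example (illustrative sketch): reacting to a client loss report in a
 * low-latency server.  client_reported_loss is a hypothetical back-channel
 * poll returning the pts of the first frame the client failed to decode.
 *
 *     int64_t lost_pts;
 *     if( client_reported_loss( &lost_pts ) )
 *         x264_encoder_invalidate_reference( h, lost_pts );
 */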

#endif

x264-snapshot-20120103-2245-stable/x264.c:

/*****************************************************************************
 * x264: top-level x264cli functions
 *****************************************************************************
 * Copyright (C) 2003-2011 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *          Laurent Aimar <fenrir@via.ecp.fr>
 *          Steven Walters <kemuri9@gmail.com>
 *          Jason Garrett-Glaser <darkshikari@gmail.com>
 *          Kieran Kunhya <kieran@kunhya.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#define _GNU_SOURCE
#include <signal.h>
#include <getopt.h>
#include "common/common.h"
#include "x264cli.h"
#include "input/input.h"
#include "output/output.h"
#include "filters/filters.h"

#define FAIL_IF_ERROR( cond, ... ) FAIL_IF_ERR( cond, "x264", __VA_ARGS__ )

#ifdef _WIN32
#include <windows.h>
#else
#define GetConsoleTitle(t,n)
#define SetConsoleTitle(t)
#endif

#if HAVE_LAVF
#undef DECLARE_ALIGNED
#include <libavformat/avformat.h>
#include <libavutil/pixfmt.h>
#include <libavutil/pixdesc.h>
#endif

#if HAVE_SWSCALE
#undef DECLARE_ALIGNED
#include <libswscale/swscale.h>
#endif

#if HAVE_FFMS
#include <ffms.h>
#endif

/* Ctrl-C handler */
static volatile int b_ctrl_c = 0;
static int          b_exit_on_ctrl_c = 0;
static void sigint_handler( int a )
{
    if( b_exit_on_ctrl_c )
        exit(0);
    b_ctrl_c = 1;
}

static char UNUSED originalCTitle[200] = "";

typedef struct {
    int b_progress;
    int i_seek;
    hnd_t hin;
    hnd_t hout;
    FILE *qpfile;
    FILE *tcfile_out;
    double timebase_convert_multiplier;
    int i_pulldown;
} cli_opt_t;

/* file i/o operation structs */
cli_input_t cli_input;
static cli_output_t cli_output;

/* video filter operation struct */
static cli_vid_filter_t filter;

static const char * const demuxer_names[] =
{
    "auto",
    "raw",
    "y4m",
#if HAVE_AVS
    "avs",
#endif
#if HAVE_LAVF
    "lavf",
#endif
#if HAVE_FFMS
    "ffms",
#endif
    0
};

static const char * const muxer_names[] =
{
    "auto",
    "raw",
    "mkv",
    "flv",
#if HAVE_GPAC
    "mp4",
#endif
    0
};

static const char * const pulldown_names[] = { "none", "22", "32", "64", "double", "triple", "euro", 0 };
static const char * const log_level_names[] = { "none", "error", "warning", "info", "debug", 0 };
static const char * const output_csp_names[] =
{
#if !X264_CHROMA_FORMAT || X264_CHROMA_FORMAT == X264_CSP_I420
    "i420",
#endif
#if !X264_CHROMA_FORMAT || X264_CHROMA_FORMAT == X264_CSP_I422
    "i422",
#endif
#if !X264_CHROMA_FORMAT || X264_CHROMA_FORMAT == X264_CSP_I444
    "i444", "rgb",
#endif
    0
};

typedef struct
{
    int mod;
    uint8_t pattern[24];
    float fps_factor;
} cli_pulldown_t;

enum pulldown_type_e
{
    X264_PULLDOWN_22 = 1,
    X264_PULLDOWN_32,
    X264_PULLDOWN_64,
    X264_PULLDOWN_DOUBLE,
    X264_PULLDOWN_TRIPLE,
    X264_PULLDOWN_EURO
};

#define TB  PIC_STRUCT_TOP_BOTTOM
#define BT  PIC_STRUCT_BOTTOM_TOP
#define TBT PIC_STRUCT_TOP_BOTTOM_TOP
#define BTB PIC_STRUCT_BOTTOM_TOP_BOTTOM

static const cli_pulldown_t pulldown_values[] =
{
    [X264_PULLDOWN_22]     = {1,  {TB},                                   1.0},
    [X264_PULLDOWN_32]     = {4,  {TBT, BT, BTB, TB},                     1.25},
    [X264_PULLDOWN_64]     = {2,  {PIC_STRUCT_DOUBLE, PIC_STRUCT_TRIPLE}, 1.0},
    [X264_PULLDOWN_DOUBLE] = {1,  {PIC_STRUCT_DOUBLE},                    2.0},
    [X264_PULLDOWN_TRIPLE] = {1,  {PIC_STRUCT_TRIPLE},                    3.0},
    [X264_PULLDOWN_EURO]   = {24, {TBT, BT, BT, BT, BT, BT, BT, BT, BT, BT, BT, BT,
                                   BTB, TB, TB, TB, TB, TB, TB, TB, TB, TB, TB, TB}, 25.0/24.0}
};

#undef TB
#undef BT
#undef TBT
#undef BTB

// indexed by pic_struct enum
static const float pulldown_frame_duration[10] = { 0.0, 1, 0.5, 0.5, 1, 1, 1.5, 1.5, 2, 3 };

static void help( x264_param_t *defaults, int longhelp );
static int  parse( int argc, char **argv, x264_param_t *param, cli_opt_t *opt );
static int  encode( x264_param_t *param, cli_opt_t *opt );

/* logging and printing for within the cli system */
static int cli_log_level;
void x264_cli_log( const char *name, int i_level, const char *fmt, ... )
{
    if( i_level > cli_log_level )
        return;
    char *s_level;
    switch( i_level )
    {
        case X264_LOG_ERROR:
            s_level = "error";
            break;
        case X264_LOG_WARNING:
            s_level = "warning";
            break;
        case X264_LOG_INFO:
            s_level = "info";
            break;
        case X264_LOG_DEBUG:
            s_level = "debug";
            break;
        default:
            s_level = "unknown";
            break;
    }
    fprintf( stderr, "%s [%s]: ", name, s_level );
    va_list arg;
    va_start( arg, fmt );
    vfprintf( stderr, fmt, arg );
    va_end( arg );
}

void x264_cli_printf( int i_level, const char *fmt, ... )
{
    if( i_level > cli_log_level )
        return;
    va_list arg;
    va_start( arg, fmt );
    vfprintf( stderr, fmt, arg );
    va_end( arg );
}

static void print_version_info()
{
#ifdef X264_POINTVER
    printf( "x264 "X264_POINTVER"\n" );
#else
    printf( "x264 0.%d.X\n", X264_BUILD );
#endif
#if HAVE_SWSCALE
    printf( "(libswscale %d.%d.%d)\n", LIBSWSCALE_VERSION_MAJOR, LIBSWSCALE_VERSION_MINOR, LIBSWSCALE_VERSION_MICRO );
#endif
#if HAVE_LAVF
    printf( "(libavformat %d.%d.%d)\n", LIBAVFORMAT_VERSION_MAJOR, LIBAVFORMAT_VERSION_MINOR, LIBAVFORMAT_VERSION_MICRO );
#endif
#if HAVE_FFMS
    printf( "(ffmpegsource %d.%d.%d.%d)\n", FFMS_VERSION >> 24, (FFMS_VERSION & 0xff0000) >> 16, (FFMS_VERSION & 0xff00) >> 8, FFMS_VERSION & 0xff );
#endif
    printf( "built on " __DATE__ ", " );
#ifdef __INTEL_COMPILER
    printf( "intel: %.2f (%d)\n", __INTEL_COMPILER / 100.f, __INTEL_COMPILER_BUILD_DATE );
#elif defined(__GNUC__)
    printf( "gcc: " __VERSION__ "\n" );
#else
    printf( "using an unknown compiler\n" );
#endif
    printf( "configuration: --bit-depth=%d --chroma-format=%s\n", x264_bit_depth, X264_CHROMA_FORMAT ? (output_csp_names[0]+1) : "all" );
    printf( "x264 license: " );
#if HAVE_GPL
    printf( "GPL version 2 or later\n" );
#else
    printf( "Non-GPL commercial\n" );
#endif
#if HAVE_SWSCALE
    const char *license = swscale_license();
    printf( "libswscale%s%s license: %s\n", HAVE_LAVF ? "/libavformat" : "", HAVE_FFMS ? "/ffmpegsource" : "" , license );
    if( !strcmp( license, "nonfree and unredistributable" ) ||
       (!HAVE_GPL && (!strcmp( license, "GPL version 2 or later" )
                  ||  !strcmp( license, "GPL version 3 or later" ))))
        printf( "WARNING: This binary is unredistributable!\n" );
#endif
}

int main( int argc, char **argv )
{
    x264_param_t param;
    cli_opt_t opt = {0};
    int ret = 0;

    FAIL_IF_ERROR( x264_threading_init(), "unable to initialize threading\n" )

#ifdef _WIN32
    _setmode(_fileno(stdin), _O_BINARY);
    _setmode(_fileno(stdout), _O_BINARY);
#endif

    GetConsoleTitle( originalCTitle, sizeof(originalCTitle) );

    /* Parse command line */
    if( parse( argc, argv, &param, &opt ) < 0 )
        ret = -1;

    /* Restore title; it can be changed by input modules */
    SetConsoleTitle( originalCTitle );

    /* Control-C handler */
    signal( SIGINT, sigint_handler );

    if( !ret )
        ret = encode( &param, &opt );

    /* clean up handles */
    if( filter.free )
        filter.free( opt.hin );
    else if( opt.hin )
        cli_input.close_file( opt.hin );
    if( opt.hout )
        cli_output.close_file( opt.hout, 0, 0 );
    if( opt.tcfile_out )
        fclose( opt.tcfile_out );
    if( opt.qpfile )
        fclose( opt.qpfile );

    SetConsoleTitle( originalCTitle );

    return ret;
}

static char const *strtable_lookup( const char * const table[], int idx )
{
    int i = 0; while( table[i] ) i++;
    return ( ( idx >= 0 && idx < i ) ? table[ idx ] : "???" );
}

static char *stringify_names( char *buf, const char * const names[] )
{
    int i = 0;
    char *p = buf;
    for( p[0] = 0; names[i]; i++ )
    {
        p += sprintf( p, "%s", names[i] );
        if( names[i+1] )
            p += sprintf( p, ", " );
    }
    return buf;
}

static void print_csp_names( int longhelp )
{
    if( longhelp < 2 )
        return;
#   define INDENT "                                "
    printf( "                              - valid csps for `raw' demuxer:\n" );
    printf( INDENT );
    for( int i = X264_CSP_NONE+1; i < X264_CSP_CLI_MAX; i++ )
    {
        printf( "%s", x264_cli_csps[i].name );
        if( i+1 < X264_CSP_CLI_MAX )
            printf( ", " );
    }
#if HAVE_LAVF
    printf( "\n" );
    printf( "                              - valid csps for `lavf' demuxer:\n" );
    printf( INDENT );
    size_t line_len = strlen( INDENT );
    for( enum PixelFormat i = PIX_FMT_NONE+1; i < PIX_FMT_NB; i++ )
    {
        const char *pfname = av_pix_fmt_descriptors[i].name;
        size_t name_len = strlen( pfname );
        if( line_len + name_len > (80 - strlen( ", " )) )
        {
            printf( "\n" INDENT );
            line_len = strlen( INDENT );
        }
        printf( "%s", pfname );
        line_len += name_len;
        if( i+1 < PIX_FMT_NB )
        {
            printf( ", " );
            line_len += 2;
        }
    }
#endif
    printf( "\n" );
}

static void help( x264_param_t *defaults, int longhelp )
{
    char buf[50];
#define H0 printf
#define H1 if(longhelp>=1) printf
#define H2 if(longhelp==2) printf
    H0( "x264 core:%d%s\n"
        "Syntax: x264 [options] -o outfile infile\n"
        "\n"
        "Infile can be raw (in which case resolution is required),\n"
        "  or YUV4MPEG (*.y4m),\n"
        "  or Avisynth if compiled with support (%s).\n"
        "  or libav* formats if compiled with lavf support (%s) or ffms support (%s).\n"
        "Outfile type is selected by filename:\n"
        " .264 -> Raw bytestream\n"
        " .mkv -> Matroska\n"
        " .flv -> Flash Video\n"
        " .mp4 -> MP4 if compiled with GPAC support (%s)\n"
        "Output bit depth: %d (configured at compile time)\n"
        "\n"
        "Options:\n"
        "\n"
        "  -h, --help                  List basic options\n"
        "      --longhelp              List more options\n"
        "      --fullhelp              List all options\n"
        "\n",
        X264_BUILD, X264_VERSION,
#if HAVE_AVS
        "yes",
#else
        "no",
#endif
#if HAVE_LAVF
        "yes",
#else
        "no",
#endif
#if HAVE_FFMS
        "yes",
#else
        "no",
#endif
#if HAVE_GPAC
        "yes",
#else
        "no",
#endif
        x264_bit_depth
      );
    H0( "Example usage:\n" );
    H0( "\n" );
    H0( "      Constant quality mode:\n" );
    H0( "            x264 --crf 24 -o <output> <input>\n" );
    H0( "\n" );
    H0( "      Two-pass with a bitrate of 1000kbps:\n" );
    H0( "            x264 --pass 1 --bitrate 1000 -o <output> <input>\n" );
    H0( "            x264 --pass 2 --bitrate 1000 -o <output> <input>\n" );
    H0( "\n" );
    H0( "      Lossless:\n" );
    H0( "            x264 --qp 0 -o <output> <input>\n" );
    H0( "\n" );
    H0( "      Maximum PSNR at the cost of speed and visual quality:\n" );
    H0( "            x264 --preset placebo --tune psnr -o <output> <input>\n" );
    H0( "\n" );
    H0( "      Constant bitrate at 1000kbps with a 2 second-buffer:\n");
    H0( "            x264 --vbv-bufsize 2000 --bitrate 1000 -o <output> <input>\n" );
    H0( "\n" );
    H0( "Presets:\n" );
    H0( "\n" );
    H0( "      --profile <string>      Force the limits of an H.264 profile\n"
        "                                  Overrides all settings.\n" );
    H2(
#if X264_CHROMA_FORMAT <= X264_CSP_I420
#if BIT_DEPTH==8
        "                                  - baseline:\n"
        "                                    --no-8x8dct --bframes 0 --no-cabac\n"
        "                                    --cqm flat --weightp 0\n"
        "                                    No interlaced.\n"
        "                                    No lossless.\n"
        "                                  - main:\n"
        "                                    --no-8x8dct --cqm flat\n"
        "                                    No lossless.\n"
        "                                  - high:\n"
        "                                    No lossless.\n"
#endif
        "                                  - high10:\n"
        "                                    No lossless.\n"
        "                                    Support for bit depth 8-10.\n"
#endif
#if X264_CHROMA_FORMAT <= X264_CSP_I422
        "                                  - high422:\n"
        "                                    No lossless.\n"
        "                                    Support for bit depth 8-10.\n"
        "                                    Support for 4:2:0/4:2:2 chroma subsampling.\n"
#endif
        "                                  - high444:\n"
        "                                    Support for bit depth 8-10.\n"
        "                                    Support for 4:2:0/4:2:2/4:4:4 chroma subsampling.\n" );
    else H0(
        "                                  - "
#if X264_CHROMA_FORMAT <= X264_CSP_I420
#if BIT_DEPTH==8
        "baseline,main,high,"
#endif
        "high10,"
#endif
#if X264_CHROMA_FORMAT <= X264_CSP_I422
        "high422,"
#endif
        "high444\n"
               );
    H0( "      --preset <string>       Use a preset to select encoding settings [medium]\n"
        "                                  Overridden by user settings.\n" );
    H2( "                                  - ultrafast:\n"
        "                                    --no-8x8dct --aq-mode 0 --b-adapt 0\n"
        "                                    --bframes 0 --no-cabac --no-deblock\n"
        "                                    --no-mbtree --me dia --no-mixed-refs\n"
        "                                    --partitions none --rc-lookahead 0 --ref 1\n"
        "                                    --scenecut 0 --subme 0 --trellis 0\n"
        "                                    --no-weightb --weightp 0\n"
        "                                  - superfast:\n"
        "                                    --no-mbtree --me dia --no-mixed-refs\n"
        "                                    --partitions i8x8,i4x4 --rc-lookahead 0\n"
        "                                    --ref 1 --subme 1 --trellis 0 --weightp 1\n"
        "                                  - veryfast:\n"
        "                                    --no-mixed-refs --rc-lookahead 10\n"
        "                                    --ref 1 --subme 2 --trellis 0 --weightp 1\n"
        "                                  - faster:\n"
        "                                    --no-mixed-refs --rc-lookahead 20\n"
        "                                    --ref 2 --subme 4 --weightp 1\n"
        "                                  - fast:\n"
        "                                    --rc-lookahead 30 --ref 2 --subme 6\n"
        "                                    --weightp 1\n"
        "                                  - medium:\n"
        "                                    Default settings apply.\n"
        "                                  - slow:\n"
        "                                    --b-adapt 2 --direct auto --me umh\n"
        "                                    --rc-lookahead 50 --ref 5 --subme 8\n"
        "                                  - slower:\n"
        "                                    --b-adapt 2 --direct auto --me umh\n"
        "                                    --partitions all --rc-lookahead 60\n"
        "                                    --ref 8 --subme 9 --trellis 2\n"
        "                                  - veryslow:\n"
        "                                    --b-adapt 2 --bframes 8 --direct auto\n"
        "                                    --me umh --merange 24 --partitions all\n"
        "                                    --ref 16 --subme 10 --trellis 2\n"
        "                                    --rc-lookahead 60\n"
        "                                  - placebo:\n"
        "                                    --bframes 16 --b-adapt 2 --direct auto\n"
        "                                    --slow-firstpass --no-fast-pskip\n"
        "                                    --me tesa --merange 24 --partitions all\n"
        "                                    --rc-lookahead 60 --ref 16 --subme 11\n"
        "                                    --trellis 2\n" );
    else H0( "                                  - ultrafast,superfast,veryfast,faster,fast\n"
             "                                  - medium,slow,slower,veryslow,placebo\n" );
    H0( "      --tune <string>         Tune the settings for a particular type of source\n"
        "                              or situation\n"
        "                                  Overridden by user settings.\n"
        "                                  Multiple tunings are separated by commas.\n"
        "                                  Only one psy tuning can be used at a time.\n" );
    H2( "                                  - film (psy tuning):\n"
        "                                    --deblock -1:-1 --psy-rd <unset>:0.15\n"
        "                                  - animation (psy tuning):\n"
        "                                    --bframes {+2} --deblock 1:1\n"
        "                                    --psy-rd 0.4:<unset> --aq-strength 0.6\n"
        "                                    --ref {Double if >1 else 1}\n"
        "                                  - grain (psy tuning):\n"
        "                                    --aq-strength 0.5 --no-dct-decimate\n"
        "                                    --deadzone-inter 6 --deadzone-intra 6\n"
        "                                    --deblock -2:-2 --ipratio 1.1 \n"
        "                                    --pbratio 1.1 --psy-rd <unset>:0.25\n"
        "                                    --qcomp 0.8\n"
        "                                  - stillimage (psy tuning):\n"
        "                                    --aq-strength 1.2 --deblock -3:-3\n"
        "                                    --psy-rd 2.0:0.7\n"
        "                                  - psnr (psy tuning):\n"
        "                                    --aq-mode 0 --no-psy\n"
        "                                  - ssim (psy tuning):\n"
        "                                    --aq-mode 2 --no-psy\n"
        "                                  - fastdecode:\n"
        "                                    --no-cabac --no-deblock --no-weightb\n"
        "                                    --weightp 0\n"
        "                                  - zerolatency:\n"
        "                                    --bframes 0 --force-cfr --no-mbtree\n"
        "                                    --sync-lookahead 0 --sliced-threads\n"
        "                                    --rc-lookahead 0\n" );
    else H0( "                                  - psy tunings: film,animation,grain,\n"
             "                                                 stillimage,psnr,ssim\n"
             "                                  - other tunings: fastdecode,zerolatency\n" );
    H2( "      --slow-firstpass        Don't force these faster settings with --pass 1:\n"
        "                                  --no-8x8dct --me dia --partitions none\n"
        "                                  --ref 1 --subme {2 if >2 else unchanged}\n"
        "                                  --trellis 0 --fast-pskip\n" );
    else H1( "      --slow-firstpass        Don't force faster settings with --pass 1\n" );
    H0( "\n" );
    H0( "Frame-type options:\n" );
    H0( "\n" );
    H0( "  -I, --keyint <integer or \"infinite\"> Maximum GOP size [%d]\n", defaults->i_keyint_max );
    H2( "  -i, --min-keyint <integer>  Minimum GOP size [auto]\n" );
    H2( "      --no-scenecut           Disable adaptive I-frame decision\n" );
    H2( "      --scenecut <integer>    How aggressively to insert extra I-frames [%d]\n", defaults->i_scenecut_threshold );
    H2( "      --intra-refresh         Use Periodic Intra Refresh instead of IDR frames\n" );
    H1( "  -b, --bframes <integer>     Number of B-frames between I and P [%d]\n", defaults->i_bframe );
    H1( "      --b-adapt <integer>     Adaptive B-frame decision method [%d]\n"
        "                                  Higher values may lower threading efficiency.\n"
        "                                  - 0: Disabled\n"
        "                                  - 1: Fast\n"
        "                                  - 2: Optimal (slow with high --bframes)\n", defaults->i_bframe_adaptive );
    H2( "      --b-bias <integer>      Influences how often B-frames are used [%d]\n", defaults->i_bframe_bias );
    H1( "      --b-pyramid <string>    Keep some B-frames as references [%s]\n"
        "                                  - none: Disabled\n"
        "                                  - strict: Strictly hierarchical pyramid\n"
        "                                  - normal: Non-strict (not Blu-ray compatible)\n",
        strtable_lookup( x264_b_pyramid_names, defaults->i_bframe_pyramid ) );
    H1( "      --open-gop              Use recovery points to close GOPs\n"
        "                              Only available with b-frames\n" );
    H1( "      --no-cabac              Disable CABAC\n" );
    H1( "  -r, --ref <integer>         Number of reference frames [%d]\n", defaults->i_frame_reference );
    H1( "      --no-deblock            Disable loop filter\n" );
    H1( "  -f, --deblock <alpha:beta>  Loop filter parameters [%d:%d]\n",
                                       defaults->i_deblocking_filter_alphac0, defaults->i_deblocking_filter_beta );
    H2( "      --slices <integer>      Number of slices per frame; forces rectangular\n"
        "                              slices and is overridden by other slicing options\n" );
    else H1( "      --slices <integer>      Number of slices per frame\n" );
    H2( "      --slice-max-size <integer> Limit the size of each slice in bytes\n" );
    H2( "      --slice-max-mbs <integer> Limit the size of each slice in macroblocks\n" );
    H0( "      --tff                   Enable interlaced mode (top field first)\n" );
    H0( "      --bff                   Enable interlaced mode (bottom field first)\n" );
    H2( "      --constrained-intra     Enable constrained intra prediction.\n" );
    H0( "      --pulldown <string>     Use soft pulldown to change frame rate\n"
        "                                  - none, 22, 32, 64, double, triple, euro (requires cfr input)\n" );
    H2( "      --fake-interlaced       Flag stream as interlaced but encode progressive.\n"
        "                              Makes it possible to encode 25p and 30p Blu-ray\n"
        "                              streams. Ignored in interlaced mode.\n" );
    H2( "      --frame-packing <integer> For stereoscopic videos, define frame arrangement\n"
        "                                  - 0: checkerboard - pixels are alternately from L and R\n"
        "                                  - 1: column alternation - L and R are interlaced by column\n"
        "                                  - 2: row alternation - L and R are interlaced by row\n"
        "                                  - 3: side by side - L is on the left, R on the right\n"
        "                                  - 4: top bottom - L is on top, R on bottom\n"
        "                                  - 5: frame alternation - one view per frame\n" );
    H0( "\n" );
    H0( "Ratecontrol:\n" );
    H0( "\n" );
    H1( "  -q, --qp <integer>          Force constant QP (0-%d, 0=lossless)\n", QP_MAX );
    H0( "  -B, --bitrate <integer>     Set bitrate (kbit/s)\n" );
    H0( "      --crf <float>           Quality-based VBR (%d-51) [%.1f]\n", 51 - QP_MAX_SPEC, defaults->rc.f_rf_constant );
    H1( "      --rc-lookahead <integer> Number of frames for frametype lookahead [%d]\n", defaults->rc.i_lookahead );
    H0( "      --vbv-maxrate <integer> Max local bitrate (kbit/s) [%d]\n", defaults->rc.i_vbv_max_bitrate );
    H0( "      --vbv-bufsize <integer> Set size of the VBV buffer (kbit) [%d]\n", defaults->rc.i_vbv_buffer_size );
    H2( "      --vbv-init <float>      Initial VBV buffer occupancy [%.1f]\n", defaults->rc.f_vbv_buffer_init );
    H2( "      --crf-max <float>       With CRF+VBV, limit RF to this value\n"
        "                                  May cause VBV underflows!\n" );
    H2( "      --qpmin <integer>       Set min QP [%d]\n", defaults->rc.i_qp_min );
    H2( "      --qpmax <integer>       Set max QP [%d]\n", defaults->rc.i_qp_max );
    H2( "      --qpstep <integer>      Set max QP step [%d]\n", defaults->rc.i_qp_step );
    H2( "      --ratetol <float>       Tolerance of ABR ratecontrol and VBV [%.1f]\n", defaults->rc.f_rate_tolerance );
    H2( "      --ipratio <float>       QP factor between I and P [%.2f]\n", defaults->rc.f_ip_factor );
    H2( "      --pbratio <float>       QP factor between P and B [%.2f]\n", defaults->rc.f_pb_factor );
    H2( "      --chroma-qp-offset <integer>  QP difference between chroma and luma [%d]\n", defaults->analyse.i_chroma_qp_offset );
    H2( "      --aq-mode <integer>     AQ method [%d]\n"
        "                                  - 0: Disabled\n"
        "                                  - 1: Variance AQ (complexity mask)\n"
        "                                  - 2: Auto-variance AQ (experimental)\n", defaults->rc.i_aq_mode );
    H1( "      --aq-strength <float>   Reduces blocking and blurring in flat and\n"
        "                              textured areas. [%.1f]\n", defaults->rc.f_aq_strength );
    H1( "\n" );
    H0( "  -p, --pass <integer>        Enable multipass ratecontrol\n"
        "                                  - 1: First pass, creates stats file\n"
        "                                  - 2: Last pass, does not overwrite stats file\n" );
    H2( "                                  - 3: Nth pass, overwrites stats file\n" );
    H1( "      --stats <string>        Filename for 2 pass stats [\"%s\"]\n", defaults->rc.psz_stat_out );
    H2( "      --no-mbtree             Disable mb-tree ratecontrol\n" );
    H2( "      --qcomp <float>         QP curve compression [%.2f]\n", defaults->rc.f_qcompress );
    H2( "      --cplxblur <float>      Reduce fluctuations in QP (before curve compression) [%.1f]\n", defaults->rc.f_complexity_blur );
    H2( "      --qblur <float>         Reduce fluctuations in QP (after curve compression) [%.1f]\n", defaults->rc.f_qblur );
    H2( "      --zones <zone0>/<zone1>/...  Tweak the bitrate of regions of the video\n" );
    H2( "                              Each zone is of the form\n"
        "                                  <start frame>,<end frame>,<option>\n"
        "                                  where <option> is either\n"
        "                                      q=<integer> (force QP)\n"
        "                                  or  b=<float> (bitrate multiplier)\n" );
    H2( "      --qpfile <string>       Force frametypes and QPs for some or all frames\n"
        "                              Format of each line: framenumber frametype QP\n"
        "                              QP is optional (none lets x264 choose). Frametypes: I,i,K,P,B,b.\n"
        "                                  K=<I or i> depending on open-gop setting\n"
        "                              QPs are restricted by qpmin/qpmax.\n" );
    H1( "\n" );
    H1( "Analysis:\n" );
    H1( "\n" );
    H1( "  -A, --partitions <string>   Partitions to consider [\"p8x8,b8x8,i8x8,i4x4\"]\n"
        "                                  - p8x8, p4x4, b8x8, i8x8, i4x4\n"
        "                                  - none, all\n"
        "                                  (p4x4 requires p8x8. i8x8 requires --8x8dct.)\n" );
    H1( "      --direct <string>       Direct MV prediction mode [\"%s\"]\n"
        "                                  - none, spatial, temporal, auto\n",
                                       strtable_lookup( x264_direct_pred_names, defaults->analyse.i_direct_mv_pred ) );
    H2( "      --no-weightb            Disable weighted prediction for B-frames\n" );
    H1( "      --weightp <integer>     Weighted prediction for P-frames [%d]\n"
        "                                  - 0: Disabled\n"
        "                                  - 1: Weighted refs\n"
        "                                  - 2: Weighted refs + Duplicates\n", defaults->analyse.i_weighted_pred );
    H1( "      --me <string>           Integer pixel motion estimation method [\"%s\"]\n",
                                       strtable_lookup( x264_motion_est_names, defaults->analyse.i_me_method ) );
    H2( "                                  - dia: diamond search, radius 1 (fast)\n"
        "                                  - hex: hexagonal search, radius 2\n"
        "                                  - umh: uneven multi-hexagon search\n"
        "                                  - esa: exhaustive search\n"
        "                                  - tesa: hadamard exhaustive search (slow)\n" );
    else H1( "                                  - dia, hex, umh\n" );
    H2( "      --merange <integer>     Maximum motion vector search range [%d]\n", defaults->analyse.i_me_range );
    H2( "      --mvrange <integer>     Maximum motion vector length [-1 (auto)]\n" );
    H2( "      --mvrange-thread <int>  Minimum buffer between threads [-1 (auto)]\n" );
    H1( "  -m, --subme <integer>       Subpixel motion estimation and mode decision [%d]\n", defaults->analyse.i_subpel_refine );
    H2( "                                  - 0: fullpel only (not recommended)\n"
        "                                  - 1: SAD mode decision, one qpel iteration\n"
        "                                  - 2: SATD mode decision\n"
        "                                  - 3-5: Progressively more qpel\n"
        "                                  - 6: RD mode decision for I/P-frames\n"
        "                                  - 7: RD mode decision for all frames\n"
        "                                  - 8: RD refinement for I/P-frames\n"
        "                                  - 9: RD refinement for all frames\n"
        "                                  - 10: QP-RD - requires trellis=2, aq-mode>0\n"
        "                                  - 11: Full RD: disable all early terminations\n" );
    else H1( "                                  decision quality: 1=fast, 11=best\n" );
    H1( "      --psy-rd <float:float>  Strength of psychovisual optimization [\"%.1f:%.1f\"]\n"
        "                                  #1: RD (requires subme>=6)\n"
        "                                  #2: Trellis (requires trellis, experimental)\n",
                                       defaults->analyse.f_psy_rd, defaults->analyse.f_psy_trellis );
    H2( "      --no-psy                Disable all visual optimizations that worsen\n"
        "                              both PSNR and SSIM.\n" );
    H2( "      --no-mixed-refs         Don't decide references on a per partition basis\n" );
    H2( "      --no-chroma-me          Ignore chroma in motion estimation\n" );
    H1( "      --no-8x8dct             Disable adaptive spatial transform size\n" );
    H1( "  -t, --trellis <integer>     Trellis RD quantization. [%d]\n"
        "                                  - 0: disabled\n"
        "                                  - 1: enabled only on the final encode of a MB\n"
        "                                  - 2: enabled on all mode decisions\n", defaults->analyse.i_trellis );
    H2( "      --no-fast-pskip         Disables early SKIP detection on P-frames\n" );
    H2( "      --no-dct-decimate       Disables coefficient thresholding on P-frames\n" );
    H1( "      --nr <integer>          Noise reduction [%d]\n", defaults->analyse.i_noise_reduction );
    H2( "\n" );
    H2( "      --deadzone-inter <int>  Set the size of the inter luma quantization deadzone [%d]\n", defaults->analyse.i_luma_deadzone[0] );
    H2( "      --deadzone-intra <int>  Set the size of the intra luma quantization deadzone [%d]\n", defaults->analyse.i_luma_deadzone[1] );
    H2( "                                  Deadzones should be in the range 0 - 32.\n" );
    H2( "      --cqm <string>          Preset quant matrices [\"flat\"]\n"
        "                                  - jvt, flat\n" );
    H1( "      --cqmfile <string>      Read custom quant matrices from a JM-compatible file\n" );
    H2( "                                  Overrides any other --cqm* options.\n" );
    H2( "      --cqm4 <list>           Set all 4x4 quant matrices\n"
        "                                  Takes a comma-separated list of 16 integers.\n" );
    H2( "      --cqm8 <list>           Set all 8x8 quant matrices\n"
        "                                  Takes a comma-separated list of 64 integers.\n" );
    H2( "      --cqm4i, --cqm4p, --cqm8i, --cqm8p <list>\n"
        "                              Set both luma and chroma quant matrices\n" );
    H2( "      --cqm4iy, --cqm4ic, --cqm4py, --cqm4pc <list>\n"
        "                              Set individual quant matrices\n" );
    H2( "\n" );
    H2( "Video Usability Info (Annex E):\n" );
    H2( "The VUI settings are not used by the encoder but are merely suggestions to\n" );
    H2( "the playback equipment. See doc/vui.txt for details. Use at your own risk.\n" );
    H2( "\n" );
    H2( "      --overscan <string>     Specify crop overscan setting [\"%s\"]\n"
        "                                  - undef, show, crop\n",
                                       strtable_lookup( x264_overscan_names, defaults->vui.i_overscan ) );
    H2( "      --videoformat <string>  Specify video format [\"%s\"]\n"
        "                                  - component, pal, ntsc, secam, mac, undef\n",
                                       strtable_lookup( x264_vidformat_names, defaults->vui.i_vidformat ) );
    H2( "      --fullrange <string>    Specify full range samples setting [\"%s\"]\n"
        "                                  - off, on\n",
                                       strtable_lookup( x264_fullrange_names, defaults->vui.b_fullrange ) );
    H2( "      --colorprim <string>    Specify color primaries [\"%s\"]\n"
        "                                  - undef, bt709, bt470m, bt470bg\n"
        "                                    smpte170m, smpte240m, film\n",
                                       strtable_lookup( x264_colorprim_names, defaults->vui.i_colorprim ) );
    H2( "      --transfer <string>     Specify transfer characteristics [\"%s\"]\n"
        "                                  - undef, bt709, bt470m, bt470bg, linear,\n"
        "                                    log100, log316, smpte170m, smpte240m\n",
                                       strtable_lookup( x264_transfer_names, defaults->vui.i_transfer ) );
    H2( "      --colormatrix <string>  Specify color matrix setting [\"%s\"]\n"
        "                                  - undef, bt709, fcc, bt470bg\n"
        "                                    smpte170m, smpte240m, GBR, YCgCo\n",
                                       strtable_lookup( x264_colmatrix_names, defaults->vui.i_colmatrix ) );
    H2( "      --chromaloc <integer>   Specify chroma sample location (0 to 5) [%d]\n",
                                       defaults->vui.i_chroma_loc );

    H2( "      --nal-hrd <string>      Signal HRD information (requires vbv-bufsize)\n"
        "                                  - none, vbr, cbr (cbr not allowed in .mp4)\n" );
    H2( "      --pic-struct            Force pic_struct in Picture Timing SEI\n" );
    H2( "      --crop-rect <string>    Add 'left,top,right,bottom' to the bitstream-level\n"
        "                              cropping rectangle\n" );

    H0( "\n" );
    H0( "Input/Output:\n" );
    H0( "\n" );
    H0( "  -o, --output <string>       Specify output file\n" );
    H1( "      --muxer <string>        Specify output container format [\"%s\"]\n"
        "                                  - %s\n", muxer_names[0], stringify_names( buf, muxer_names ) );
    H1( "      --demuxer <string>      Specify input container format [\"%s\"]\n"
        "                                  - %s\n", demuxer_names[0], stringify_names( buf, demuxer_names ) );
    H1( "      --input-fmt <string>    Specify input file format (requires lavf support)\n" );
    H1( "      --input-csp <string>    Specify input colorspace format for raw input\n" );
    print_csp_names( longhelp );
    H1( "      --output-csp <string>   Specify output colorspace [\"%s\"]\n"
        "                                  - %s\n", output_csp_names[0], stringify_names( buf, output_csp_names ) );
    H1( "      --input-depth <integer> Specify input bit depth for raw input\n" );
    H1( "      --input-res <intxint>   Specify input resolution (width x height)\n" );
    H1( "      --index <string>        Filename for input index file\n" );
    H0( "      --sar <width:height>    Specify Sample Aspect Ratio\n" );
    H0( "      --fps <float|rational>  Specify framerate\n" );
    H0( "      --seek <integer>        First frame to encode\n" );
    H0( "      --frames <integer>      Maximum number of frames to encode\n" );
    H0( "      --level <string>        Specify level (as defined by Annex A)\n" );
    H1( "      --bluray-compat         Enable compatibility hacks for Blu-ray support\n" );
    H1( "\n" );
    H1( "  -v, --verbose               Print stats for each frame\n" );
    H1( "      --no-progress           Don't show the progress indicator while encoding\n" );
    H0( "      --quiet                 Quiet Mode\n" );
    H1( "      --log-level <string>    Specify the maximum level of logging [\"%s\"]\n"
        "                                  - %s\n", strtable_lookup( log_level_names, cli_log_level - X264_LOG_NONE ),
                                       stringify_names( buf, log_level_names ) );
    H1( "      --psnr                  Enable PSNR computation\n" );
    H1( "      --ssim                  Enable SSIM computation\n" );
    H1( "      --threads <integer>     Force a specific number of threads\n" );
    H2( "      --sliced-threads        Low-latency but lower-efficiency threading\n" );
    H2( "      --thread-input          Run Avisynth in its own thread\n" );
    H2( "      --sync-lookahead <integer> Number of buffer frames for threaded lookahead\n" );
    H2( "      --non-deterministic     Slightly improve quality of SMP, at the cost of repeatability\n" );
    H2( "      --cpu-independent       Ensure exact reproducibility across different cpus,\n"
        "                                  as opposed to letting them select different algorithms\n" );
    H2( "      --asm <integer>         Override CPU detection\n" );
    H2( "      --no-asm                Disable all CPU optimizations\n" );
    H2( "      --visualize             Show MB types overlaid on the encoded video\n" );
    H2( "      --dump-yuv <string>     Save reconstructed frames\n" );
    H2( "      --sps-id <integer>      Set SPS and PPS id numbers [%d]\n", defaults->i_sps_id );
    H2( "      --aud                   Use access unit delimiters\n" );
    H2( "      --force-cfr             Force constant framerate timestamp generation\n" );
    H2( "      --tcfile-in <string>    Force timestamp generation with timecode file\n" );
    H2( "      --tcfile-out <string>   Output timecode v2 file from input timestamps\n" );
    H2( "      --timebase <int/int>    Specify timebase numerator and denominator\n"
        "                 <integer>    Specify timebase numerator for input timecode file\n"
        "                              or specify timebase denominator for other input\n" );
    H2( "      --dts-compress          Eliminate initial delay with container DTS hack\n" );
    H0( "\n" );
    H0( "Filtering:\n" );
    H0( "\n" );
    H0( "      --vf, --video-filter <filter0>/<filter1>/... Apply video filtering to the input file\n" );
    H0( "\n" );
    H0( "      Filter options may be specified in <filter>:<option>=<value> format.\n" );
    H0( "\n" );
    H0( "      Available filters:\n" );
    x264_register_vid_filters();
    x264_vid_filter_help( longhelp );
    H0( "\n" );
}

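/* Option ids for long-only options start at 256 so they can never collide
 * with the single-character codes getopt_long() returns for short options. */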
typedef enum
{
    OPT_FRAMES = 256,
    OPT_SEEK,
    OPT_QPFILE,
    OPT_THREAD_INPUT,
    OPT_QUIET,
    OPT_NOPROGRESS,
    OPT_VISUALIZE,
    OPT_LONGHELP,
    OPT_PROFILE,
    OPT_PRESET,
    OPT_TUNE,
    OPT_SLOWFIRSTPASS,
    OPT_FULLHELP,
    OPT_FPS,
    OPT_MUXER,
    OPT_DEMUXER,
    OPT_INDEX,
    OPT_INTERLACED,
    OPT_TCFILE_IN,
    OPT_TCFILE_OUT,
    OPT_TIMEBASE,
    OPT_PULLDOWN,
    OPT_LOG_LEVEL,
    OPT_VIDEO_FILTER,
    OPT_INPUT_FMT,
    OPT_INPUT_RES,
    OPT_INPUT_CSP,
    OPT_INPUT_DEPTH,
    OPT_DTS_COMPRESSION,
    OPT_OUTPUT_CSP
} OptionsOPT;

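/* getopt() short-option string: a ':' after a letter marks an option that
 * takes a required argument (e.g. "B:" for -B/--bitrate). */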
static char short_options[] = "8A:B:b:f:hI:i:m:o:p:q:r:t:Vvw";
static struct option long_options[] =
{
    { "help",              no_argument, NULL, 'h' },
    { "longhelp",          no_argument, NULL, OPT_LONGHELP },
    { "fullhelp",          no_argument, NULL, OPT_FULLHELP },
    { "version",           no_argument, NULL, 'V' },
    { "profile",     required_argument, NULL, OPT_PROFILE },
    { "preset",      required_argument, NULL, OPT_PRESET },
    { "tune",        required_argument, NULL, OPT_TUNE },
    { "slow-firstpass",    no_argument, NULL, OPT_SLOWFIRSTPASS },
    { "bitrate",     required_argument, NULL, 'B' },
    { "bframes",     required_argument, NULL, 'b' },
    { "b-adapt",     required_argument, NULL, 0 },
    { "no-b-adapt",        no_argument, NULL, 0 },
    { "b-bias",      required_argument, NULL, 0 },
    { "b-pyramid",   required_argument, NULL, 0 },
    { "open-gop",          no_argument, NULL, 0 },
    { "bluray-compat",     no_argument, NULL, 0 },
    { "min-keyint",  required_argument, NULL, 'i' },
    { "keyint",      required_argument, NULL, 'I' },
    { "intra-refresh",     no_argument, NULL, 0 },
    { "scenecut",    required_argument, NULL, 0 },
    { "no-scenecut",       no_argument, NULL, 0 },
    { "nf",                no_argument, NULL, 0 },
    { "no-deblock",        no_argument, NULL, 0 },
    { "filter",      required_argument, NULL, 0 },
    { "deblock",     required_argument, NULL, 'f' },
    { "interlaced",        no_argument, NULL, OPT_INTERLACED },
    { "tff",               no_argument, NULL, OPT_INTERLACED },
    { "bff",               no_argument, NULL, OPT_INTERLACED },
    { "no-interlaced",     no_argument, NULL, OPT_INTERLACED },
    { "constrained-intra", no_argument, NULL, 0 },
    { "cabac",             no_argument, NULL, 0 },
    { "no-cabac",          no_argument, NULL, 0 },
    { "qp",          required_argument, NULL, 'q' },
    { "qpmin",       required_argument, NULL, 0 },
    { "qpmax",       required_argument, NULL, 0 },
    { "qpstep",      required_argument, NULL, 0 },
    { "crf",         required_argument, NULL, 0 },
    { "rc-lookahead",required_argument, NULL, 0 },
    { "ref",         required_argument, NULL, 'r' },
    { "asm",         required_argument, NULL, 0 },
    { "no-asm",            no_argument, NULL, 0 },
    { "sar",         required_argument, NULL, 0 },
    { "fps",         required_argument, NULL, OPT_FPS },
    { "frames",      required_argument, NULL, OPT_FRAMES },
    { "seek",        required_argument, NULL, OPT_SEEK },
    { "output",      required_argument, NULL, 'o' },
    { "muxer",       required_argument, NULL, OPT_MUXER },
    { "demuxer",     required_argument, NULL, OPT_DEMUXER },
    { "stdout",      required_argument, NULL, OPT_MUXER },
    { "stdin",       required_argument, NULL, OPT_DEMUXER },
    { "index",       required_argument, NULL, OPT_INDEX },
    { "analyse",     required_argument, NULL, 0 },
    { "partitions",  required_argument, NULL, 'A' },
    { "direct",      required_argument, NULL, 0 },
    { "weightb",           no_argument, NULL, 'w' },
    { "no-weightb",        no_argument, NULL, 0 },
    { "weightp",     required_argument, NULL, 0 },
    { "me",          required_argument, NULL, 0 },
    { "merange",     required_argument, NULL, 0 },
    { "mvrange",     required_argument, NULL, 0 },
    { "mvrange-thread", required_argument, NULL, 0 },
    { "subme",       required_argument, NULL, 'm' },
    { "psy-rd",      required_argument, NULL, 0 },
    { "no-psy",            no_argument, NULL, 0 },
    { "psy",               no_argument, NULL, 0 },
    { "mixed-refs",        no_argument, NULL, 0 },
    { "no-mixed-refs",     no_argument, NULL, 0 },
    { "no-chroma-me",      no_argument, NULL, 0 },
    { "8x8dct",            no_argument, NULL, '8' },
    { "no-8x8dct",         no_argument, NULL, 0 },
    { "trellis",     required_argument, NULL, 't' },
    { "fast-pskip",        no_argument, NULL, 0 },
    { "no-fast-pskip",     no_argument, NULL, 0 },
    { "no-dct-decimate",   no_argument, NULL, 0 },
    { "aq-strength", required_argument, NULL, 0 },
    { "aq-mode",     required_argument, NULL, 0 },
    { "deadzone-inter", required_argument, NULL, 0 },
    { "deadzone-intra", required_argument, NULL, 0 },
    { "level",       required_argument, NULL, 0 },
    { "ratetol",     required_argument, NULL, 0 },
    { "vbv-maxrate", required_argument, NULL, 0 },
    { "vbv-bufsize", required_argument, NULL, 0 },
    { "vbv-init",    required_argument, NULL, 0 },
    { "crf-max",     required_argument, NULL, 0 },
    { "ipratio",     required_argument, NULL, 0 },
    { "pbratio",     required_argument, NULL, 0 },
    { "chroma-qp-offset", required_argument, NULL, 0 },
    { "pass",        required_argument, NULL, 'p' },
    { "stats",       required_argument, NULL, 0 },
    { "qcomp",       required_argument, NULL, 0 },
    { "mbtree",            no_argument, NULL, 0 },
    { "no-mbtree",         no_argument, NULL, 0 },
    { "qblur",       required_argument, NULL, 0 },
    { "cplxblur",    required_argument, NULL, 0 },
    { "zones",       required_argument, NULL, 0 },
    { "qpfile",      required_argument, NULL, OPT_QPFILE },
    { "threads",     required_argument, NULL, 0 },
    { "sliced-threads",    no_argument, NULL, 0 },
    { "no-sliced-threads", no_argument, NULL, 0 },
    { "slice-max-size",    required_argument, NULL, 0 },
    { "slice-max-mbs",     required_argument, NULL, 0 },
    { "slices",            required_argument, NULL, 0 },
    { "thread-input",      no_argument, NULL, OPT_THREAD_INPUT },
    { "sync-lookahead",    required_argument, NULL, 0 },
    { "non-deterministic", no_argument, NULL, 0 },
    { "cpu-independent",   no_argument, NULL, 0 },
    { "psnr",              no_argument, NULL, 0 },
    { "ssim",              no_argument, NULL, 0 },
    { "quiet",             no_argument, NULL, OPT_QUIET },
    { "verbose",           no_argument, NULL, 'v' },
    { "log-level",   required_argument, NULL, OPT_LOG_LEVEL },
    { "no-progress",       no_argument, NULL, OPT_NOPROGRESS },
    { "visualize",         no_argument, NULL, OPT_VISUALIZE },
    { "dump-yuv",    required_argument, NULL, 0 },
    { "sps-id",      required_argument, NULL, 0 },
    { "aud",               no_argument, NULL, 0 },
    { "nr",          required_argument, NULL, 0 },
    { "cqm",         required_argument, NULL, 0 },
    { "cqmfile",     required_argument, NULL, 0 },
    { "cqm4",        required_argument, NULL, 0 },
    { "cqm4i",       required_argument, NULL, 0 },
    { "cqm4iy",      required_argument, NULL, 0 },
    { "cqm4ic",      required_argument, NULL, 0 },
    { "cqm4p",       required_argument, NULL, 0 },
    { "cqm4py",      required_argument, NULL, 0 },
    { "cqm4pc",      required_argument, NULL, 0 },
    { "cqm8",        required_argument, NULL, 0 },
    { "cqm8i",       required_argument, NULL, 0 },
    { "cqm8p",       required_argument, NULL, 0 },
    { "overscan",    required_argument, NULL, 0 },
    { "videoformat", required_argument, NULL, 0 },
    { "fullrange",   required_argument, NULL, 0 },
    { "colorprim",   required_argument, NULL, 0 },
    { "transfer",    required_argument, NULL, 0 },
    { "colormatrix", required_argument, NULL, 0 },
    { "chromaloc",   required_argument, NULL, 0 },
    { "force-cfr",         no_argument, NULL, 0 },
    { "tcfile-in",   required_argument, NULL, OPT_TCFILE_IN },
    { "tcfile-out",  required_argument, NULL, OPT_TCFILE_OUT },
    { "timebase",    required_argument, NULL, OPT_TIMEBASE },
    { "pic-struct",        no_argument, NULL, 0 },
    { "crop-rect",   required_argument, NULL, 0 },
    { "nal-hrd",     required_argument, NULL, 0 },
    { "pulldown",    required_argument, NULL, OPT_PULLDOWN },
    { "fake-interlaced",   no_argument, NULL, 0 },
    { "frame-packing",     required_argument, NULL, 0 },
    { "vf",          required_argument, NULL, OPT_VIDEO_FILTER },
    { "video-filter", required_argument, NULL, OPT_VIDEO_FILTER },
    { "input-fmt",   required_argument, NULL, OPT_INPUT_FMT },
    { "input-res",   required_argument, NULL, OPT_INPUT_RES },
    { "input-csp",   required_argument, NULL, OPT_INPUT_CSP },
    { "input-depth", required_argument, NULL, OPT_INPUT_DEPTH },
    { "dts-compress",      no_argument, NULL, OPT_DTS_COMPRESSION },
    { "output-csp",  required_argument, NULL, OPT_OUTPUT_CSP },
    {0, 0, 0, 0}
};

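/* Pick the output muxer: normally from the filename extension, but the muxer
 * name itself is used when writing to stdout ("-") or when --muxer overrides
 * auto-detection. The container muxers (mp4/mkv/flv) clear b_annexb and
 * b_repeat_headers -- NAL units become length-prefixed and the SPS/PPS go in
 * the container's global headers instead of being repeated in-stream -- while
 * the raw muxer keeps plain Annex B output. */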
static int select_output( const char *muxer, char *filename, x264_param_t *param )
{
    const char *ext = get_filename_extension( filename );
    if( !strcmp( filename, "-" ) || strcasecmp( muxer, "auto" ) )
        ext = muxer;

    if( !strcasecmp( ext, "mp4" ) )
    {
#if HAVE_GPAC
        cli_output = mp4_output;
        param->b_annexb = 0;
        param->b_repeat_headers = 0;
        if( param->i_nal_hrd == X264_NAL_HRD_CBR )
        {
            x264_cli_log( "x264", X264_LOG_WARNING, "cbr nal-hrd is not compatible with mp4\n" );
            param->i_nal_hrd = X264_NAL_HRD_VBR;
        }
#else
        x264_cli_log( "x264", X264_LOG_ERROR, "not compiled with MP4 output support\n" );
        return -1;
#endif
    }
    else if( !strcasecmp( ext, "mkv" ) )
    {
        cli_output = mkv_output;
        param->b_annexb = 0;
        param->b_repeat_headers = 0;
    }
    else if( !strcasecmp( ext, "flv" ) )
    {
        cli_output = flv_output;
        param->b_annexb = 0;
        param->b_repeat_headers = 0;
    }
    else
        cli_output = raw_output;
    return 0;
}

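/* Resolve the input demuxer. Known extensions (avs/d2v/dga, y4m, raw/yuv) are
 * dispatched directly; anything else is probed in order -- ffms, lavf, avs,
 * then raw -- stopping at the first module that manages to open the file. */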
static int select_input( const char *demuxer, char *used_demuxer, char *filename,
                         hnd_t *p_handle, video_info_t *info, cli_input_opt_t *opt )
{
    int b_auto = !strcasecmp( demuxer, "auto" );
    const char *ext = b_auto ? get_filename_extension( filename ) : "";
    int b_regular = strcmp( filename, "-" );
    if( !b_regular && b_auto )
        ext = "raw";
    b_regular = b_regular && x264_is_regular_file_path( filename );
    if( b_regular )
    {
        FILE *f = fopen( filename, "r" );
        if( f )
        {
            b_regular = x264_is_regular_file( f );
            fclose( f );
        }
    }
    const char *module = b_auto ? ext : demuxer;

    if( !strcasecmp( module, "avs" ) || !strcasecmp( ext, "d2v" ) || !strcasecmp( ext, "dga" ) )
    {
#if HAVE_AVS
        cli_input = avs_input;
        module = "avs";
#else
        x264_cli_log( "x264", X264_LOG_ERROR, "not compiled with AVS input support\n" );
        return -1;
#endif
    }
    else if( !strcasecmp( module, "y4m" ) )
        cli_input = y4m_input;
    else if( !strcasecmp( module, "raw" ) || !strcasecmp( ext, "yuv" ) )
        cli_input = raw_input;
    else
    {
#if HAVE_FFMS
        if( b_regular && (b_auto || !strcasecmp( demuxer, "ffms" )) &&
            !ffms_input.open_file( filename, p_handle, info, opt ) )
        {
            module = "ffms";
            b_auto = 0;
            cli_input = ffms_input;
        }
#endif
#if HAVE_LAVF
        if( (b_auto || !strcasecmp( demuxer, "lavf" )) &&
            !lavf_input.open_file( filename, p_handle, info, opt ) )
        {
            module = "lavf";
            b_auto = 0;
            cli_input = lavf_input;
        }
#endif
#if HAVE_AVS
        if( b_regular && (b_auto || !strcasecmp( demuxer, "avs" )) &&
            !avs_input.open_file( filename, p_handle, info, opt ) )
        {
            module = "avs";
            b_auto = 0;
            cli_input = avs_input;
        }
#endif
        if( b_auto && !raw_input.open_file( filename, p_handle, info, opt ) )
        {
            module = "raw";
            b_auto = 0;
            cli_input = raw_input;
        }

        FAIL_IF_ERROR( !(*p_handle), "could not open input file `%s' via any method!\n", filename )
    }
    strcpy( used_demuxer, module );

    return 0;
}

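/* Build the video filter chain. The fixed layout is
 *     source -> resize(normcsp) -> fix_vfr_pts -> user filters -> resize -> depth
 * where the two trailing filters convert the output of the chain to the
 * colorspace and bit depth the encoder was configured for. */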
static int init_vid_filters( char *sequence, hnd_t *handle, video_info_t *info, x264_param_t *param, int output_csp )
{
    x264_register_vid_filters();

    /* initialize baseline filters */
    if( x264_init_vid_filter( "source", handle, &filter, info, param, NULL ) ) /* wrap demuxer into a filter */
        return -1;
    if( x264_init_vid_filter( "resize", handle, &filter, info, param, "normcsp" ) ) /* normalize csps to be of a known/supported format */
        return -1;
    if( x264_init_vid_filter( "fix_vfr_pts", handle, &filter, info, param, NULL ) ) /* fix vfr pts */
        return -1;

    /* parse filter chain */
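    /* The sequence string is tokenized in place: each '/' terminates one
     * filter token and the first ':' inside a token separates the filter name
     * from its option string, matching the "<filter>:<option>=<value>" form in
     * the help text -- e.g. a (hypothetical) chain
     * "crop:0,0,0,8/resize:width=640,height=480". */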
    for( char *p = sequence; p && *p; )
    {
        int tok_len = strcspn( p, "/" );
        int p_len = strlen( p );
        p[tok_len] = 0;
        int name_len = strcspn( p, ":" );
        p[name_len] = 0;
        name_len += name_len != tok_len;
        if( x264_init_vid_filter( p, handle, &filter, info, param, p + name_len ) )
            return -1;
        p += X264_MIN( tok_len+1, p_len );
    }

    /* force end result resolution */
    if( !param->i_width && !param->i_height )
    {
        param->i_height = info->height;
        param->i_width  = info->width;
    }
    /* force the output csp to what the user specified (or the default) */
    param->i_csp = info->csp;
    int csp = info->csp & X264_CSP_MASK;
    if( output_csp == X264_CSP_I420 && (csp < X264_CSP_I420 || csp > X264_CSP_NV12) )
        param->i_csp = X264_CSP_I420;
    else if( output_csp == X264_CSP_I422 && (csp < X264_CSP_I422 || csp > X264_CSP_NV16) )
        param->i_csp = X264_CSP_I422;
    else if( output_csp == X264_CSP_I444 && (csp < X264_CSP_I444 || csp > X264_CSP_YV24) )
        param->i_csp = X264_CSP_I444;
    else if( output_csp == X264_CSP_RGB && (csp < X264_CSP_BGR || csp > X264_CSP_RGB) )
        param->i_csp = X264_CSP_RGB;
    param->i_csp |= info->csp & X264_CSP_HIGH_DEPTH;

    if( x264_init_vid_filter( "resize", handle, &filter, info, param, NULL ) )
        return -1;

    char args[20];
    sprintf( args, "bit_depth=%d", x264_bit_depth );

    if( x264_init_vid_filter( "depth", handle, &filter, info, param, args ) )
        return -1;

    return 0;
}

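/* Match a user-supplied string case-insensitively against a NULL-terminated
 * name table; on success the canonical name (parse_enum_name) or its index
 * (parse_enum_value) is stored through dst and 0 is returned, else -1. */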
static int parse_enum_name( const char *arg, const char * const *names, const char **dst )
{
    for( int i = 0; names[i]; i++ )
        if( !strcasecmp( arg, names[i] ) )
        {
            *dst = names[i];
            return 0;
        }
    return -1;
}

static int parse_enum_value( const char *arg, const char * const *names, int *dst )
{
    for( int i = 0; names[i]; i++ )
        if( !strcasecmp( arg, names[i] ) )
        {
            *dst = i;
            return 0;
        }
    return -1;
}

static int parse( int argc, char **argv, x264_param_t *param, cli_opt_t *opt )
{
    char *input_filename = NULL;
    const char *demuxer = demuxer_names[0];
    char *output_filename = NULL;
    const char *muxer = muxer_names[0];
    char *tcfile_name = NULL;
    x264_param_t defaults;
    char *profile = NULL;
    char *vid_filters = NULL;
    int b_thread_input = 0;
    int b_turbo = 1;
    int b_user_ref = 0;
    int b_user_fps = 0;
    int b_user_interlaced = 0;
    cli_input_opt_t input_opt;
    cli_output_opt_t output_opt;
    char *preset = NULL;
    char *tune = NULL;

    x264_param_default( &defaults );
    cli_log_level = defaults.i_log_level;

    memset( &input_opt, 0, sizeof(cli_input_opt_t) );
    memset( &output_opt, 0, sizeof(cli_output_opt_t) );
    input_opt.bit_depth = 8;
    int output_csp = defaults.i_csp;
    opt->b_progress = 1;

    /* Presets are applied before all other options. */
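    /* Resetting optind to 0 makes getopt_long() reinitialize its state, so
     * argv can be scanned twice: once here for --preset/--tune and once below
     * for everything else. (optind = 0 as a rescan trigger is glibc
     * behaviour; POSIX only specifies resetting it to 1.) */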
    for( optind = 0;; )
    {
        int c = getopt_long( argc, argv, short_options, long_options, NULL );
        if( c == -1 )
            break;
        if( c == OPT_PRESET )
            preset = optarg;
        else if( c == OPT_TUNE )
            tune = optarg;
        else if( c == '?' )
            return -1;
    }

    if( preset && !strcasecmp( preset, "placebo" ) )
        b_turbo = 0;

    if( x264_param_default_preset( param, preset, tune ) < 0 )
        return -1;

    /* Parse command line options */
    for( optind = 0;; )
    {
        int b_error = 0;
        int long_options_index = -1;

        int c = getopt_long( argc, argv, short_options, long_options, &long_options_index );

        if( c == -1 )
        {
            break;
        }

        switch( c )
        {
            case 'h':
                help( &defaults, 0 );
                exit(0);
            case OPT_LONGHELP:
                help( &defaults, 1 );
                exit(0);
            case OPT_FULLHELP:
                help( &defaults, 2 );
                exit(0);
            case 'V':
                print_version_info();
                exit(0);
            case OPT_FRAMES:
                param->i_frame_total = X264_MAX( atoi( optarg ), 0 );
                break;
            case OPT_SEEK:
                opt->i_seek = X264_MAX( atoi( optarg ), 0 );
                break;
            case 'o':
                output_filename = optarg;
                break;
            case OPT_MUXER:
                FAIL_IF_ERROR( parse_enum_name( optarg, muxer_names, &muxer ), "Unknown muxer `%s'\n", optarg )
                break;
            case OPT_DEMUXER:
                FAIL_IF_ERROR( parse_enum_name( optarg, demuxer_names, &demuxer ), "Unknown demuxer `%s'\n", optarg )
                break;
            case OPT_INDEX:
                input_opt.index_file = optarg;
                break;
            case OPT_QPFILE:
                opt->qpfile = fopen( optarg, "rb" );
                FAIL_IF_ERROR( !opt->qpfile, "can't open qpfile `%s'\n", optarg )
                if( !x264_is_regular_file( opt->qpfile ) )
                {
                    x264_cli_log( "x264", X264_LOG_ERROR, "qpfile incompatible with non-regular file `%s'\n", optarg );
                    fclose( opt->qpfile );
                    return -1;
                }
                break;
            case OPT_THREAD_INPUT:
                b_thread_input = 1;
                break;
            case OPT_QUIET:
                cli_log_level = param->i_log_level = X264_LOG_NONE;
                break;
            case 'v':
                cli_log_level = param->i_log_level = X264_LOG_DEBUG;
                break;
            case OPT_LOG_LEVEL:
                if( !parse_enum_value( optarg, log_level_names, &cli_log_level ) )
                    cli_log_level += X264_LOG_NONE;
                else
                    cli_log_level = atoi( optarg );
                param->i_log_level = cli_log_level;
                break;
            case OPT_NOPROGRESS:
                opt->b_progress = 0;
                break;
            case OPT_VISUALIZE:
#if HAVE_VISUALIZE
                param->b_visualize = 1;
                b_exit_on_ctrl_c = 1;
#else
                x264_cli_log( "x264", X264_LOG_WARNING, "not compiled with visualization support\n" );
#endif
                break;
            case OPT_TUNE:
            case OPT_PRESET:
                break;
            case OPT_PROFILE:
                profile = optarg;
                break;
            case OPT_SLOWFIRSTPASS:
                b_turbo = 0;
                break;
            case 'r':
                b_user_ref = 1;
                goto generic_option;
            case OPT_FPS:
                b_user_fps = 1;
                param->b_vfr_input = 0;
                goto generic_option;
            case OPT_INTERLACED:
                b_user_interlaced = 1;
                goto generic_option;
            case OPT_TCFILE_IN:
                tcfile_name = optarg;
                break;
            case OPT_TCFILE_OUT:
                opt->tcfile_out = fopen( optarg, "wb" );
                FAIL_IF_ERROR( !opt->tcfile_out, "can't open `%s'\n", optarg )
                break;
            case OPT_TIMEBASE:
                input_opt.timebase = optarg;
                break;
            case OPT_PULLDOWN:
                FAIL_IF_ERROR( parse_enum_value( optarg, pulldown_names, &opt->i_pulldown ), "Unknown pulldown `%s'\n", optarg )
                break;
            case OPT_VIDEO_FILTER:
                vid_filters = optarg;
                break;
            case OPT_INPUT_FMT:
                input_opt.format = optarg;
                break;
            case OPT_INPUT_RES:
                input_opt.resolution = optarg;
                break;
            case OPT_INPUT_CSP:
                input_opt.colorspace = optarg;
                break;
            case OPT_INPUT_DEPTH:
                input_opt.bit_depth = atoi( optarg );
                break;
            case OPT_DTS_COMPRESSION:
                output_opt.use_dts_compress = 1;
                break;
            case OPT_OUTPUT_CSP:
                FAIL_IF_ERROR( parse_enum_value( optarg, output_csp_names, &output_csp ), "Unknown output csp `%s'\n", optarg )
                // correct the parsed value to the libx264 csp value
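                /* output_csp_names only lists the colorspaces this build can
                 * emit, so the parsed index is mapped back onto the real
                 * X264_CSP_* values; a build locked to a single chroma format
                 * exposes just that format plus RGB. */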
#if X264_CHROMA_FORMAT
                static const uint8_t output_csp_fix[] = { X264_CHROMA_FORMAT, X264_CSP_RGB };
#else
                static const uint8_t output_csp_fix[] = { X264_CSP_I420, X264_CSP_I422, X264_CSP_I444, X264_CSP_RGB };
#endif
                param->i_csp = output_csp = output_csp_fix[output_csp];
                break;
            default:
generic_option:
            {
                if( long_options_index < 0 )
                {
                    for( int i = 0; long_options[i].name; i++ )
                        if( long_options[i].val == c )
                        {
                            long_options_index = i;
                            break;
                        }
                    if( long_options_index < 0 )
                    {
                        /* getopt_long already printed an error message */
                        return -1;
                    }
                }

                b_error |= x264_param_parse( param, long_options[long_options_index].name, optarg );
            }
        }

        if( b_error )
        {
            const char *name = long_options_index > 0 ? long_options[long_options_index].name : argv[optind-2];
            x264_cli_log( "x264", X264_LOG_ERROR, "invalid argument: %s = %s\n", name, optarg );
            return -1;
        }
    }

    /* If first pass mode is used, apply faster settings. */
    if( b_turbo )
        x264_param_apply_fastfirstpass( param );

    /* Apply profile restrictions. */
    if( x264_param_apply_profile( param, profile ) < 0 )
        return -1;

    /* Get the file name */
    FAIL_IF_ERROR( optind > argc - 1 || !output_filename, "No %s file. Run x264 --help for a list of options.\n",
                   optind > argc - 1 ? "input" : "output" )

    if( select_output( muxer, output_filename, param ) )
        return -1;
    FAIL_IF_ERROR( cli_output.open_file( output_filename, &opt->hout, &output_opt ), "could not open output file `%s'\n", output_filename )

    input_filename = argv[optind++];
    video_info_t info = {0};
    char demuxername[5];

    /* seed the info fields from param; the demuxer overwrites them as necessary. */
    info.csp        = param->i_csp;
    info.fps_num    = param->i_fps_num;
    info.fps_den    = param->i_fps_den;
    info.interlaced = param->b_interlaced;
    info.sar_width  = param->vui.i_sar_width;
    info.sar_height = param->vui.i_sar_height;
    info.tff        = param->b_tff;
    info.vfr        = param->b_vfr_input;

    input_opt.seek = opt->i_seek;
    input_opt.progress = opt->b_progress;
    input_opt.output_csp = output_csp;

    if( select_input( demuxer, demuxername, input_filename, &opt->hin, &info, &input_opt ) )
        return -1;

    FAIL_IF_ERROR( !opt->hin && cli_input.open_file( input_filename, &opt->hin, &info, &input_opt ),
                   "could not open input file `%s'\n", input_filename )

    x264_reduce_fraction( &info.sar_width, &info.sar_height );
    x264_reduce_fraction( &info.fps_num, &info.fps_den );
    x264_cli_log( demuxername, X264_LOG_INFO, "%dx%d%c %d:%d @ %d/%d fps (%cfr)\n", info.width,
                  info.height, info.interlaced ? 'i' : 'p', info.sar_width, info.sar_height,
                  info.fps_num, info.fps_den, info.vfr ? 'v' : 'c' );

    if( tcfile_name )
    {
        FAIL_IF_ERROR( b_user_fps, "--fps and --tcfile-in are incompatible\n" )
        FAIL_IF_ERROR( timecode_input.open_file( tcfile_name, &opt->hin, &info, &input_opt ), "timecode input failed\n" )
        cli_input = timecode_input;
    }
    else FAIL_IF_ERROR( !info.vfr && input_opt.timebase, "--timebase is incompatible with cfr input\n" )

    /* initialize threaded input while the information about the input video is still unaltered by filtering */
#if HAVE_THREAD
    if( info.thread_safe && (b_thread_input || param->i_threads > 1
        || (param->i_threads == X264_THREADS_AUTO && x264_cpu_num_processors() > 1)) )
    {
        if( thread_input.open_file( NULL, &opt->hin, &info, NULL ) )
        {
            fprintf( stderr, "x264 [error]: threaded input failed\n" );
            return -1;
        }
        cli_input = thread_input;
    }
#endif

    /* override detected values by those specified by the user */
    if( param->vui.i_sar_width && param->vui.i_sar_height )
    {
        info.sar_width  = param->vui.i_sar_width;
        info.sar_height = param->vui.i_sar_height;
    }
    if( b_user_fps )
    {
        info.fps_num = param->i_fps_num;
        info.fps_den = param->i_fps_den;
    }
    if( !info.vfr )
    {
        info.timebase_num = info.fps_den;
        info.timebase_den = info.fps_num;
    }
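    /* --timebase accepts "num/den" to set both fields, or a single integer,
     * which here (without a timecode file) sets only the denominator; e.g. a
     * hypothetical "--timebase 1001/30000" matches 30000/1001 fps input. */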
    if( !tcfile_name && input_opt.timebase )
    {
        uint64_t i_user_timebase_num;
        uint64_t i_user_timebase_den;
        int ret = sscanf( input_opt.timebase, "%"SCNu64"/%"SCNu64, &i_user_timebase_num, &i_user_timebase_den );
        FAIL_IF_ERROR( !ret, "invalid argument: timebase = %s\n", input_opt.timebase )
        else if( ret == 1 )
        {
            i_user_timebase_num = info.timebase_num;
            i_user_timebase_den = strtoul( input_opt.timebase, NULL, 10 );
        }
        FAIL_IF_ERROR( i_user_timebase_num > UINT32_MAX || i_user_timebase_den > UINT32_MAX,
                       "timebase you specified exceeds H.264 maximum\n" )
        opt->timebase_convert_multiplier = ((double)i_user_timebase_den / info.timebase_den)
                                         * ((double)info.timebase_num / i_user_timebase_num);
        info.timebase_num = i_user_timebase_num;
        info.timebase_den = i_user_timebase_den;
        info.vfr = 1;
    }
    if( b_user_interlaced )
    {
        info.interlaced = param->b_interlaced;
        info.tff = param->b_tff;
    }

    if( init_vid_filters( vid_filters, &opt->hin, &info, param, output_csp ) )
        return -1;

    /* set param flags from the post-filtered video */
    param->b_vfr_input = info.vfr;
    param->i_fps_num = info.fps_num;
    param->i_fps_den = info.fps_den;
    param->i_timebase_num = info.timebase_num;
    param->i_timebase_den = info.timebase_den;
    param->vui.i_sar_width  = info.sar_width;
    param->vui.i_sar_height = info.sar_height;

    info.num_frames = X264_MAX( info.num_frames - opt->i_seek, 0 );
    if( (!info.num_frames || param->i_frame_total < info.num_frames)
        && param->i_frame_total > 0 )
        info.num_frames = param->i_frame_total;
    param->i_frame_total = info.num_frames;

    if( !b_user_interlaced && info.interlaced )
    {
#if HAVE_INTERLACED
        x264_cli_log( "x264", X264_LOG_WARNING, "input appears to be interlaced, enabling %cff interlaced mode.\n"
                      "                If you want otherwise, use --no-interlaced or --%cff\n",
                      info.tff ? 't' : 'b', info.tff ? 'b' : 't' );
        param->b_interlaced = 1;
        param->b_tff = !!info.tff;
#else
        x264_cli_log( "x264", X264_LOG_WARNING, "input appears to be interlaced, but not compiled with interlaced support\n" );
#endif
    }

    /* Automatically reduce reference frame count to match the user's target level
     * if the user didn't explicitly set a reference frame count. */
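    /* One reference frame occupies mbs * 384 bytes of DPB (a 16x16 4:2:0
     * 8-bit macroblock is 256 luma + 128 chroma bytes), and x264_levels[].dpb
     * is given in bytes, so references are shed until the total fits. */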
    if( !b_user_ref )
    {
        int mbs = (((param->i_width)+15)>>4) * (((param->i_height)+15)>>4);
        for( int i = 0; x264_levels[i].level_idc != 0; i++ )
            if( param->i_level_idc == x264_levels[i].level_idc )
            {
                while( mbs * 384 * param->i_frame_reference > x264_levels[i].dpb &&
                       param->i_frame_reference > 1 )
                {
                    param->i_frame_reference--;
                }
                break;
            }
    }

    return 0;
}

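/* Consume qpfile lines of the form "framenumber frametype [QP]" until the
 * requested frame is reached, e.g. (hypothetical contents):
 *     0 I 20
 *     60 K
 *     95 B 25
 * Frames without an entry are left on automatic type and QP selection. */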
static void parse_qpfile( cli_opt_t *opt, x264_picture_t *pic, int i_frame )
{
    int num = -1, qp, ret;
    char type;
    uint64_t file_pos;
    while( num < i_frame )
    {
        file_pos = ftell( opt->qpfile );
        qp = -1;
        ret = fscanf( opt->qpfile, "%d %c%*[ \t]%d\n", &num, &type, &qp );
        pic->i_type = X264_TYPE_AUTO;
        pic->i_qpplus1 = X264_QP_AUTO;
        if( num > i_frame || ret == EOF )
        {
            fseek( opt->qpfile, file_pos, SEEK_SET );
            break;
        }
        if( num < i_frame && ret >= 2 )
            continue;
        if( ret == 3 && qp >= 0 )
            pic->i_qpplus1 = qp+1;
        if     ( type == 'I' ) pic->i_type = X264_TYPE_IDR;
        else if( type == 'i' ) pic->i_type = X264_TYPE_I;
        else if( type == 'K' ) pic->i_type = X264_TYPE_KEYFRAME;
        else if( type == 'P' ) pic->i_type = X264_TYPE_P;
        else if( type == 'B' ) pic->i_type = X264_TYPE_BREF;
        else if( type == 'b' ) pic->i_type = X264_TYPE_B;
        else ret = 0;
        if( ret < 2 || qp < -1 || qp > QP_MAX )
        {
            x264_cli_log( "x264", X264_LOG_ERROR, "can't parse qpfile for frame %d\n", i_frame );
            fclose( opt->qpfile );
            opt->qpfile = NULL;
            break;
        }
    }
}
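
/* Example qpfile accepted by parse_qpfile() (illustrative input, not part of
 * the source): each line is "<frame> <type> [<qp>]", e.g.
 *     0 I 20      force frame 0 to IDR at QP 20
 *     5 P         force frame 5 to P, QP chosen by the encoder
 * Types: I=IDR, i=non-IDR I, K=keyframe, P=P, B=referenced B, b=B. */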

static int encode_frame( x264_t *h, hnd_t hout, x264_picture_t *pic, int64_t *last_dts )
{
    x264_picture_t pic_out;
    x264_nal_t *nal;
    int i_nal;
    int i_frame_size = 0;

    i_frame_size = x264_encoder_encode( h, &nal, &i_nal, pic, &pic_out );

    FAIL_IF_ERROR( i_frame_size < 0, "x264_encoder_encode failed\n" );

    if( i_frame_size )
    {
        i_frame_size = cli_output.write_frame( hout, nal[0].p_payload, i_frame_size, &pic_out );
        *last_dts = pic_out.i_dts;
    }

    return i_frame_size;
}

static int64_t print_status( int64_t i_start, int64_t i_previous, int i_frame, int i_frame_total, int64_t i_file, x264_param_t *param, int64_t last_ts )
{
    char buf[200];
    int64_t i_time = x264_mdate();
    if( i_previous && i_time - i_previous < UPDATE_INTERVAL )
        return i_previous;
    int64_t i_elapsed = i_time - i_start;
    double fps = i_elapsed > 0 ? i_frame * 1000000. / i_elapsed : 0;
    double bitrate;
    if( last_ts )
        bitrate = (double) i_file * 8 / ( (double) last_ts * 1000 * param->i_timebase_num / param->i_timebase_den );
    else
        bitrate = (double) i_file * 8 / ( (double) 1000 * param->i_fps_den / param->i_fps_num );
    if( i_frame_total )
    {
        int eta = i_elapsed * (i_frame_total - i_frame) / ((int64_t)i_frame * 1000000);
        sprintf( buf, "x264 [%.1f%%] %d/%d frames, %.2f fps, %.2f kb/s, eta %d:%02d:%02d",
                 100. * i_frame / i_frame_total, i_frame, i_frame_total, fps, bitrate,
                 eta/3600, (eta/60)%60, eta%60 );
    }
    else
    {
        sprintf( buf, "x264 %d frames: %.2f fps, %.2f kb/s", i_frame, fps, bitrate );
    }
    fprintf( stderr, "%s  \r", buf+5 ); /* +5 skips the "x264 " prefix, shown only in the console title */
    SetConsoleTitle( buf );
    fflush( stderr ); // needed in windows
    return i_time;
}
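
/* Worked numbers for the math above (illustrative): 10 s into the encode
 * (i_elapsed = 10000000 us) with 250 frames written, fps = 250 * 1e6 / 1e7
 * = 25.00; with i_file = 1250000 bytes and last_ts equal to 10 s worth of
 * ticks, bitrate = 1250000 * 8 / (10 * 1000) = 1000.00 kb/s. */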

static void convert_cli_to_lib_pic( x264_picture_t *lib, cli_pic_t *cli )
{
    memcpy( lib->img.i_stride, cli->img.stride, sizeof(cli->img.stride) );
    memcpy( lib->img.plane, cli->img.plane, sizeof(cli->img.plane) );
    lib->img.i_plane = cli->img.planes;
    lib->img.i_csp = cli->img.csp;
    lib->i_pts = cli->pts;
}

#define FAIL_IF_ERROR2( cond, ... )\
if( cond )\
{\
    x264_cli_log( "x264", X264_LOG_ERROR, __VA_ARGS__ );\
    retval = -1;\
    goto fail;\
}
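
/* Note: FAIL_IF_ERROR2 assumes the enclosing function declares "int retval"
 * and provides a "fail:" cleanup label, as encode() below does. */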

static int encode( x264_param_t *param, cli_opt_t *opt )
{
    x264_t *h = NULL;
    x264_picture_t pic;
    cli_pic_t cli_pic;
    const cli_pulldown_t *pulldown = NULL; // shut up gcc

    int     i_frame = 0;
    int     i_frame_output = 0;
    int64_t i_end, i_previous = 0, i_start = 0;
    int64_t i_file = 0;
    int     i_frame_size;
    int64_t last_dts = 0;
    int64_t prev_dts = 0;
    int64_t first_dts = 0;
#   define  MAX_PTS_WARNING 3 /* arbitrary */
    int     pts_warning_cnt = 0;
    int64_t largest_pts = -1;
    int64_t second_largest_pts = -1;
    int64_t ticks_per_frame;
    double  duration;
    double  pulldown_pts = 0;
    int     retval = 0;

    opt->b_progress &= param->i_log_level < X264_LOG_DEBUG;

    /* set up pulldown */
    if( opt->i_pulldown && !param->b_vfr_input )
    {
        param->b_pulldown = 1;
        param->b_pic_struct = 1;
        pulldown = &pulldown_values[opt->i_pulldown];
        param->i_timebase_num = param->i_fps_den;
        FAIL_IF_ERROR2( fmod( param->i_fps_num * pulldown->fps_factor, 1 ),
                        "unsupported framerate for chosen pulldown\n" )
        param->i_timebase_den = param->i_fps_num * pulldown->fps_factor;
    }
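    /* Illustrative example (assuming fps_factor 1.25 for the 3:2 pattern):
     * 24000/1001 fps film with --pulldown 32 yields timebase 1001/30000,
     * i.e. NTSC 30000/1001 output timing, while input frames still arrive
     * at 24000/1001 per second. */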

    h = x264_encoder_open( param );
    FAIL_IF_ERROR2( !h, "x264_encoder_open failed\n" );

    x264_encoder_parameters( h, param );

    FAIL_IF_ERROR2( cli_output.set_param( opt->hout, param ), "can't set outfile param\n" );

    i_start = x264_mdate();

    /* ticks/frame = ticks/second / frames/second */
    ticks_per_frame = (int64_t)param->i_timebase_den * param->i_fps_den / param->i_timebase_num / param->i_fps_num;
    FAIL_IF_ERROR2( ticks_per_frame < 1 && !param->b_vfr_input, "ticks_per_frame invalid: %"PRId64"\n", ticks_per_frame )
    ticks_per_frame = X264_MAX( ticks_per_frame, 1 );
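    /* e.g. (illustrative values): timebase 1/90000 at 30000/1001 fps gives
     * 90000 * 1001 / 1 / 30000 = 3003 ticks per frame. */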

    if( !param->b_repeat_headers )
    {
        // Write SPS/PPS/SEI
        x264_nal_t *headers;
        int i_nal;

        FAIL_IF_ERROR2( x264_encoder_headers( h, &headers, &i_nal ) < 0, "x264_encoder_headers failed\n" )
        FAIL_IF_ERROR2( (i_file = cli_output.write_headers( opt->hout, headers )) < 0, "error writing headers to output file\n" );
    }

    if( opt->tcfile_out )
        fprintf( opt->tcfile_out, "# timecode format v2\n" );
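    /* The resulting timecode v2 file lists one presentation time in
     * milliseconds per line, e.g. for 25 fps CFR output (illustrative):
     *     # timecode format v2
     *     0.000000
     *     40.000000
     *     80.000000 */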

    /* Encode frames */
    for( ; !b_ctrl_c && (i_frame < param->i_frame_total || !param->i_frame_total); i_frame++ )
    {
        if( filter.get_frame( opt->hin, &cli_pic, i_frame + opt->i_seek ) )
            break;
        x264_picture_init( &pic );
        convert_cli_to_lib_pic( &pic, &cli_pic );

        if( !param->b_vfr_input )
            pic.i_pts = i_frame;

        if( opt->i_pulldown && !param->b_vfr_input )
        {
            pic.i_pic_struct = pulldown->pattern[ i_frame % pulldown->mod ];
            pic.i_pts = (int64_t)( pulldown_pts + 0.5 );
            pulldown_pts += pulldown_frame_duration[pic.i_pic_struct];
        }
        else if( opt->timebase_convert_multiplier )
            pic.i_pts = (int64_t)( pic.i_pts * opt->timebase_convert_multiplier + 0.5 );

        if( pic.i_pts <= largest_pts )
        {
            if( cli_log_level >= X264_LOG_DEBUG || pts_warning_cnt < MAX_PTS_WARNING )
                x264_cli_log( "x264", X264_LOG_WARNING, "non-strictly-monotonic pts at frame %d (%"PRId64" <= %"PRId64")\n",
                             i_frame, pic.i_pts, largest_pts );
            else if( pts_warning_cnt == MAX_PTS_WARNING )
                x264_cli_log( "x264", X264_LOG_WARNING, "too many nonmonotonic pts warnings, suppressing further ones\n" );
            pts_warning_cnt++;
            pic.i_pts = largest_pts + ticks_per_frame;
        }

        second_largest_pts = largest_pts;
        largest_pts = pic.i_pts;
        if( opt->tcfile_out )
            fprintf( opt->tcfile_out, "%.6f\n", pic.i_pts * ((double)param->i_timebase_num / param->i_timebase_den) * 1e3 );

        if( opt->qpfile )
            parse_qpfile( opt, &pic, i_frame + opt->i_seek );

        prev_dts = last_dts;
        i_frame_size = encode_frame( h, opt->hout, &pic, &last_dts );
        if( i_frame_size < 0 )
        {
            b_ctrl_c = 1; /* lie to exit the loop */
            retval = -1;
        }
        else if( i_frame_size )
        {
            i_file += i_frame_size;
            i_frame_output++;
            if( i_frame_output == 1 )
                first_dts = prev_dts = last_dts;
        }

        if( filter.release_frame( opt->hin, &cli_pic, i_frame + opt->i_seek ) )
            break;

        /* update status line (up to 1000 times per input file) */
        if( opt->b_progress && i_frame_output )
            i_previous = print_status( i_start, i_previous, i_frame_output, param->i_frame_total, i_file, param, 2 * last_dts - prev_dts - first_dts );
    }
    /* Flush delayed frames */
    while( !b_ctrl_c && x264_encoder_delayed_frames( h ) )
    {
        prev_dts = last_dts;
        i_frame_size = encode_frame( h, opt->hout, NULL, &last_dts );
        if( i_frame_size < 0 )
        {
            b_ctrl_c = 1; /* lie to exit the loop */
            retval = -1;
        }
        else if( i_frame_size )
        {
            i_file += i_frame_size;
            i_frame_output++;
            if( i_frame_output == 1 )
                first_dts = prev_dts = last_dts;
        }
        if( opt->b_progress && i_frame_output )
            i_previous = print_status( i_start, i_previous, i_frame_output, param->i_frame_total, i_file, param, 2 * last_dts - prev_dts - first_dts );
    }
fail:
    if( pts_warning_cnt >= MAX_PTS_WARNING && cli_log_level < X264_LOG_DEBUG )
        x264_cli_log( "x264", X264_LOG_WARNING, "%d suppressed nonmonotonic pts warnings\n", pts_warning_cnt-MAX_PTS_WARNING );

    /* duration algorithm fails when only 1 frame is output */
    if( i_frame_output == 1 )
        duration = (double)param->i_fps_den / param->i_fps_num;
    else if( b_ctrl_c )
        duration = (double)(2 * last_dts - prev_dts - first_dts) * param->i_timebase_num / param->i_timebase_den;
    else
        duration = (double)(2 * largest_pts - second_largest_pts) * param->i_timebase_num / param->i_timebase_den;

    i_end = x264_mdate();
    /* Erase progress indicator before printing encoding stats. */
    if( opt->b_progress )
        fprintf( stderr, "                                                                               \r" );
    if( h )
        x264_encoder_close( h );
    fprintf( stderr, "\n" );

    if( b_ctrl_c )
        fprintf( stderr, "aborted at input frame %d, output frame %d\n", opt->i_seek + i_frame, i_frame_output );

    cli_output.close_file( opt->hout, largest_pts, second_largest_pts );
    opt->hout = NULL;

    if( i_frame_output > 0 )
    {
        double fps = (double)i_frame_output * (double)1000000 /
                     (double)( i_end - i_start );

        fprintf( stderr, "encoded %d frames, %.2f fps, %.2f kb/s\n", i_frame_output, fps,
                 (double) i_file * 8 / ( 1000 * duration ) );
    }

    return retval;
}
x264-snapshot-20120103-2245-stable/configure
#!/bin/bash

if test x"$1" = x"-h" -o x"$1" = x"--help" ; then
cat <<EOF
Usage: ./configure [options]

Help:
  -h, --help               print this message

Standard options:
  --prefix=PREFIX          install architecture-independent files in PREFIX
                           [/usr/local]
  --exec-prefix=EPREFIX    install architecture-dependent files in EPREFIX
                           [PREFIX]
  --bindir=DIR             install binaries in DIR [EPREFIX/bin]
  --libdir=DIR             install libs in DIR [EPREFIX/lib]
  --includedir=DIR         install includes in DIR [PREFIX/include]
  --extra-asflags=EASFLAGS add EASFLAGS to ASFLAGS
  --extra-cflags=ECFLAGS   add ECFLAGS to CFLAGS
  --extra-ldflags=ELDFLAGS add ELDFLAGS to LDFLAGS

Configuration options:
  --disable-cli            disable cli
  --system-libx264         use system libx264 instead of internal
  --enable-shared          build shared library
  --enable-static          build static library
  --disable-gpl            disable GPL-only features
  --disable-thread         disable multithreaded encoding
  --enable-win32thread     use win32threads (windows only)
  --disable-interlaced     disable interlaced encoding support
  --enable-visualize       enable visualization (X11 only)
  --bit-depth=BIT_DEPTH    set output bit depth (8-10) [8]
  --chroma-format=FORMAT   output chroma format (420, 422, 444, all) [all]

Advanced options:
  --disable-asm            disable platform-specific assembly optimizations
  --enable-debug           add -g
  --enable-gprof           add -pg
  --enable-strip           add -s
  --enable-pic             build position-independent code

Cross-compilation:
  --host=HOST              build programs to run on HOST
  --cross-prefix=PREFIX    use PREFIX for compilation tools
  --sysroot=SYSROOT        root of cross-build tree

External library support:
  --disable-avs            disable avisynth support (windows only)
  --disable-swscale        disable swscale support
  --disable-lavf           disable libavformat support
  --disable-ffms           disable ffmpegsource support
  --disable-gpac           disable gpac support

EOF
exit 1
fi

log_check() {
    echo -n "checking $1... " >> config.log
}

log_ok() {
    echo "yes" >> config.log
}

log_fail() {
    echo "no" >> config.log
}

log_msg() {
    echo "$1" >> config.log
}

intel_cflags() {
    # Intel Compiler issues an incredibly large number of warnings on any warning level,
    # suppress them by disabling all warnings rather than having to use #pragmas to disable most of them
    for arg in $*; do
        [ $arg = -ffast-math ] && arg=
        [[ "$arg" = -falign-loops* ]] && arg=
        [ "$arg" = -fno-tree-vectorize ] && arg=
        [ "$arg" = -Wshadow ] && arg=
        if [ $compiler = ICL ]; then
            [ "$arg" = -Wall ] && arg=-W0
            [ "$arg" = -g ] && arg=-Z7
            [ "$arg" = -fomit-frame-pointer ] && arg=
            [ "$arg" = -s ] && arg=
            [ "$arg" = -fPIC ] && arg=
        else
            [ "$arg" = -Wall ] && arg=-w0
        fi

        [ -n "$arg" ] && echo -n "$arg "
    done
}

icl_ldflags() {
    for arg in $*; do
        arg=${arg/LIBPATH/libpath}
        [ ${arg#-libpath:} == $arg -a ${arg#-l} != $arg ] && arg=${arg#-l}.lib
        [ ${arg#-L} != $arg ] && arg=-libpath:${arg#-L}
        [ $arg = -Wl,--large-address-aware ] && arg=-largeaddressaware
        [ $arg = -s ] && arg=
        [ "$arg" = -Wl,-Bsymbolic ] && arg=

        arg=${arg/pthreadGC/pthreadVC}
        [ "$arg" = avifil32.lib ] && arg=vfw32.lib
        [ "$arg" = gpac_static.lib ] && arg=libgpac_static.lib

        [ -n "$arg" ] && echo -n "$arg "
    done
}

cc_check() {
    if [ -z "$3" ]; then
        if [ -z "$1$2" ]; then
            log_check "whether $CC works"
        elif [ -z "$1" ]; then
            log_check "for $2"
        else
            log_check "for $1"
        fi
    elif [ -z "$1" ]; then
        if [ -z "$2" ]; then
            log_check "whether $CC supports $3"
        else
            log_check "whether $CC supports $3 with $2"
        fi
    else
        log_check "for $3 in $1";
    fi
    rm -f conftest.c
    [ -n "$1" ] && echo "#include <$1>" > conftest.c
    echo "int main () { $3 return 0; }" >> conftest.c
    if [ $compiler = ICL ]; then
        cc_cmd="$CC conftest.c $CFLAGS $2 -link $(icl_ldflags $2 $LDFLAGSCLI $LDFLAGS)"
    else
        cc_cmd="$CC conftest.c $CFLAGS $2 $LDFLAGSCLI $LDFLAGS -o conftest"
    fi
    if $cc_cmd >conftest.log 2>&1; then
        res=$?
        log_ok
    else
        res=$?
        log_fail
        log_msg "Failed commandline was:"
        log_msg "--------------------------------------------------"
        log_msg "$cc_cmd"
        cat conftest.log >> config.log
        log_msg "--------------------------------------------------"
        log_msg "Failed program was:"
        log_msg "--------------------------------------------------"
        cat conftest.c >> config.log
        log_msg "--------------------------------------------------"
    fi
    return $res
}
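
# Usage sketch (mirrors the real calls further down, e.g. the pthread probe):
#     cc_check pthread.h -lpthread "pthread_create(0,0,0,0);"
# builds a one-line conftest.c that includes the header, wraps the statement in
# main(), compiles and links it with the extra flags, and logs any failure to
# config.log; the return value is the compiler's exit status.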

cpp_check() {
    log_check "whether $3 is true"
    rm -f conftest.c
    [ -n "$1" ] && echo "#include <$1>" > conftest.c
    echo -e "#if !($3) \n#error $4 \n#endif " >> conftest.c

    if $CC conftest.c $CFLAGS $2 -E -o conftest >conftest.log 2>&1; then
        res=$?
        log_ok
    else
        res=$?
        log_fail
        log_msg "--------------------------------------------------"
        cat conftest.log >> config.log
        log_msg "--------------------------------------------------"
        log_msg "Failed program was:"
        log_msg "--------------------------------------------------"
        cat conftest.c >> config.log
        log_msg "--------------------------------------------------"
    fi
    return $res
}

as_check() {
    log_check "whether $AS supports $1"
    echo "$1" > conftest.asm
    if $AS conftest.asm $ASFLAGS $2 -o conftest.o >conftest.log 2>&1; then
        res=$?
        log_ok
    else
        res=$?
        log_fail
        log_msg "Failed commandline was:"
        log_msg "--------------------------------------------------"
        log_msg "$AS conftest.asm $ASFLAGS $2 -o conftest.o"
        cat conftest.log >> config.log
        log_msg "--------------------------------------------------"
        log_msg "Failed program was:"
        log_msg "--------------------------------------------------"
        cat conftest.asm >> config.log
        log_msg "--------------------------------------------------"
    fi
    return $res
}

define() {
    echo "#define $1$([ -n "$2" ] && echo " $2" || echo " 1")" >> config.h
}
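
# e.g. "define HAVE_MALLOC_H" appends "#define HAVE_MALLOC_H 1" to config.h,
# while the two-argument form "define fseek fseeko" appends "#define fseek fseeko".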

die() {
    log_msg "DIED: $@"
    echo "$@"
    exit 1
}

rm -f x264_config.h config.h config.mak config.log x264.pc x264.def conftest*

prefix='/usr/local'
exec_prefix='${prefix}'
bindir='${exec_prefix}/bin'
libdir='${exec_prefix}/lib'
includedir='${prefix}/include'
DEVNULL='/dev/null'

cli="yes"
cli_libx264="internal"
shared="no"
static="no"
avs="auto"
lavf="auto"
ffms="auto"
gpac="auto"
gpl="yes"
thread="auto"
swscale="auto"
asm="auto"
interlaced="yes"
debug="no"
gprof="no"
strip="no"
pic="no"
vis="no"
bit_depth="8"
chroma_format="all"
compiler="GNU"

CFLAGS="$CFLAGS -Wall -I."
LDFLAGS="$LDFLAGS"
LDFLAGSCLI="$LDFLAGSCLI"
ASFLAGS="$ASFLAGS"
HAVE_GETOPT_LONG=1
cross_prefix=""

EXE=""

# list of all preprocessor HAVE values we can define
CONFIG_HAVE="MALLOC_H ALTIVEC ALTIVEC_H MMX ARMV6 ARMV6T2 NEON BEOSTHREAD POSIXTHREAD WIN32THREAD THREAD LOG2F VISUALIZE SWSCALE LAVF FFMS GPAC GF_MALLOC AVS GPL VECTOREXT INTERLACED CPU_COUNT"

# parse options

for opt do
    optarg="${opt#*=}"
    case "$opt" in
        --prefix=*)
            prefix="$optarg"
            ;;
        --exec-prefix=*)
            exec_prefix="$optarg"
            ;;
        --bindir=*)
            bindir="$optarg"
            ;;
        --libdir=*)
            libdir="$optarg"
            ;;
        --includedir=*)
            includedir="$optarg"
            ;;
        --disable-cli)
            cli="no"
            ;;
        --system-libx264)
            cli_libx264="system"
            ;;
        --enable-shared)
            shared="yes"
            ;;
        --enable-static)
            static="yes"
            ;;
        --disable-asm)
            asm="no"
            ;;
        --disable-interlaced)
            interlaced="no"
            ;;
        --disable-avs)
            avs="no"
            ;;
        --disable-lavf)
            lavf="no"
            ;;
        --disable-ffms)
            ffms="no"
            ;;
        --disable-gpac)
            gpac="no"
            ;;
        --disable-gpl)
            gpl="no"
            ;;
        --extra-asflags=*)
            ASFLAGS="$ASFLAGS $optarg"
            ;;
        --extra-cflags=*)
            CFLAGS="$CFLAGS $optarg"
            ;;
        --extra-ldflags=*)
            LDFLAGS="$LDFLAGS $optarg"
            ;;
        --disable-thread)
            thread="no"
            ;;
        --enable-win32thread)
            thread="win32"
            ;;
        --disable-swscale)
            swscale="no"
            ;;
        --enable-debug)
            debug="yes"
            ;;
        --enable-gprof)
            CFLAGS="$CFLAGS -pg"
            LDFLAGS="$LDFLAGS -pg"
            gprof="yes"
            ;;
        --enable-strip)
            strip="yes"
            ;;
        --enable-pic)
            pic="yes"
            ;;
        --enable-visualize)
            vis="yes"
            ;;
        --host=*)
            host="$optarg"
            ;;
        --cross-prefix=*)
            cross_prefix="$optarg"
            ;;
        --sysroot=*)
            CFLAGS="$CFLAGS --sysroot=$optarg"
            LDFLAGS="$LDFLAGS --sysroot=$optarg"
            ;;
        --bit-depth=*)
            bit_depth="$optarg"
            if [ "$bit_depth" -lt "8" -o "$bit_depth" -gt "10" ]; then
                echo "Supplied bit depth must be in range [8,10]."
                exit 1
            fi
            bit_depth=`expr $bit_depth + 0`
            ;;
        --chroma-format=*)
            chroma_format="$optarg"
            if [ $chroma_format != "420" -a $chroma_format != "422" -a $chroma_format != "444" -a $chroma_format != "all" ]; then
                echo "Supplied chroma format must be 420, 422, 444 or all."
                exit 1
            fi
            ;;
        *)
            echo "Unknown option $opt, ignored"
            ;;
    esac
done

[ "$cli" = "no" -a "$shared" = "no" -a "$static" = "no" ] && die "Nothing to build. Enable cli, shared or static."

CC="${CC-${cross_prefix}gcc}"
AR="${AR-${cross_prefix}ar}"
RANLIB="${RANLIB-${cross_prefix}ranlib}"
STRIP="${STRIP-${cross_prefix}strip}"

if [ "x$host" = x ]; then
    host=`./config.guess`
fi
# normalize a triplet into a quadruplet
host=`./config.sub $host`

# split $host
host_cpu="${host%%-*}"
host="${host#*-}"
host_vendor="${host%%-*}"
host_os="${host#*-}"

# test for use of Intel Compiler
if [[ $host_os = mingw* || $host_os = cygwin* ]]; then
    if [[ `basename "$CC"` = icl* ]]; then
        # Windows Intel Compiler creates dependency generation with absolute Windows paths, Cygwin's make does not support Windows paths.
        [[ $host_os = cygwin* ]] && die "Windows Intel Compiler support requires MSYS"
        compiler=ICL
        CFLAGS="$CFLAGS -Qstd=c99 -nologo -Qms0 -DHAVE_STRING_H -Iextras"
        QPRE="-Q"
        `$CC 2>&1 | grep -q IA-32` && host_cpu=i486
        `$CC 2>&1 | grep -q "Intel(R) 64"` && host_cpu=x86_64
        cpp_check "" "" "_MSC_VER >= 1400" || die "Windows Intel Compiler support requires Visual Studio 2005 or newer"
    fi
else
    if [[ `basename "$CC"` = icc* ]]; then
        AR="xiar"
        compiler=ICC
        QPRE="-"
    fi
fi

case $host_os in
    beos*)
        SYS="BEOS"
        define HAVE_MALLOC_H
        ;;
    darwin*)
        SYS="MACOSX"
        CFLAGS="$CFLAGS -falign-loops=16"
        LDFLAGS="$LDFLAGS -lm"
        if [ "$pic" = "no" ]; then
            cc_check "" -mdynamic-no-pic && CFLAGS="$CFLAGS -mdynamic-no-pic"
        fi
        ;;
    freebsd*)
        SYS="FREEBSD"
        LDFLAGS="$LDFLAGS -lm"
        ;;
    kfreebsd*-gnu)
        SYS="FREEBSD"
        define HAVE_MALLOC_H
        LDFLAGS="$LDFLAGS -lm"
        ;;
    netbsd*)
        SYS="NETBSD"
        LDFLAGS="$LDFLAGS -lm"
        ;;
    openbsd*)
        SYS="OPENBSD"
        LDFLAGS="$LDFLAGS -lm"
        ;;
    *linux*)
        SYS="LINUX"
        define HAVE_MALLOC_H
        LDFLAGS="$LDFLAGS -lm"
        ;;
    gnu*)
        SYS="HURD"
        define HAVE_MALLOC_H
        LDFLAGS="$LDFLAGS -lm"
        ;;
    cygwin*)
        EXE=".exe"
        if cc_check "" -mno-cygwin; then
            CFLAGS="$CFLAGS -mno-cygwin"
            LDFLAGS="$LDFLAGS -mno-cygwin"
        fi
        if cpp_check "" "" "defined(__CYGWIN32__)" ; then
            define HAVE_MALLOC_H
            SYS="CYGWIN"
        else
            SYS="WINDOWS"
            DEVNULL="NUL"
        fi
        ;;
    mingw*)
        SYS="WINDOWS"
        EXE=".exe"
        DEVNULL="NUL"
        ;;
    sunos*|solaris*)
        SYS="SunOS"
        define HAVE_MALLOC_H
        LDFLAGS="$LDFLAGS -lm"
        if cc_check "" /usr/lib/64/values-xpg6.o; then
            LDFLAGS="$LDFLAGS /usr/lib/64/values-xpg6.o"
        else
            LDFLAGS="$LDFLAGS /usr/lib/values-xpg6.o"
        fi
        HAVE_GETOPT_LONG=0
        ;;
    *)
        die "Unknown system $host, edit the configure"
        ;;
esac

case $host_cpu in
    i*86)
        ARCH="X86"
        AS="yasm"
        ASFLAGS="$ASFLAGS -O2"
        if [ $compiler = GNU ]; then
            if [[ "$asm" == auto && "$CFLAGS" != *-march* ]]; then
                CFLAGS="$CFLAGS -march=i686"
            fi
            if [[ "$asm" == auto && "$CFLAGS" != *-mfpmath* ]]; then
                CFLAGS="$CFLAGS -mfpmath=sse -msse"
            fi
        else
            # icc on linux has various degrees of mod16 stack support
            if [ $SYS = LINUX ]; then
                # < 11 is completely incapable of keeping a mod16 stack
                if cpp_check "" "" "__INTEL_COMPILER < 1100" ; then
                    define BROKEN_STACK_ALIGNMENT
                # 11 <= x < 12 is capable of keeping a mod16 stack, but defaults to not doing so.
                elif cpp_check "" "" "__INTEL_COMPILER < 1200" ; then
                    CFLAGS="$CFLAGS -falign-stack=assume-16-byte"
                fi
                # >= 12 defaults to a mod16 stack
            fi
            # icl on windows has no mod16 stack support
            [ $SYS = WINDOWS ] && define BROKEN_STACK_ALIGNMENT
        fi
        if [ "$SYS" = MACOSX ]; then
            ASFLAGS="$ASFLAGS -f macho -DPREFIX"
        elif [ "$SYS" = WINDOWS -o "$SYS" = CYGWIN ]; then
            ASFLAGS="$ASFLAGS -f win32 -DPREFIX"
            LDFLAGS="$LDFLAGS -Wl,--large-address-aware"
        else
            ASFLAGS="$ASFLAGS -f elf"
        fi
        ;;
    x86_64)
        ARCH="X86_64"
        AS="yasm"
        if [ "$SYS" = MACOSX ]; then
            ASFLAGS="$ASFLAGS -f macho64 -m amd64 -DPIC -DPREFIX"
            if cc_check '' "-arch x86_64"; then
                CFLAGS="$CFLAGS -arch x86_64"
                LDFLAGS="$LDFLAGS -arch x86_64"
            fi
        elif [ "$SYS" = WINDOWS ]; then
            ASFLAGS="$ASFLAGS -f win32 -m amd64"
            # only the GNU toolchain is inconsistent in prefixing function names with _
            [ $compiler = GNU ] && cc_check "" "-S" && grep -q "_main:" conftest && ASFLAGS="$ASFLAGS -DPREFIX"
        else
            ASFLAGS="$ASFLAGS -f elf -m amd64"
        fi
        ;;
    powerpc|powerpc64)
        ARCH="PPC"
        if [ $asm = auto ] ; then
            define HAVE_ALTIVEC
            AS="${AS-${cross_prefix}gcc}"
            if [ $SYS = MACOSX ] ; then
                CFLAGS="$CFLAGS -faltivec -fastf -mcpu=G4"
            else
                CFLAGS="$CFLAGS -maltivec -mabi=altivec"
                define HAVE_ALTIVEC_H
            fi
        fi
        ;;
    sparc)
        ARCH="SPARC"
        case $(uname -m) in
            sun4u|sun4v)
                if [ $asm = auto ]; then
                    ARCH="UltraSPARC"
                    if ! echo $CFLAGS | grep -Eq '\-mcpu' ; then
                        CFLAGS="$CFLAGS -mcpu=ultrasparc"
                        LDFLAGS="$LDFLAGS -mcpu=ultrasparc"
                    fi
                    AS="${AS-${cross_prefix}as}"
                    ASFLAGS="$ASFLAGS -xarch=v8plusa"
                fi
                ;;
        esac
        ;;
    mips|mipsel|mips64|mips64el)
        ARCH="MIPS"
        ;;
    arm*)
        ARCH="ARM"
        if [ "$SYS" = MACOSX ] ; then
            AS="${AS-extras/gas-preprocessor.pl $CC}"
            ASFLAGS="$ASFLAGS -DPREFIX -DPIC"  # apple's ld doesn't support movw/movt relocations at all
            # build for armv7 by default
            if ! echo $CFLAGS | grep -Eq '\-arch' ; then
                CFLAGS="$CFLAGS -arch armv7"
                LDFLAGS="$LDFLAGS -arch armv7"
            fi
        else
            AS="${AS-${cross_prefix}gcc}"
        fi
        ;;
    s390|s390x)
        ARCH="S390"
        ;;
    hppa*|parisc*)
        ARCH="PARISC"
        ;;
    ia64)
        ARCH="IA64"
        ;;
    alpha*)
        ARCH="ALPHA"
        ;;
    *)
        ARCH="$(echo $host_cpu | tr a-z A-Z)"
        ;;
esac

log_msg "x264 configure script"
if [ -n "$*" ]; then
    msg="Command line options:"
    for i in $@; do
        msg="$msg \"$i\""
    done
    log_msg "$msg"
fi
log_msg ""

# check requirements

cc_check || die "No working C compiler found."

if [ $compiler != ICL ]; then
    if cc_check '' -std=gnu99 'for( int i = 0; i < 9; i++ );' ; then
        CFLAGS="$CFLAGS -std=gnu99"
    elif cc_check '' -std=c99 'for( int i = 0; i < 9; i++ );' ; then
        CFLAGS="$CFLAGS -std=c99 -D_POSIX_C_SOURCE=200112L -D_BSD_SOURCE"
    elif ! cc_check '' '' 'for( int i = 0; i < 9; i++ );' ; then
        die "C99 compiler is needed for compilation."
    fi
fi

if [ $shared = yes -a \( $ARCH = "X86_64" -o $ARCH = "PPC" -o $ARCH = "ALPHA" -o $ARCH = "ARM" -o $ARCH = "IA64" -o $ARCH = "PARISC" -o $ARCH = "MIPS" \) ] ; then
    pic="yes"
fi

if [ $asm = auto -a \( $ARCH = X86 -o $ARCH = X86_64 \) ] ; then
    if ! as_check "vpperm xmm0, xmm0, xmm0, xmm0" ; then
        VER=`($AS --version || echo no assembler) 2>/dev/null | head -n 1`
        echo "Found $VER"
        echo "Minimum version is yasm-1.0.0"
        echo "If you really want to compile without asm, configure with --disable-asm."
        exit 1
    fi
    if ! cc_check '' '' '__asm__("pabsw %xmm0, %xmm0");' ; then
        VER=`(${cross_prefix}as --version || echo no gnu as) 2>/dev/null | head -n 1`
        echo "Found $VER"
        echo "Minimum version is binutils-2.17"
        echo "Your compiler can't handle inline SSSE3 asm."
        echo "If you really want to compile without asm, configure with --disable-asm."
        exit 1
    fi
    define HAVE_MMX
fi

if [ $asm = auto -a $ARCH = ARM ] ; then
    # set flags so neon is built by default
    echo $CFLAGS | grep -Eq '(-mcpu|-march|-mfpu)' || CFLAGS="$CFLAGS -mcpu=cortex-a8 -mfpu=neon"

    if  cc_check '' '' '__asm__("rev ip, ip");' ; then      define HAVE_ARMV6
        cc_check '' '' '__asm__("movt r0, #0");'         && define HAVE_ARMV6T2
        cc_check '' '' '__asm__("vadd.i16 q0, q0, q0");' && define HAVE_NEON
        ASFLAGS="$ASFLAGS $CFLAGS -c"
    else
        echo "You specified a pre-ARMv6 or Thumb-1 CPU in your CFLAGS."
        echo "If you really want to run on such a CPU, configure with --disable-asm."
        exit 1
    fi
fi

[ $asm = no ] && AS=""
[ "x$AS" = x ] && asm="no" || asm="yes"

define ARCH_$ARCH
define SYS_$SYS

# skip endianness check for Intel Compiler, as all supported platforms are little. the -ipo flag will also cause the check to fail
if [ $compiler = GNU ]; then
    echo "int i[2] = {0x42494745,0}; double f[2] = {0x1.0656e6469616ep+102,0};" > conftest.c
    $CC $CFLAGS conftest.c -c -o conftest.o 2>/dev/null || die "endian test failed"
    if (${cross_prefix}strings -a conftest.o | grep -q BIGE) && (${cross_prefix}strings -a conftest.o | grep -q FPendian) ; then
        define WORDS_BIGENDIAN
    elif !(${cross_prefix}strings -a conftest.o | grep -q EGIB && ${cross_prefix}strings -a conftest.o | grep -q naidnePF) ; then
        die "endian test failed"
    fi
fi
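
# (How the check works: in a big-endian object file the integer constant's bytes
# spell "BIGE" and the double's mantissa spells "FPendian", which strings(1) can
# find; on little-endian targets only the reversed forms "EGIB"/"naidnePF" appear.)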

# autodetect options that weren't forced nor disabled

# pthread-win32 is lgpl, prevent its use if --disable-gpl is specified and targeting windows
[ "$SYS" = "WINDOWS" -a "$gpl" = "no" -a "$thread" = "auto" ] && thread="win32"

libpthread=""
if [ "$thread" = "auto" ]; then
    thread="no"
    case $SYS in
        BEOS)
            thread="beos"
            define HAVE_BEOSTHREAD
            ;;
        WINDOWS)
            if cc_check pthread.h -lpthread "pthread_create(0,0,0,0);" ; then
                thread="posix"
                libpthread="-lpthread"
            elif cc_check pthread.h -lpthreadGC2 "pthread_create(0,0,0,0);" ; then
                thread="posix"
                libpthread="-lpthreadGC2"
            elif cc_check pthread.h "-lpthreadGC2 -lwsock32 -DPTW32_STATIC_LIB" "pthread_create(0,0,0,0);" ; then
                thread="posix"
                libpthread="-lpthreadGC2 -lwsock32"
                define PTW32_STATIC_LIB
            elif cc_check pthread.h "-lpthreadGC2 -lws2_32 -DPTW32_STATIC_LIB" "pthread_create(0,0,0,0);" ; then
                thread="posix"
                libpthread="-lpthreadGC2 -lws2_32"
                define PTW32_STATIC_LIB
            else
                # default to native threading if pthread-win32 is unavailable
                thread="win32"
            fi
            ;;
        OPENBSD)
            cc_check pthread.h -pthread && thread="posix" && libpthread="-pthread"
            ;;
        *)
            cc_check pthread.h -lpthread && thread="posix" && libpthread="-lpthread"
            ;;
    esac
fi
if [ "$thread" = "posix" ]; then
    LDFLAGS="$LDFLAGS $libpthread"
    define HAVE_POSIXTHREAD
    if [ "$SYS" = "LINUX" ] && cc_check sched.h "-D_GNU_SOURCE -Werror" "cpu_set_t p_aff; return CPU_COUNT(&p_aff);" ; then
        define HAVE_CPU_COUNT
    fi
fi
if [ "$thread" = "win32" ]; then
    # cygwin does not support win32 threads
    if [ "$SYS" = "WINDOWS" ]; then
        define HAVE_WIN32THREAD
    else
        thread="no"
    fi
fi
[ "$thread" != "no" ] && define HAVE_THREAD

if cc_check "math.h" "-Werror" "return log2f(2);" ; then
    define HAVE_LOG2F
fi

if [ "$vis" = "yes" ] ; then
    save_CFLAGS="$CFLAGS"
    CFLAGS="$CFLAGS -I/usr/X11R6/include"
    if cc_check "X11/Xlib.h" "-L/usr/X11R6/lib -lX11" "XOpenDisplay(0);" ; then
        LDFLAGS="-L/usr/X11R6/lib -lX11 $LDFLAGS"
        define HAVE_VISUALIZE
    else
        vis="no"
        CFLAGS="$save_CFLAGS"
    fi
fi

if [ "$swscale" = "auto" ] ; then
    swscale="no"
    if ${cross_prefix}pkg-config --exists libswscale 2>/dev/null; then
        SWSCALE_LIBS="$SWSCALE_LIBS $(${cross_prefix}pkg-config --libs libswscale libavutil)"
        SWSCALE_CFLAGS="$SWSCALE_CFLAGS $(${cross_prefix}pkg-config --cflags libswscale libavutil)"
    fi
    [ -z "$SWSCALE_LIBS" ] && SWSCALE_LIBS="-lswscale -lavutil"

    if cc_check "libswscale/swscale.h" "$SWSCALE_CFLAGS $SWSCALE_LIBS" "sws_init_context(0,0,0);" ; then
        if cc_check "libavutil/pixdesc.h" "$SWSCALE_CFLAGS $SWSCALE_LIBS" "av_get_pix_fmt_name(0);" ; then
            swscale="yes"
        else
            echo "Warning: av_get_pix_fmt_name is missing from libavutil, update for swscale support"
        fi
    fi
fi

if [ "$lavf" = "auto" ] ; then
    lavf="no"
    if ${cross_prefix}pkg-config --exists libavformat libavcodec libswscale 2>/dev/null; then
        LAVF_LIBS="$LAVF_LIBS $(${cross_prefix}pkg-config --libs libavformat libavcodec libavutil libswscale)"
        LAVF_CFLAGS="$LAVF_CFLAGS $(${cross_prefix}pkg-config --cflags libavformat libavcodec libavutil libswscale)"
    fi
    if [ -z "$LAVF_LIBS" -a -z "$LAVF_CFLAGS" ]; then
        LAVF_LIBS="-lavformat"
        for lib in -lpostproc -lavcodec -lavcore -lswscale -lavutil -lm -lz -lbz2 $libpthread -lavifil32; do
            cc_check "" $lib && LAVF_LIBS="$LAVF_LIBS $lib"
        done
    fi
    LAVF_LIBS="-L. $LAVF_LIBS"
    if cc_check libavformat/avformat.h "$LAVF_CFLAGS $LAVF_LIBS" "avformat_find_stream_info(0,0); avcodec_open2(0,0,0);" ; then
        if [ "$swscale" = "yes" ]; then
            lavf="yes"
        else
            echo "Warning: libavformat is not supported without swscale support"
        fi
    fi
fi

if [ "$ffms" = "auto" ] ; then
    ffms_major="2"; ffms_minor="14"; ffms_micro="0"; ffms_bump="0"
    ffms="no"

    if ${cross_prefix}pkg-config --exists ffms2 2>/dev/null; then
        FFMS2_LIBS="$FFMS2_LIBS $(${cross_prefix}pkg-config --libs ffms2)"
        FFMS2_CFLAGS="$FFMS2_CFLAGS $(${cross_prefix}pkg-config --cflags ffms2)"
    fi
    [ -z "$FFMS2_LIBS" ] && FFMS2_LIBS="-lffms2"

    if cc_check ffms.h "$FFMS2_CFLAGS $FFMS2_LIBS" "FFMS_DestroyVideoSource(0);" ; then
        ffms="yes"
    elif cc_check ffms.h "$FFMS2_CFLAGS $FFMS2_LIBS -lstdc++ $LAVF_LIBS" "FFMS_DestroyVideoSource(0);" ; then
        ffms="yes"
        FFMS2_LIBS="$FFMS2_LIBS -lstdc++ $LAVF_LIBS"
    fi

    error="ffms must be at least version $ffms_major.$ffms_minor.$ffms_micro.$ffms_bump"
    if [ $ffms = "yes" ] && ! cpp_check "ffms.h" "$FFMS2_CFLAGS" "FFMS_VERSION >= (($ffms_major << 24) | ($ffms_minor << 16) | ($ffms_micro << 8) | $ffms_bump)" "$error"; then
       ffms="no"
       echo "Warning: $error"
    fi
    if [ "$ffms" = "yes" -a "$swscale" = "no" ]; then
        echo "Warning: ffms is not supported without swscale support"
        ffms="no"
    fi
fi

if [ "$swscale" = "yes" ]; then
    LDFLAGSCLI="$SWSCALE_LIBS $LDFLAGSCLI"
    CFLAGS="$CFLAGS $SWSCALE_CFLAGS"
    define HAVE_SWSCALE
    if [ "$lavf" = "yes" ]; then
        LDFLAGSCLI="$LAVF_LIBS $LDFLAGSCLI"
        CFLAGS="$CFLAGS $LAVF_CFLAGS"
        define HAVE_LAVF
    fi
    if [ "$ffms" = "yes" ]; then
        LDFLAGSCLI="$FFMS2_LIBS $LDFLAGSCLI"
        CFLAGS="$CFLAGS $FFMS2_CFLAGS"
        define HAVE_FFMS
    fi
fi

if [ "$gpac" = "auto" ] ; then
    gpac="no"
    cc_check "" -lz && GPAC_LIBS="-lgpac_static -lz" || GPAC_LIBS="-lgpac_static"
    if [ "$SYS" = "WINDOWS" ] ; then
        GPAC_LIBS="$GPAC_LIBS -lwinmm"
    fi
    if cc_check gpac/isomedia.h "$GPAC_LIBS" ; then
        if cc_check gpac/isomedia.h "$GPAC_LIBS" "gf_isom_set_pixel_aspect_ratio(0,0,0,0,0);" ; then
            gpac="yes"
        else
            echo "Warning: gpac is too old, update to 2007-06-21 UTC or later"
        fi
    fi
fi
if [ "$gpac" = "yes" ] ; then
    define HAVE_GPAC
    if cc_check gpac/isomedia.h "-Werror $GPAC_LIBS" "gf_malloc(1); gf_free(NULL);" ; then
        define HAVE_GF_MALLOC
    fi
    LDFLAGSCLI="$GPAC_LIBS $LDFLAGSCLI"
fi

if [ "$avs" = "auto" ] ; then
    avs="no"
    # cygwin can use avisynth if it can use LoadLibrary
    if [ $SYS = WINDOWS ] || ([ $SYS = CYGWIN ] && cc_check windows.h "" "LoadLibrary(0);") ; then
        avs="yes"
        define HAVE_AVS
    fi
fi

cc_check "stdint.h" "" "uint32_t test_vec __attribute__ ((vector_size (16))) = {0,1,2,3};" && define HAVE_VECTOREXT

if [ "$pic" = "yes" ] ; then
    CFLAGS="$CFLAGS -fPIC"
    ASFLAGS="$ASFLAGS -DPIC"
    # resolve textrels in the x86 asm
    cc_check stdio.h "-shared -Wl,-Bsymbolic" && SOFLAGS="$SOFLAGS -Wl,-Bsymbolic"
    [ $SYS = SunOS -a "$ARCH" = "X86" ] && SOFLAGS="$SOFLAGS -mimpure-text"
fi

if [ "$debug" != "yes" -a "$gprof" != "yes" ]; then
    CFLAGS="$CFLAGS -fomit-frame-pointer"
fi

if [ "$strip" = "yes" ]; then
    CFLAGS="$CFLAGS -s"
    LDFLAGS="$LDFLAGS -s"
fi

if [ "$debug" = "yes" ]; then
    CFLAGS="-O1 -g $CFLAGS"
elif [ $ARCH = ARM ]; then
    # arm-gcc-4.2 produces incorrect output with -ffast-math
    # and it doesn't save any speed anyway on 4.4, so disable it
    CFLAGS="-O3 -fno-fast-math $CFLAGS"
else
    CFLAGS="-O3 -ffast-math $CFLAGS"
fi

if cc_check '' -fno-tree-vectorize ; then
    CFLAGS="$CFLAGS -fno-tree-vectorize"
fi

if [ $SYS = WINDOWS -a $ARCH = X86 -a $compiler = GNU ] ; then
    # workaround gcc/ld bug with alignment of static variables/arrays that are initialized to zero
    cc_check '' -fno-zero-initialized-in-bss && CFLAGS="$CFLAGS -fno-zero-initialized-in-bss"
fi

if cc_check "stdio.h" "" "fseeko(stdin,0,0);" ; then
    define fseek fseeko
    define ftell ftello
elif cc_check "stdio.h" "" "fseeko64(stdin,0,0);" ; then
    define fseek fseeko64
    define ftell ftello64
elif cc_check "stdio.h" "" "_fseeki64(stdin,0,0);" ; then
    define fseek _fseeki64
    define ftell _ftelli64
fi

if cc_check '' -Wshadow ; then
    CFLAGS="-Wshadow $CFLAGS"
fi

if [ "$bit_depth" -gt "8" ]; then
    define HIGH_BIT_DEPTH
    ASFLAGS="$ASFLAGS -DHIGH_BIT_DEPTH"
fi

if [ "$chroma_format" != "all" ]; then
    define CHROMA_FORMAT CHROMA_$chroma_format
fi

ASFLAGS="$ASFLAGS -DBIT_DEPTH=$bit_depth"

[ $gpl = yes ] && define HAVE_GPL && x264_gpl=1 || x264_gpl=0

[ $interlaced = yes ] && define HAVE_INTERLACED && x264_interlaced=1 || x264_interlaced=0

#define undefined vars as 0
for var in $CONFIG_HAVE; do
    grep -q "HAVE_$var 1" config.h || define HAVE_$var 0
done

if [ $compiler = ICL ]; then
    AR="xilib -nologo -out:"
    DEPMM=-QMM
    DEPMT=-QMT
    HAVE_GETOPT_LONG=0
    LD="xilink -out:"
    LDFLAGS="-nologo -incremental:no $(icl_ldflags $LDFLAGS)"
    LDFLAGSCLI="$(icl_ldflags $LDFLAGSCLI)"
    LIBX264=libx264.lib
    RANLIB=
    STRIP=
    if [ $debug = yes ]; then
        LDFLAGS="-debug $LDFLAGS"
        CFLAGS="-D_DEBUG $CFLAGS"
    else
        CFLAGS="-DNDEBUG $CFLAGS"
    fi
else
    AR="$AR rc "
    DEPMM="-MM -g0"
    DEPMT="-MT"
    LD="$CC -o "
    LIBX264=libx264.a
fi
if [ $compiler = GNU ]; then
    PROF_GEN_CC="-fprofile-generate"
    PROF_GEN_LD="-fprofile-generate"
    PROF_USE_CC="-fprofile-use"
    PROF_USE_LD="-fprofile-use"
else
    CFLAGS="$(intel_cflags $CFLAGS)"
    # icc does not define __SSE__ until SSE2 optimization and icl never defines it or _M_IX86_FP
    [ \( $ARCH = X86_64 -o $ARCH = X86 \) -a $asm = yes ] && ! cpp_check "" "" "defined(__SSE__)" && define __SSE__
    PROF_GEN_CC="${QPRE}prof-gen ${QPRE}prof-dir."
    PROF_GEN_LD=
    PROF_USE_CC="${QPRE}prof-use ${QPRE}prof-dir."
    PROF_USE_LD=
fi

rm -f conftest*

# generate exported config file

config_chroma_format="X264_CSP_I$chroma_format"
[ "$config_chroma_format" == "X264_CSP_Iall" ] && config_chroma_format="0"
cat > x264_config.h << EOF
#define X264_BIT_DEPTH     $bit_depth
#define X264_GPL           $x264_gpl
#define X264_INTERLACED    $x264_interlaced
#define X264_CHROMA_FORMAT $config_chroma_format
EOF

# generate config files

cat > config.mak << EOF
prefix=$prefix
exec_prefix=$exec_prefix
bindir=$bindir
libdir=$libdir
includedir=$includedir
ARCH=$ARCH
SYS=$SYS
CC=$CC
CFLAGS=$CFLAGS
DEPMM=$DEPMM
DEPMT=$DEPMT
LD=$LD
LDFLAGS=$LDFLAGS
LIBX264=$LIBX264
AR=$AR
RANLIB=$RANLIB
STRIP=$STRIP
AS=$AS
ASFLAGS=$ASFLAGS
EXE=$EXE
HAVE_GETOPT_LONG=$HAVE_GETOPT_LONG
DEVNULL=$DEVNULL
PROF_GEN_CC=$PROF_GEN_CC
PROF_GEN_LD=$PROF_GEN_LD
PROF_USE_CC=$PROF_USE_CC
PROF_USE_LD=$PROF_USE_LD
EOF

if [ $compiler = ICL ]; then
    echo '%.o: %.c' >> config.mak
    echo '	$(CC) $(CFLAGS) -c -Fo$@ $<' >> config.mak
fi

if [ "$cli" = "yes" ]; then
    echo 'default: cli' >> config.mak
    echo 'install: install-cli' >> config.mak
fi

if [ "$shared" = "yes" ]; then
    API=$(grep '#define X264_BUILD' < x264.h | cut -f 3 -d ' ')
    if [ "$SYS" = "WINDOWS" -o "$SYS" = "CYGWIN" ]; then
        echo "SONAME=libx264-$API.dll" >> config.mak
        if [ $compiler = ICL ]; then
            echo 'IMPLIBNAME=libx264.dll.lib' >> config.mak
            # GNU ld on windows defaults to exporting all global functions if there are no explicit __declspec(dllexport) declarations
            # MSVC link does not act similarly, so it is required to make an export definition out of x264.h and use it at link time
            echo "SOFLAGS=-dll -def:x264.def -implib:\$(IMPLIBNAME) $SOFLAGS" >> config.mak
            echo "EXPORTS" > x264.def
            grep "^\(int\|void\|x264_t\|extern\).*x264.*[\[(;]" x264.h | sed -e "s/.*\(x264.*\)[\[(].*/\1/;s/.*\(x264.*\);/\1/;s/open/open_$API/g" >> x264.def
        else
            echo 'IMPLIBNAME=libx264.dll.a' >> config.mak
            echo "SOFLAGS=-shared -Wl,--out-implib,\$(IMPLIBNAME) -Wl,--enable-auto-image-base $SOFLAGS" >> config.mak
        fi
    elif [ "$SYS" = "MACOSX" ]; then
        echo "SOSUFFIX=dylib" >> config.mak
        echo "SONAME=libx264.$API.dylib" >> config.mak
        echo "SOFLAGS=-shared -dynamiclib -Wl,-single_module -Wl,-read_only_relocs,suppress -install_name \$(DESTDIR)\$(libdir)/\$(SONAME) $SOFLAGS" >> config.mak
    elif [ "$SYS" = "SunOS" ]; then
        echo "SOSUFFIX=so" >> config.mak
        echo "SONAME=libx264.so.$API" >> config.mak
        echo "SOFLAGS=-shared -Wl,-h,\$(SONAME) $SOFLAGS" >> config.mak
    else
        echo "SOSUFFIX=so" >> config.mak
        echo "SONAME=libx264.so.$API" >> config.mak
        echo "SOFLAGS=-shared -Wl,-soname,\$(SONAME) $SOFLAGS" >> config.mak
    fi
    echo 'default: lib-shared' >> config.mak
    echo 'install: install-lib-shared' >> config.mak
fi

if [ "$static" = "yes" ]; then
    echo 'default: lib-static' >> config.mak
    echo 'install: install-lib-static' >> config.mak
fi

if [ "$cli_libx264" = "system" ] ; then
    if [ "$shared" = "yes" ]; then
        CLI_LIBX264='$(SONAME)'
    elif ${cross_prefix}pkg-config --exists x264 2>/dev/null; then
        LDFLAGSCLI="$LDFLAGSCLI $(${cross_prefix}pkg-config --libs x264)"
        CLI_LIBX264=
    else
        die "Can not find system libx264"
    fi
else
    CLI_LIBX264='$(LIBX264)'
fi
echo "LDFLAGSCLI = $LDFLAGSCLI" >> config.mak
echo "CLI_LIBX264 = $CLI_LIBX264" >> config.mak

./version.sh >> x264_config.h

pclibs="-L$libdir -lx264 $libpthread"

cat > x264.pc << EOF
prefix=$prefix
exec_prefix=$exec_prefix
libdir=$libdir
includedir=$includedir

Name: x264
Description: H.264 (MPEG4 AVC) encoder library
Version: $(grep POINTVER < x264_config.h | sed -e 's/.* "//; s/".*//')
Libs: $pclibs
Cflags: -I$includedir
EOF

filters="crop select_every"
gpl_filters=""
[ $swscale = yes ] && filters="resize $filters"
[ $gpl = yes ] && filters="$filters $gpl_filters"

cat > conftest.log <<EOF
platform:      $ARCH
system:        $SYS
cli:           $cli
libx264:       $cli_libx264
shared:        $shared
static:        $static
asm:           $asm
interlaced:    $interlaced
avs:           $avs
lavf:          $lavf
ffms:          $ffms
gpac:          $gpac
gpl:           $gpl
thread:        $thread
filters:       $filters
debug:         $debug
gprof:         $gprof
strip:         $strip
PIC:           $pic
visualize:     $vis
bit depth:     $bit_depth
chroma format: $chroma_format
EOF

echo >> config.log
cat conftest.log >> config.log
cat conftest.log
rm conftest.log

echo
echo "You can run 'make' or 'make fprofiled' now."

x264-snapshot-20120103-2245-stable/x264dll.c
/*****************************************************************************
 * x264dll: x264 DLLMain for win32
 *****************************************************************************
 * Copyright (C) 2009-2011 x264 project
 *
 * Authors: Anton Mitrofanov <BugMaster@narod.ru>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "common/common.h"
#include <windows.h>

/* Callback for our DLL so we can initialize pthread */
BOOL WINAPI DllMain( HANDLE hinstDLL, DWORD fdwReason, LPVOID lpvReserved )
{
#if PTW32_STATIC_LIB
    switch( fdwReason )
    {
        case DLL_PROCESS_ATTACH:
            pthread_win32_process_attach_np();
            /* fall through: the attaching thread also needs per-thread init */

        case DLL_THREAD_ATTACH:
            pthread_win32_thread_attach_np();
            break;

        case DLL_THREAD_DETACH:
            pthread_win32_thread_detach_np();
            break;

        case DLL_PROCESS_DETACH:
            pthread_win32_thread_detach_np();
            pthread_win32_process_detach_np();
            break;
    }
#endif

    return TRUE;
}
x264-snapshot-20120103-2245-stable/version.sh
#!/bin/bash
git rev-list HEAD | sort > config.git-hash
LOCALVER=`wc -l config.git-hash | awk '{print $1}'`
if [ $LOCALVER -gt 1 ] ; then
    VER=`git rev-list origin/master | sort | join config.git-hash - | wc -l | awk '{print $1}'`
    if [ $VER != $LOCALVER ] ; then
        VER="$VER+$(($LOCALVER-$VER))"
    fi
    if git status | grep -q "modified:" ; then
        VER="${VER}M"
    fi
    VER="$VER $(git rev-list HEAD -n 1 | cut -c 1-7)"
    echo "#define X264_VERSION \" r$VER\""
else
    echo "#define X264_VERSION \"\""
    VER="x"
fi
rm -f config.git-hash
API=`grep '#define X264_BUILD' < x264.h | sed -e 's/.* \([1-9][0-9]*\).*/\1/'`
echo "#define X264_POINTVER \"0.$API.$VER\""
x264-snapshot-20120103-2245-stable/tools/xyuv.c
/*****************************************************************************
 * xyuv.c: an SDL YUV 4:2:0 planar viewer.
 *****************************************************************************
 * Copyright (C) 2004 Laurent Aimar <fenrir@via.ecp.fr>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *****************************************************************************/

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <stdint.h>

#include <SDL/SDL.h>

#define YUV_MAX 20
#define SDL_TITLE "xyuv: %s - %d/%d - %.2ffps"
typedef struct
{
    /* globals */
    int     i_width;
    int     i_height;
    int     i_frame_size;
    int     i_frame;
    int     i_frames;
    float   f_fps;

    float   f_y;

    int     b_pause;
    int     b_grid;
    int     b_split;
    int     b_diff;
    int     i_join;

    /* Constructed picture */
    int     i_wall_width;   /* in picture count */

    /* YUV files */
    int     i_yuv;
    struct
    {
        char    *name;
        FILE    *f;         /* handles */
        int     i_frames;   /* frames count */

        /* Position in the whole picture */
        int     x, y;
    } yuv[YUV_MAX];

    /* SDL */
    int i_sdl_width;
    int i_sdl_height;

    int i_display_width;
    int i_display_height;
    char *title;

    SDL_Surface *screen;
    SDL_Overlay *overlay;

    /* */
    uint8_t *pic;

} xyuv_t;

xyuv_t xyuv = {
    .i_width = 0,
    .i_height = 0,
    .i_frame  = 1,
    .i_frames = 0,
    .f_fps = 25.0,
    .f_y = 0.0,
    .i_wall_width = 0,

    .i_yuv = 0,

    .b_pause = 0,
    .b_split = 0,
    .b_diff = 0,
    .i_join = -1,

    .title = NULL,
    .pic = NULL,
};

static void help( void )
{
    fprintf( stderr,
             "Syntax: xyuv [options] file [file2 ...]\n"
             "\n"
             "      --help                  Print this help\n"
             "\n"
             "  -s, --size <WIDTHxHEIGHT>   Set input size\n"
             "  -w, --width <integer>       Set width\n"
             "  -h, --height <integer>      Set height\n"
             "\n"
             "  -S, --split                 Show splited Y/U/V planes\n"
             "  -d, --diff                  Show difference (only 2 files) in split mode\n"
             "  -j, --joint <integer>\n"
             "\n"
             "  -y <float>                  Set Y factor\n"
             "\n"
             "  -g, --grid                  Show a grid (macroblock 16x16)\n"
             "  -W <integer>                Set wall width (in picture count)\n"
             "  -f, --fps <float>           Set fps\n"
             "\n" );
}
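
/* Typical invocations (illustrative):
 *     xyuv -s 352x288 raw.yuv                  view one 4:2:0 file
 *     xyuv -d a_640x480.yuv b_640x480.yuv      diff two files, size taken
 *                                              from the first file name */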

static void xyuv_count_frames( xyuv_t *xyuv );
static void xyuv_detect( int *pi_width, int *pi_height );
static void xyuv_display( xyuv_t *xyuv, int i_frame );

int main( int argc, char **argv )
{
    int i;

    /* Parse command line */
    for( i = 1; i < argc; i++ ) {
        if( !strcasecmp( argv[i], "--help" ) ) {
            help();
            return 0;
        }
        if( !strcmp( argv[i], "-d" ) || !strcasecmp( argv[i], "--diff" ) ) {
            xyuv.b_diff = 1;
        } else if( !strcmp( argv[i], "-S" ) || !strcasecmp( argv[i], "--split" ) ) {
            xyuv.b_split = 1;
        } else if( !strcmp( argv[i], "-f" ) || !strcasecmp( argv[i], "--fps" ) ) {
            if( i >= argc -1 ) goto err_missing_arg;
            xyuv.f_fps = atof( argv[++i] );
        } else if( !strcmp( argv[i], "-h" ) || !strcasecmp( argv[i], "--height" ) ) {
            if( i >= argc -1 ) goto err_missing_arg;
            xyuv.i_height = atoi( argv[++i] );
        } else if( !strcmp( argv[i], "-w" ) || !strcasecmp( argv[i], "--width" ) ) {
            if( i >= argc -1 ) goto err_missing_arg;
            xyuv.i_width = atoi( argv[++i] );
        } else if( !strcmp( argv[i], "-s" ) || !strcasecmp( argv[i], "--size" ) ) {
            char *p;

            if( i >= argc -1 ) goto err_missing_arg;

            xyuv.i_width = strtol( argv[++i], &p, 0 );
            p++;
            xyuv.i_height = atoi( p );
        } else if( !strcmp( argv[i], "-W" ) ) {
            if( i >= argc -1 ) goto err_missing_arg;
            xyuv.i_wall_width = atoi( argv[++i] );
        } else if( !strcmp( argv[i], "-y" ) ) {
            if( i >= argc -1 ) goto err_missing_arg;
            xyuv.f_y = atof( argv[++i] );
        } else if( !strcmp( argv[i], "-j" ) || !strcasecmp( argv[i], "--join" ) ) {
            if( i >= argc -1 ) goto err_missing_arg;
            xyuv.i_join = atoi( argv[++i] );
        } else if( !strcmp( argv[i], "-g" ) || !strcasecmp( argv[i], "--grid" ) ) {
            xyuv.b_grid = 1;
        } else {
            FILE *f = fopen( argv[i], "rb" );
            if( !f ) {
                fprintf( stderr, "cannot open YUV %s\n", argv[i] );
            } else {
                xyuv.yuv[xyuv.i_yuv].name = strdup( argv[i] );
                xyuv.yuv[xyuv.i_yuv].f = f;
                xyuv.yuv[xyuv.i_yuv].i_frames = 0;

                xyuv.i_yuv++;
            }
        }
    }

    if( xyuv.i_yuv == 0 ) {
        fprintf( stderr, "no file to display\n" );
        return -1;
    }
    if( xyuv.i_width == 0 || xyuv.i_height == 0 ) {
        char *psz = xyuv.yuv[0].name;
        char *num;
        char *x;
        /* See if we find widthxheight in the file name */
        for( ;; ) {
            if( !( x = strchr( psz+1, 'x' ) ) ) {
                break;
            }
            num = x;
            while( num > psz && num[-1] >= '0' && num[-1] <= '9' )
                num--;

            if( num != x && x[1] >= '0' && x[1] <= '9' ) {
                xyuv.i_width = atoi( num );
                xyuv.i_height = atoi( x+1 );
                break;
            }
            psz = x;
        }
        fprintf( stderr, "file name gives %dx%d\n", xyuv.i_width, xyuv.i_height );
    }
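    /* e.g. a file named "foo_640x480.yuv" yields 640x480 (illustrative): the
     * scan above looks for a <digits>x<digits> pattern anywhere in the name. */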

    if( xyuv.i_width == 0 || xyuv.i_height == 0 ) {
        xyuv_detect( &xyuv.i_width, &xyuv.i_height );
    }

    if( xyuv.i_width == 0 || xyuv.i_height == 0 ) {
        fprintf( stderr, "invalid or missing frames size\n" );
        return -1;
    }
    if( xyuv.b_diff && xyuv.i_yuv != 2 ) {
        fprintf( stderr, "--diff works only with 2 files\n" );
        return -1;
    }
    if( xyuv.i_join >= 0 && ( xyuv.i_join == 0 || xyuv.i_join >= xyuv.i_width || xyuv.i_yuv != 2 ) ) {
        fprintf( stderr, "--join works only with two files and a column in range [1, width-1]\n" );
        return -1;
    }
    if( xyuv.i_join > 0 && xyuv.i_join % 2 != 0 ) {
        /* chroma is subsampled by 2, so keep the join column even */
        if( xyuv.i_join + 1 < xyuv.i_width )
            xyuv.i_join++;
        else
            xyuv.i_join--;
    }

    /* Now check frames */
    fprintf( stderr, "displaying :\n" );
    xyuv.i_frame_size = 3 * xyuv.i_width * xyuv.i_height / 2;
    xyuv_count_frames( &xyuv );
    for( i = 0; i < xyuv.i_yuv; i++ ) {
        fprintf( stderr, " - '%s' : %d frames\n", xyuv.yuv[i].name, xyuv.yuv[i].i_frames );
    }

    if( xyuv.i_frames == 0 ) {
        fprintf( stderr, "no frames to display\n" );
    }

    xyuv.pic = malloc( xyuv.i_frame_size );
    if( !xyuv.pic ) {
        fprintf( stderr, "malloc failed\n" );
        return -1;
    }

    /* calculate SDL view */
    if( xyuv.i_wall_width > xyuv.i_yuv ) {
        xyuv.i_wall_width = xyuv.i_yuv;
    }
    if( xyuv.i_wall_width == 0 ) {
        while( xyuv.i_wall_width < xyuv.i_yuv && xyuv.i_wall_width * xyuv.i_wall_width < xyuv.i_yuv ) {
            xyuv.i_wall_width++;
        }
    }

    for( i = 0; i < xyuv.i_yuv; i++ ) {
        if( xyuv.b_diff || xyuv.i_join > 0 ) {
            xyuv.yuv[i].x = 0;
            xyuv.yuv[i].y = 0;
        } else if( xyuv.b_split ) {
            xyuv.yuv[i].x = (i%xyuv.i_wall_width) * 3 * xyuv.i_width / 2;
            xyuv.yuv[i].y = (i/xyuv.i_wall_width) * xyuv.i_height;
        } else {
            xyuv.yuv[i].x = (i%xyuv.i_wall_width) * xyuv.i_width;
            xyuv.yuv[i].y = (i/xyuv.i_wall_width) * xyuv.i_height;
        }
    }
    if( xyuv.b_diff ) {
        xyuv.i_sdl_width = 3 * xyuv.i_width / 2;
        xyuv.i_sdl_height= xyuv.i_height;
    } else if( xyuv.i_join > 0 ) {
        xyuv.i_sdl_width = xyuv.i_width;
        xyuv.i_sdl_height= xyuv.i_height;
    } else if( xyuv.b_split ) {
        xyuv.i_sdl_width = xyuv.i_wall_width * 3 * xyuv.i_width / 2;
        xyuv.i_sdl_height= xyuv.i_height * ( ( xyuv.i_yuv  + xyuv.i_wall_width - 1 ) / xyuv.i_wall_width );
    } else {
        xyuv.i_sdl_width = xyuv.i_wall_width * xyuv.i_width;
        xyuv.i_sdl_height= xyuv.i_height * ( ( xyuv.i_yuv  + xyuv.i_wall_width - 1 ) / xyuv.i_wall_width );
    }
    xyuv.i_display_width = xyuv.i_sdl_width;
    xyuv.i_display_height = xyuv.i_sdl_height;

    /* Open SDL */
    if( SDL_Init( SDL_INIT_EVENTTHREAD|SDL_INIT_NOPARACHUTE|SDL_INIT_VIDEO) ) {
        fprintf( stderr, "cannot init SDL\n" );
        return -1;
    }

    SDL_EnableKeyRepeat(SDL_DEFAULT_REPEAT_DELAY, 100 );
    SDL_EventState( SDL_KEYUP, SDL_IGNORE );

    xyuv.screen = SDL_SetVideoMode( xyuv.i_sdl_width, xyuv.i_sdl_height, 0,
                                    SDL_HWSURFACE|SDL_RESIZABLE|
                                    SDL_ASYNCBLIT|SDL_HWACCEL );
    if( xyuv.screen == NULL ) {
        fprintf( stderr, "SDL_SetVideoMode failed\n" );
        return -1;
    }

    SDL_LockSurface( xyuv.screen );
    xyuv.overlay = SDL_CreateYUVOverlay( xyuv.i_sdl_width, xyuv.i_sdl_height,
                                         SDL_YV12_OVERLAY,
                                         xyuv.screen );
    if( xyuv.overlay == NULL ) {
        fprintf( stderr, "SDL_CreateYUVOverlay failed\n" );
        SDL_UnlockSurface( xyuv.screen );
        return -1;
    }

    /* reset to black: Y=0 with neutral chroma U=V=128 */
    memset( xyuv.overlay->pixels[0],   0, xyuv.overlay->pitches[0] * xyuv.i_sdl_height );
    memset( xyuv.overlay->pixels[1], 128, xyuv.overlay->pitches[1] * xyuv.i_sdl_height / 2 );
    memset( xyuv.overlay->pixels[2], 128, xyuv.overlay->pitches[2] * xyuv.i_sdl_height / 2 );
    SDL_UnlockSurface( xyuv.screen );

    for( ;; ) {
        SDL_Event event;
        static int b_fullscreen = 0;
        int64_t i_start = SDL_GetTicks();
        int i_wait;

        if( !xyuv.b_pause ) {
            xyuv_display( &xyuv, xyuv.i_frame );
        }

        for( ;; ) {
            int b_refresh = 0;
            while( SDL_PollEvent( &event ) )  {
                switch( event.type )
                {
                    case SDL_QUIT:
                        if( b_fullscreen )
                            SDL_WM_ToggleFullScreen( xyuv.screen );
                        exit( 1 );

                    case SDL_KEYDOWN:
                        switch( event.key.keysym.sym )
                        {
                            case SDLK_q:
                            case SDLK_ESCAPE:
                                if( b_fullscreen )
                                    SDL_WM_ToggleFullScreen( xyuv.screen );
                                exit(1);

                            case SDLK_f:
                                if( SDL_WM_ToggleFullScreen( xyuv.screen ) )
                                    b_fullscreen = 1 - b_fullscreen;
                                break;

                            case SDLK_g:
                                if( xyuv.b_grid )
                                    xyuv.b_grid = 0;
                                else
                                    xyuv.b_grid = 1;
                                if( xyuv.b_pause )
                                    b_refresh = 1;
                                break;

                            case SDLK_SPACE:
                                if( xyuv.b_pause )
                                    xyuv.b_pause = 0;
                                else
                                    xyuv.b_pause = 1;
                                break;
                            case SDLK_LEFT:
                                if( xyuv.i_frame > 1 ) xyuv.i_frame--;
                                b_refresh = 1;
                                break;

                            case SDLK_RIGHT:
                                if( xyuv.i_frame >= xyuv.i_frames )
                                    xyuv_count_frames( &xyuv );
                                if( xyuv.i_frame < xyuv.i_frames ) xyuv.i_frame++;
                                b_refresh = 1;
                                break;

                            case SDLK_HOME:
                                xyuv.i_frame = 1;
                                if( xyuv.b_pause )
                                    b_refresh = 1;
                                break;

                            case SDLK_END:
                                xyuv_count_frames( &xyuv );
                                xyuv.i_frame = xyuv.i_frames;
                                b_refresh = 1;
                                break;

                            case SDLK_UP:
                                xyuv.i_frame += xyuv.i_frames / 20;

                                if( xyuv.i_frame >= xyuv.i_frames )
                                    xyuv_count_frames( &xyuv );

                                if( xyuv.i_frame > xyuv.i_frames )
                                    xyuv.i_frame = xyuv.i_frames;
                                b_refresh = 1;
                                break;

                            case SDLK_DOWN:
                                xyuv.i_frame -= xyuv.i_frames / 20;
                                if( xyuv.i_frame < 1 )
                                    xyuv.i_frame = 1;
                                b_refresh = 1;
                                break;

                            case SDLK_PAGEUP:
                                xyuv.i_frame += xyuv.i_frames / 10;

                                if( xyuv.i_frame >= xyuv.i_frames )
                                    xyuv_count_frames( &xyuv );

                                if( xyuv.i_frame > xyuv.i_frames )
                                    xyuv.i_frame = xyuv.i_frames;
                                b_refresh = 1;
                                break;

                            case SDLK_PAGEDOWN:
                                xyuv.i_frame -= xyuv.i_frames / 10;
                                if( xyuv.i_frame < 1 )
                                    xyuv.i_frame = 1;
                                b_refresh = 1;
                                break;

                            default:
                                break;
                        }
                        break;
                    case SDL_VIDEORESIZE:
                        xyuv.i_display_width = event.resize.w;
                        xyuv.i_display_height = event.resize.h;
                        xyuv.screen = SDL_SetVideoMode( xyuv.i_display_width, xyuv.i_display_height, 0,
                                                        SDL_HWSURFACE|SDL_RESIZABLE|
                                                        SDL_ASYNCBLIT|SDL_HWACCEL );
                        xyuv_display( &xyuv, xyuv.i_frame );
                        break;

                    default:
                        break;
                }
            }
            if( b_refresh ) {
                xyuv.b_pause = 1;
                xyuv_display( &xyuv, xyuv.i_frame );
            }
            /* wait out the rest of the frame period, polling for events at least every 200 ms */
            i_wait = 1000 / xyuv.f_fps - ( SDL_GetTicks() - i_start);
            if( i_wait < 0 )
                break;
            else if( i_wait > 200 )
                SDL_Delay( 200 );
            else {
                SDL_Delay( i_wait );
                break;
            }
        }
        if( !xyuv.b_pause ) {
            /* next frame */
            if( xyuv.i_frame == xyuv.i_frames )
                    xyuv.b_pause = 1;
            else if( xyuv.i_frame < xyuv.i_frames )
                xyuv.i_frame++;
        }
    }


    return 0;

err_missing_arg:
    fprintf( stderr, "missing arg for option=%s\n", argv[i] );
    return -1;
}


static void xyuv_display( xyuv_t *xyuv, int i_frame )
{
    SDL_Rect rect;
    int i_picture = 0;
    int i;

    if( i_frame > xyuv->i_frames )
        return;

    xyuv->i_frame = i_frame;

    /* Load and copy picture data */
    for( i = 0; i < xyuv->i_yuv; i++ ) {
        int i_plane;

        fprintf( stderr, "yuv[%d] %d/%d\n", i, i_frame, xyuv->yuv[i].i_frames );
        if( i_frame - 1 >= xyuv->yuv[i].i_frames ) {
            xyuv_count_frames( xyuv );
            if( i_frame - 1 >= xyuv->yuv[i].i_frames )
                continue;
        }
        i_picture++;

        fseek( xyuv->yuv[i].f, (xyuv->i_frame-1) * xyuv->i_frame_size, SEEK_SET );
        fread( xyuv->pic, xyuv->i_frame_size, 1, xyuv->yuv[i].f );

        SDL_LockYUVOverlay( xyuv->overlay );

        if( xyuv->b_diff || xyuv->b_split ) {
            /* Reset UV */
            for( i_plane = 1; i_plane < 3; i_plane++ ) {
                memset( xyuv->overlay->pixels[i_plane], 128, xyuv->overlay->pitches[i_plane] * xyuv->overlay->h / 2 );
            }
            /* Show diff in Y plane of overlay */
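            /* All three planes are packed side by side into the overlay's Y
             * plane: Y at (0,0), U at (w,0), V at (w,h/2) -- hence the
             * 3*w/2 x h view used for --diff and --split. */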

            for( i_plane = 0; i_plane < 3; i_plane++ ) {
                int div = i_plane == 0 ? 1 : 2;
                uint8_t *src = xyuv->pic;
                uint8_t *dst = xyuv->overlay->pixels[0] +
                                (xyuv->yuv[i].x + xyuv->yuv[i].y * xyuv->overlay->pitches[0] );
                int j;
                if( i_plane == 1 ) {
                    src +=  5*xyuv->i_width * xyuv->i_height/4;
                    dst += xyuv->i_width;
                } else if( i_plane == 2 ) {
                    src += xyuv->i_width * xyuv->i_height;
                    dst += xyuv->i_width + xyuv->i_height / 2 * xyuv->overlay->pitches[0];
                }

                for( j = 0; j < xyuv->i_height / div; j++ ) {
                    if( i_picture == 1 || xyuv->b_split ) {
                        memcpy( dst, src, xyuv->i_width / div );
                    } else {
                        int k;
                        for( k = 0; k < xyuv->i_width / div; k++ ) {
                            dst[k] = abs( dst[k] - src[k]);
                        }
                    }
                    src += xyuv->i_width / div;
                    dst += xyuv->overlay->pitches[0];
                }
            }
        } else {
            for( i_plane = 0; i_plane < 3; i_plane++ ) {
                int div = i_plane == 0 ? 1 : 2;
                uint8_t *src = xyuv->pic;
                uint8_t *dst = xyuv->overlay->pixels[i_plane] +
                                ((xyuv->yuv[i].x + xyuv->yuv[i].y * xyuv->overlay->pitches[i_plane] ) / div );
                int w = xyuv->i_width / div;
                int j;
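                /* The input file is assumed I420 (Y, U, V in memory) while
                 * SDL's YV12 overlay stores planes as Y, V, U: overlay plane 1
                 * therefore reads the file's V data (offset 5*w*h/4) and
                 * plane 2 its U data (offset w*h). */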

                if( i_plane == 1 ) {
                    src +=  5*xyuv->i_width * xyuv->i_height/4;
                } else if( i_plane == 2 ) {
                    src += xyuv->i_width * xyuv->i_height;
                }
                if( xyuv->i_join > 0 ) {
                    if( i_picture > 1 ) {
                        src += xyuv->i_join / div;
                        dst += xyuv->i_join / div;
                        w = (xyuv->i_width - xyuv->i_join) /div;
                    } else {
                        w = xyuv->i_join / div;
                    }
                }

                for( j = 0; j < xyuv->i_height / div; j++ ) {
                    memcpy( dst, src, w );
                    src += xyuv->i_width / div;
                    dst += xyuv->overlay->pitches[i_plane];
                }
            }
        }

        SDL_UnlockYUVOverlay( xyuv->overlay );
    }

    if( xyuv->f_y != 0.0 ) {
        uint8_t *pix = xyuv->overlay->pixels[0];
        int j;

        for( j = 0; j < xyuv->i_sdl_height; j++ ) {
            int k;
            for( k = 0; k < xyuv->i_sdl_width; k++ ) {
                int v= pix[k] * xyuv->f_y;
                if( v > 255 )
                    pix[k] = 255;
                else if( v < 0 )
                    pix[k] = 0;
                else
                    pix[k] = v;
            }
            pix += xyuv->overlay->pitches[0];
        }
    }
    if( xyuv->b_grid ) {
        int x, y;
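        /* Dot every 4th pixel that falls on a 16-pixel (macroblock) boundary. */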

        for( y = 0; y < xyuv->i_sdl_height; y += 4 ) {
            uint8_t *p = xyuv->overlay->pixels[0] + y * xyuv->overlay->pitches[0];
            for( x = 0; x < xyuv->i_sdl_width; x += 4 ) {
                if( x % 16 == 0 || y % 16 == 0 )
                    p[x] = 0;
            }
        }
    }

    /* Update display */
    rect.x = 0;
    rect.y = 0;
    rect.w = xyuv->i_display_width;
    rect.h = xyuv->i_display_height;
    SDL_DisplayYUVOverlay( xyuv->overlay, &rect );

    /* Display title */
    if( xyuv->title )
        free( xyuv->title );
    asprintf( &xyuv->title, SDL_TITLE, xyuv->yuv[0].name, xyuv->i_frame, xyuv->i_frames, xyuv->f_fps );
    SDL_WM_SetCaption( xyuv->title, "" );
}

static void xyuv_count_frames( xyuv_t *xyuv )
{
    int i;

    xyuv->i_frames = 0;
    if( xyuv->i_frame_size <= 0 )
        return;

    for( i = 0; i < xyuv->i_yuv; i++ ) {
        /* Ugly, but avoids a dependency on fstat */
        fseek( xyuv->yuv[i].f, 0, SEEK_END );

        xyuv->yuv[i].i_frames = ftell( xyuv->yuv[i].f ) / xyuv->i_frame_size;
        fprintf( stderr, "count (%d) -> %d\n", i, xyuv->yuv[i].i_frames );

        fseek( xyuv->yuv[i].f, 0, SEEK_SET );

        if( xyuv->i_frames < xyuv->yuv[i].i_frames )
            xyuv->i_frames = xyuv->yuv[i].i_frames;
    }
}

static inline int ssd( int a ) { return a*a; }

static void xyuv_detect( int *pi_width, int *pi_height )
{
    static const int pi_size[][2] = {
        {128, 96},
        {160,120},
        {320,244},
        {320,288},

        /* PAL */
        {176,144},  // QCIF
        {352,288},  // CIF
        {352,576},  // 1/2 D1
        {480,576},  // 2/3 D1
        {544,576},
        {640,576},  // VGA
        {704,576},  // D1
        {720,576},  // D1

        /* NTSC */
        {176,112},  // QCIF
        {320,240},  // MPEG I
        {352,240},  // CIF
        {352,480},  // 1/2 D1
        {480,480},  // 2/3 D1
        {544,480},
        {640,480},  // VGA
        {704,480},  // D1
        {720,480},  // D1

        /* */
        {0,0},
    };
    int i_max;
    int i_size_max;
    uint8_t *pic;
    int i;

    *pi_width = 0;
    *pi_height = 0;

    /* Compute size max */
    for( i_max = 0, i_size_max = 0;
            pi_size[i_max][0] != 0 && pi_size[i_max][1] != 0; i_max++ ) {
        int s = pi_size[i_max][0] * pi_size[i_max][1] * 3 / 2;

        if( i_size_max < s )
            i_size_max = s;
    }

    /* Temporary buffer, large enough for 3 frames of the biggest candidate size */
    i_size_max *= 3;
    pic = malloc( i_size_max );

    fprintf( stderr, "guessing size for:\n" );
    for( i = 0; i < xyuv.i_yuv; i++ ) {
        int j;
        int i_read;
        double dbest = 255*255;
        int    i_best = i_max;
        int64_t t;

        fprintf( stderr, " - %s\n", xyuv.yuv[i].name );

        i_read = fread( pic, 1, i_size_max, xyuv.yuv[i].f );
        if( i_read < 0 )
            continue;

        /* Check if file size is at least compatible with one format
         * (if not, ignore file size)*/
        fseek( xyuv.yuv[i].f, 0, SEEK_END );
        t = ftell( xyuv.yuv[i].f );
        fseek( xyuv.yuv[i].f, 0, SEEK_SET );
        for( j = 0; j < i_max; j++ ) {
            const int w = pi_size[j][0];
            const int h = pi_size[j][1];
            const int s = w * h * 3 / 2;

            if( t % s == 0 )
                break;
        }
        if( j == i_max )
            t = 0;


        /* Try all size */
        for( j = 0; j < i_max; j++ ) {
            const int w = pi_size[j][0];
            const int h = pi_size[j][1];
            const int s = w * h * 3 / 2;
            double dd;

            int x, y, n;
            int64_t d;

            /* Too small: we need at least 3 frames at this size */
            if( i_read < 3*s )
                continue;
            /* Check file size */
            if( ( t > 0 && (t % s) != 0  ) ) {
                fprintf( stderr, "  * %dx%d ignored (incompatible file size)\n", w, h );
                continue;
            }


            /* Simple SSD between consecutive lines: at the correct stride,
             * adjacent lines are strongly correlated, so the true frame size
             * minimizes the score */
            d = 0;
            for( n = 0; n < 3; n++ ) {
                uint8_t *p;

                /* Y */
                p = &pic[n*s];
                for( y = 0; y < h-1; y++ ) {
                    for( x = 0; x < w; x++ )
                        d += ssd( p[x] - p[w+x] );
                    p += w;
                }

                /* U */
                p = &pic[n*s+w*h];
                for( y = 0; y < h/2-1; y++ ) {
                    for( x = 0; x < w/2; x++ )
                        d += ssd( p[x] - p[(w/2)+x] );
                    p += w/2;
                }

                /* V */
                p = &pic[n*s+5*w*h/4];
                for( y = 0; y < h/2-1; y++ ) {
                    for( x = 0; x < w/2; x++ )
                        d += ssd( p[x] - p[(w/2)+x] );
                    p += w/2;
                }
            }
            dd = (double)d / (3*w*h*3/2);
            fprintf( stderr, "  * %dx%d d=%f\n", w, h, dd );

            if( dd < dbest ) {
                i_best = j;
                dbest = dd;
            }
        }

        fseek( xyuv.yuv[i].f, 0, SEEK_SET );

        if( i_best < i_max ) {
            fprintf( stderr, "  -> %dx%d\n", pi_size[i_best][0], pi_size[i_best][1] );
            *pi_width = pi_size[i_best][0];
            *pi_height = pi_size[i_best][1];
        }
    }

    free( pic );
}
x264-snapshot-20120103-2245-stable/tools/test_x264.py0000755000175000001440000003731511700673342021161 0ustar  videolanusers#!/usr/bin/env python

import operator

from optparse import OptionGroup

import sys

from time import time

from digress.cli import Dispatcher as _Dispatcher
from digress.errors import ComparisonError, FailedTestError, DisabledTestError
from digress.testing import depends, comparer, Fixture, Case
from digress.comparers import compare_pass
from digress.scm import git as x264git

from subprocess import Popen, PIPE, STDOUT

import os
import re
import shlex
import inspect

from random import randrange, seed
from math import ceil

from itertools import imap, izip

os.chdir(os.path.join(os.path.dirname(__file__), ".."))

# options

OPTIONS = [
    [ "--tune %s" % t for t in ("film", "zerolatency") ],
    ("", "--intra-refresh"),
    ("", "--no-cabac"),
    ("", "--interlaced"),
    ("", "--slice-max-size 1000"),
    ("", "--frame-packing 5"),
    [ "--preset %s" % p for p in ("ultrafast",
                                  "superfast",
                                  "veryfast",
                                  "faster",
                                  "fast",
                                  "medium",
                                  "slow",
                                  "slower",
                                  "veryslow",
                                  "placebo") ]
]

# end options

def compare_yuv_output(width, height):
    def _compare_yuv_output(file_a, file_b):
        size_a = os.path.getsize(file_a)
        size_b = os.path.getsize(file_b)

        if size_a != size_b:
            raise ComparisonError("%s is not the same size as %s" % (
                file_a,
                file_b
            ))

        BUFFER_SIZE = 8196

        offset = 0

        with open(file_a, "rb") as f_a:
            with open(file_b, "rb") as f_b:
                for chunk_a, chunk_b in izip(
                    imap(
                        lambda i: f_a.read(BUFFER_SIZE),
                        xrange(size_a // BUFFER_SIZE + 1)
                    ),
                    imap(
                        lambda i: f_b.read(BUFFER_SIZE),
                        xrange(size_b // BUFFER_SIZE + 1)
                    )
                ):
                    chunk_size = len(chunk_a)

                    if chunk_a != chunk_b:
                        for i in xrange(chunk_size):
                            if chunk_a[i] != chunk_b[i]:
                                # calculate the macroblock, plane and frame from the offset
                                offs = offset + i

                                y_plane_area = width * height
                                u_plane_area = y_plane_area + y_plane_area * 0.25
                                v_plane_area = u_plane_area + y_plane_area * 0.25

                                pixel = offs % v_plane_area
                                frame = offs // v_plane_area
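                                # e.g. with width=176, height=144 one frame is
                                # 38016 bytes, so offset 40000 -> frame 1,
                                # pixel 1984 -> Y plane, (x=48, y=11)
                                # -> macroblock (3, 1)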

                                if pixel < y_plane_area:
                                    plane = "Y"

                                    pixel_x = pixel % width
                                    pixel_y = pixel // width

                                    macroblock = (ceil(pixel_x / 16.0), ceil(pixel_y / 16.0))
                                elif pixel < u_plane_area:
                                    plane = "U"

                                    pixel -= y_plane_area

                                    # chroma planes are half-width in 4:2:0
                                    pixel_x = pixel % (width // 2)
                                    pixel_y = pixel // (width // 2)

                                    macroblock = (ceil(pixel_x / 8.0), ceil(pixel_y / 8.0))
                                else:
                                    plane = "V"

                                    pixel -= u_plane_area

                                    pixel_x = pixel % (width // 2)
                                    pixel_y = pixel // (width // 2)

                                    macroblock = (ceil(pixel_x / 8.0), ceil(pixel_y / 8.0))

                                macroblock = tuple([ int(x) for x in macroblock ])

                                raise ComparisonError("%s differs from %s at frame %d, " \
                                                      "macroblock %s on the %s plane (offset %d)" % (
                                    file_a,
                                    file_b,
                                    frame,
                                    macroblock,
                                    plane,
                                    offs)
                                )

                    offset += chunk_size

    return _compare_yuv_output

def program_exists(program):
    def is_exe(fpath):
        return os.path.exists(fpath) and os.access(fpath, os.X_OK)

    fpath, fname = os.path.split(program)

    if fpath:
        if is_exe(program):
            return program
    else:
        for path in os.environ["PATH"].split(os.pathsep):
            exe_file = os.path.join(path, program)
            if is_exe(exe_file):
                return exe_file

    return None

class x264(Fixture):
    scm = x264git

class Compile(Case):
    @comparer(compare_pass)
    def test_configure(self):
        Popen([
            "make",
            "distclean"
        ], stdout=PIPE, stderr=STDOUT).communicate()

        configure_proc = Popen([
            "./configure"
        ] + self.fixture.dispatcher.configure, stdout=PIPE, stderr=STDOUT)

        output = configure_proc.communicate()[0]
        if configure_proc.returncode != 0:
            raise FailedTestError("configure failed: %s" % output.replace("\n", " "))

    @depends("configure")
    @comparer(compare_pass)
    def test_make(self):
        make_proc = Popen([
            "make",
            "-j5"
        ], stdout=PIPE, stderr=STDOUT)

        output = make_proc.communicate()[0]
        if make_proc.returncode != 0:
            raise FailedTestError("make failed: %s" % output.replace("\n", " "))

_dimension_pattern = re.compile(r"\w+ [[]info[]]: (\d+)x(\d+)[pi] \d+:\d+ @ \d+/\d+ fps [(][vc]fr[)]")

def _YUVOutputComparisonFactory():
    class YUVOutputComparison(Case):
        _dimension_pattern = _dimension_pattern

        depends = [ Compile ]
        options = []

        def __init__(self):
            for name, meth in inspect.getmembers(self):
                if name[:5] == "test_" and name[5:] not in self.fixture.dispatcher.yuv_tests:
                    delattr(self.__class__, name)

        def _run_x264(self):
            x264_proc = Popen([
                "./x264",
                "-o",
                "%s.264" % self.fixture.dispatcher.video,
                "--dump-yuv",
                "x264-output.yuv"
            ] + self.options + [
                self.fixture.dispatcher.video
            ], stdout=PIPE, stderr=STDOUT)

            output = x264_proc.communicate()[0]
            if x264_proc.returncode != 0:
                raise FailedTestError("x264 did not complete properly: %s" % output.replace("\n", " "))

            matches = _dimension_pattern.match(output)

            return (int(matches.group(1)), int(matches.group(2)))

        @comparer(compare_pass)
        def test_jm(self):
            if not program_exists("ldecod"): raise DisabledTestError("jm unavailable")

            try:
                runres = self._run_x264()

                jm_proc = Popen([
                    "ldecod",
                    "-i",
                    "%s.264" % self.fixture.dispatcher.video,
                    "-o",
                    "jm-output.yuv"
                ], stdout=PIPE, stderr=STDOUT)

                output = jm_proc.communicate()[0]
                if jm_proc.returncode != 0:
                    raise FailedTestError("jm did not complete properly: %s" % output.replace("\n", " "))

                try:
                    compare_yuv_output(*runres)("x264-output.yuv", "jm-output.yuv")
                except ComparisonError, e:
                    raise FailedTestError(e)
            finally:
                try: os.remove("x264-output.yuv")
                except: pass

                try: os.remove("%s.264" % self.fixture.dispatcher.video)
                except: pass

                try: os.remove("jm-output.yuv")
                except: pass

                try: os.remove("log.dec")
                except: pass

                try: os.remove("dataDec.txt")
                except: pass

        @comparer(compare_pass)
        def test_ffmpeg(self):
            if not program_exists("ffmpeg"): raise DisabledTestError("ffmpeg unavailable")
            try:
                runres = self._run_x264()

                ffmpeg_proc = Popen([
                    "ffmpeg",
                    "-vsync",
                    "0",
                    "-i",
                    "%s.264" % self.fixture.dispatcher.video,
                    "ffmpeg-output.yuv"
                ], stdout=PIPE, stderr=STDOUT)

                output = ffmpeg_proc.communicate()[0]
                if ffmpeg_proc.returncode != 0:
                    raise FailedTestError("ffmpeg did not complete properly: %s" % output.replace("\n", " "))

                try:
                    compare_yuv_output(*runres)("x264-output.yuv", "ffmpeg-output.yuv")
                except ComparisonError, e:
                    raise FailedTestError(e)
            finally:
                try: os.remove("x264-output.yuv")
                except: pass

                try: os.remove("%s.264" % self.fixture.dispatcher.video)
                except: pass

                try: os.remove("ffmpeg-output.yuv")
                except: pass

    return YUVOutputComparison

class Regression(Case):
    depends = [ Compile ]

    _psnr_pattern = re.compile(r"x264 [[]info[]]: PSNR Mean Y:\d+[.]\d+ U:\d+[.]\d+ V:\d+[.]\d+ Avg:\d+[.]\d+ Global:(\d+[.]\d+) kb/s:\d+[.]\d+")
    _ssim_pattern = re.compile(r"x264 [[]info[]]: SSIM Mean Y:(\d+[.]\d+) [(]\d+[.]\d+db[)]")

    def __init__(self):
        if self.fixture.dispatcher.x264:
            self.__class__.__name__ += " %s" % " ".join(self.fixture.dispatcher.x264)

    def test_psnr(self):
        try:
            x264_proc = Popen([
                "./x264",
                "-o",
                "%s.264" % self.fixture.dispatcher.video,
                "--psnr"
            ] + self.fixture.dispatcher.x264 + [
                self.fixture.dispatcher.video
            ], stdout=PIPE, stderr=STDOUT)

            output = x264_proc.communicate()[0]

            if x264_proc.returncode != 0:
                raise FailedTestError("x264 did not complete properly: %s" % output.replace("\n", " "))

            for line in output.split("\n"):
                if line.startswith("x264 [info]: PSNR Mean"):
                    return float(self._psnr_pattern.match(line).group(1))

            raise FailedTestError("no PSNR output caught from x264")
        finally:
            try: os.remove("%s.264" % self.fixture.dispatcher.video)
            except: pass

    def test_ssim(self):
        try:
            x264_proc = Popen([
                "./x264",
                "-o",
                "%s.264" % self.fixture.dispatcher.video,
                "--ssim"
            ] + self.fixture.dispatcher.x264 + [
                self.fixture.dispatcher.video
            ], stdout=PIPE, stderr=STDOUT)

            output = x264_proc.communicate()[0]

            if x264_proc.returncode != 0:
                raise FailedTestError("x264 did not complete properly: %s" % output.replace("\n", " "))

            for line in output.split("\n"):
                if line.startswith("x264 [info]: SSIM Mean"):
                    return float(self._ssim_pattern.match(line).group(1))

            raise FailedTestError("no PSNR output caught from x264")
        finally:
            try: os.remove("%s.264" % self.fixture.dispatcher.video)
            except: pass

def _generate_random_commandline():
    commandline = []

    for suboptions in OPTIONS:
        commandline.append(suboptions[randrange(0, len(suboptions))])

    return filter(None, reduce(operator.add, [ shlex.split(opt) for opt in commandline ]))

_generated = []

fixture = x264()
fixture.register_case(Compile)

fixture.register_case(Regression)

class Dispatcher(_Dispatcher):
    video = "akiyo_qcif.y4m"
    products = 50
    configure = []
    x264 = []
    yuv_tests = [ "jm" ]

    def _populate_parser(self):
        super(Dispatcher, self)._populate_parser()

        # don't do a whole lot with this
        tcase = _YUVOutputComparisonFactory()

        yuv_tests = [ name[5:] for name, meth in filter(lambda pair: pair[0][:5] == "test_", inspect.getmembers(tcase)) ]

        group = OptionGroup(self.optparse, "x264 testing-specific options")

        group.add_option(
            "-v",
            "--video",
            metavar="FILENAME",
            action="callback",
            dest="video",
            type=str,
            callback=lambda option, opt, value, parser: setattr(self, "video", value),
            help="yuv video to perform testing on (default: %s)" % self.video
        )

        group.add_option(
            "-s",
            "--seed",
            metavar="SEED",
            action="callback",
            dest="seed",
            type=int,
            callback=lambda option, opt, value, parser: setattr(self, "seed", value),
            help="seed for the random number generator (default: unix timestamp)"
        )

        group.add_option(
            "-p",
            "--product-tests",
            metavar="NUM",
            action="callback",
            dest="video",
            type=int,
            callback=lambda option, opt, value, parser: setattr(self, "products", value),
            help="number of cartesian products to generate for yuv comparison testing (default: %d)" % self.products
        )

        group.add_option(
            "--configure-with",
            metavar="FLAGS",
            action="callback",
            dest="configure",
            type=str,
            callback=lambda option, opt, value, parser: setattr(self, "configure", shlex.split(value)),
            help="options to run ./configure with"
        )

        group.add_option(
            "--yuv-tests",
            action="callback",
            dest="yuv_tests",
            type=str,
            callback=lambda option, opt, value, parser: setattr(self, "yuv_tests", [
                val.strip() for val in value.split(",")
            ]),
            help="select tests to run with yuv comparisons (default: %s, available: %s)" % (
                ", ".join(self.yuv_tests),
                ", ".join(yuv_tests)
            )
        )

        group.add_option(
            "--x264-with",
            metavar="FLAGS",
            action="callback",
            dest="x264",
            type=str,
            callback=lambda option, opt, value, parser: setattr(self, "x264", shlex.split(value)),
            help="additional options to run ./x264 with"
        )

        self.optparse.add_option_group(group)

    def pre_dispatch(self):
        if not hasattr(self, "seed"):
            self.seed = int(time())

        print "Using seed: %d" % self.seed
        seed(self.seed)

        for i in xrange(self.products):
            YUVOutputComparison = _YUVOutputComparisonFactory()

            commandline = _generate_random_commandline()

            counter = 0

            while commandline in _generated:
                counter += 1
                commandline = _generate_random_commandline()

                if counter > 100:
                    print >>sys.stderr, "Maximum command-line regeneration exceeded. "  \
                                        "Try a different seed or specify fewer products to generate."
                    sys.exit(1)

            commandline += self.x264

            _generated.append(commandline)

            YUVOutputComparison.options = commandline
            YUVOutputComparison.__name__ = ("%s %s" % (YUVOutputComparison.__name__, " ".join(commandline)))

            fixture.register_case(YUVOutputComparison)

Dispatcher(fixture).dispatch()
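
# Example invocations (illustrative; subcommands map to the @dispatchable
# methods of digress.testing.Fixture):
#   ./tools/test_x264.py run HEAD
#   ./tools/test_x264.py --seed 1234 -p 20 run
#   ./tools/test_x264.py bisect <good-rev> <bad-rev>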
x264-snapshot-20120103-2245-stable/tools/q_matrix_jvt.cfg0000644000175000001440000000271011700673342022221 0ustar  videolanusers# This is an example configuration file for initializing the quantization matrices.
# Altogether there are 6 matrices for 4x4 blocks and 2 matrices for 8x8 blocks.
# The values range from 1 to 255.
# If first value of matrix is equal to 0, default values ("JVT") will be used
# for that matrix.
# If a matrix is completely omitted, it will be filled with 16s.
#
# Note: JM expects CHROMAU and CHROMAV to be specified separately, whereas
# x264 forces them to use the same matrix. If U and V are specified to have
# different matrices, only the first is used.
####################################################################################
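#
# Example (illustrative): feed this file to x264 via its custom-matrix option:
#   x264 --cqmfile q_matrix_jvt.cfg -o out.264 input.y4m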

INTRA4X4_LUMA =
 6,13,20,28,
13,20,28,32,
20,28,32,37,
28,32,37,42

INTRA4X4_CHROMAU =
 6,13,20,28,
13,20,28,32,
20,28,32,37,
28,32,37,42

INTRA4X4_CHROMAV =
 6,13,20,28,
13,20,28,32,
20,28,32,37,
28,32,37,42

INTER4X4_LUMA =
10,14,20,24,
14,20,24,27,
20,24,27,30,
24,27,30,34

INTER4X4_CHROMAU =
10,14,20,24,
14,20,24,27,
20,24,27,30,
24,27,30,34

INTER4X4_CHROMAV =
10,14,20,24,
14,20,24,27,
20,24,27,30,
24,27,30,34

INTRA8X8_LUMA =
 6,10,13,16,18,23,25,27,
10,11,16,18,23,25,27,29,
13,16,18,23,25,27,29,31,
16,18,23,25,27,29,31,33,
18,23,25,27,29,31,33,36,
23,25,27,29,31,33,36,38,
25,27,29,31,33,36,38,40,
27,29,31,33,36,38,40,42

INTER8X8_LUMA =
 9,13,15,17,19,21,22,24,
13,13,17,19,21,22,24,25,
15,17,19,21,22,24,25,27,
17,19,21,22,24,25,27,28,
19,21,22,24,25,27,28,30,
21,22,24,25,27,28,30,32,
22,24,25,27,28,30,32,33,
24,25,27,28,30,32,33,35

x264-snapshot-20120103-2245-stable/tools/digress/0000755000175000001440000000000011700673342020471 5ustar  videolanusersx264-snapshot-20120103-2245-stable/tools/digress/testing.py0000644000175000001440000004401211700673342022521 0ustar  videolanusers"""
Digress testing core.
"""

from digress.errors import SkippedTestError, DisabledTestError, NoSuchTestError, \
                           FailedTestError, AlreadyRunError, SCMError, \
                           ComparisonError
from digress.constants import *
from digress.cli import dispatchable

import inspect
import operator
import os
import json

import textwrap

from shutil import rmtree

from time import time
from functools import wraps

from itertools import izip_longest

from hashlib import sha1

class depends(object):
    """
    Dependency decorator for a test.
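
    Example (mirrors its use in tools/test_x264.py):

        @depends("configure")
        def test_make(self):
            ...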
    """
    def __init__(self, *test_names):
        self.test_names = test_names

    def __call__(self, func):
        func.digress_depends = self.test_names
        return func

class _skipped(object):
    """
    Internal skipped decorator.
    """
    def __init__(self, reason=""):
        self._reason = reason

    def __call__(self, func):
        @wraps(func)
        def _closure(*args):
            raise SkippedTestError(self._reason)
        return _closure

class disabled(object):
    """
    Disable a test, with reason.
    """
    def __init__(self, reason=""):
        self._reason = reason

    def __call__(self, func):
        @wraps(func)
        def _closure(*args):
            raise DisabledTestError(self._reason)
        return _closure

class comparer(object):
    """
    Set the comparer for a test.
    """
    def __init__(self, comparer_):
        self._comparer = comparer_

    def __call__(self, func):
        func.digress_comparer = self._comparer
        return func

class Fixture(object):
    cases = []
    scm = None

    flush_before = False

    def _skip_case(self, case, depend):
        for name, meth in inspect.getmembers(case):
            if name[:5] == "test_":
                setattr(
                    case,
                    name,
                    _skipped("failed dependency: case %s" % depend)(meth)
                )

    def _run_case(self, case, results):
        if case.__name__ in results:
            raise AlreadyRunError

        for depend in case.depends:
            if depend.__name__ in results and results[depend.__name__]["status"] != CASE_PASS:
                self._skip_case(case, depend.__name__)

            try:
                result = self._run_case(depend, results)
            except AlreadyRunError:
                continue

            if result["status"] != CASE_PASS:
                self._skip_case(case, depend.__name__)

        result = case().run()
        results[case.__name__] = result
        return result

    @dispatchable
    def flush(self, revision=None):
        """
        Flush any cached results. Takes a revision for an optional argument.
        """
        if not revision:
            print "Flushing all cached results...",

            try:
                rmtree(".digress_%s" % self.__class__.__name__)
            except Exception, e:
                print "failed: %s" % e
            else:
                print "done."
        else:
            try:
                rev = self.scm.rev_parse(revision)
            except SCMError, e:
                print e
            else:
                print "Flushing cached results for %s..." % rev,

                try:
                    rmtree(os.path.join(".digress_%s" % self.__class__.__name__, rev))
                except Exception, e:
                    print "failed: %s" % e
                else:
                    print "done."

    @dispatchable
    def run(self, revision=None):
        """
        Run the fixture for a specified revision.

        Takes a revision for an argument.
        """
        oldrev = None
        oldbranch = None
        dirty = False

        try:
            dirty = self.scm.dirty()

            # if the tree is clean, then we don't need to make an exception
            if not dirty and revision is None: revision = "HEAD"

            if revision:
                oldrev = self.scm.current_rev()
                oldbranch = self.scm.current_branch()

                if dirty:
                    self.scm.stash()
                self.scm.checkout(revision)

                rev = self.scm.current_rev()

                self.datastore = os.path.join(".digress_%s" % self.__class__.__name__, rev)

                if os.path.isdir(self.datastore):
                    if self.flush_before:
                        self.flush(rev)
                else:
                    os.makedirs(self.datastore)
            else:
                rev = "(dirty working tree)"
                self.datastore = None

            print "Running fixture %s on revision %s...\n" % (self.__class__.__name__, rev)

            results = {}

            for case in self.cases:
                try:
                    self._run_case(case, results)
                except AlreadyRunError:
                    continue

            total_time = reduce(operator.add, filter(
                None,
                [
                    result["time"] for result in results.values()
                ]
            ), 0)

            overall_status = (
                CASE_FAIL in [ result["status"] for result in results.values() ]
            ) and FIXTURE_FAIL or FIXTURE_PASS

            print "Fixture %s in %.4f.\n" % (
                (overall_status == FIXTURE_PASS) and "passed" or "failed",
                total_time
            )

            return { "cases" : results, "time" : total_time, "status" : overall_status, "revision" : rev }

        finally:
            if oldrev:
                self.scm.checkout(oldrev)
                if oldbranch:
                    self.scm.checkout(oldbranch)
                if dirty:
                    self.scm.unstash()

    @dispatchable
    def bisect(self, good_rev, bad_rev=None):
        """
        Perform a bisection between two revisions.

        First argument is the good revision, second is the bad revision, which
        defaults to the current revision.
        """
        if not bad_rev: bad_rev = self.scm.current_rev()

        dirty = False

        # get a set of results for the good revision
        good_result = self.run(good_rev)

        good_rev = good_result["revision"]

        try:
            dirty = self.scm.dirty()

            if dirty:
                self.scm.stash()

            self.scm.bisect("start")

            self.scm.bisect("bad", bad_rev)
            self.scm.bisect("good", good_rev)

            bisecting = True
            isbad = False

            while bisecting:
                results = self.run(self.scm.current_rev())
                revision = results["revision"]

                # perform comparisons
                # FIXME: this just uses a lot of self.compare
                for case_name, case_result in good_result["cases"].iteritems():
                    case = filter(lambda case: case.__name__ == case_name, self.cases)[0]

                    for test_name, test_result in case_result["tests"].iteritems():
                        test = filter(
                            lambda pair: pair[0] == "test_%s" % test_name,
                            inspect.getmembers(case)
                        )[0][1]

                        other_result = results["cases"][case_name]["tests"][test_name]

                        if other_result["status"] == TEST_FAIL and case_result["status"] != TEST_FAIL:
                            print "Revision %s failed %s.%s." % (revision, case_name, test_name)
                            isbad = True
                            break

                        elif hasattr(test, "digress_comparer"):
                            try:
                                test.digress_comparer(test_result["value"], other_result["value"])
                            except ComparisonError, e:
                                print "%s differs: %s" % (test_name, e)
                                isbad = True
                                break

                if isbad:
                    output = self.scm.bisect("bad", revision)
                    print "Marking revision %s as bad." % revision
                else:
                    output = self.scm.bisect("good", revision)
                    print "Marking revision %s as good." % revision

                if output.split("\n")[0].endswith("is the first bad commit"):
                    print "\nBisection complete.\n"
                    print output
                    bisecting = False

                print ""
        except SCMError, e:
            print e
        finally:
            self.scm.bisect("reset")

            if dirty:
                self.scm.unstash()

    @dispatchable
    def multicompare(self, rev_a=None, rev_b=None, mode="waterfall"):
        """
        Generate a comparison of tests.

        Takes three optional arguments, from which revision, to which revision,
        and the method of display (defaults to vertical "waterfall", also
        accepts "river" for horizontal display)
        """
        if not rev_a: rev_a = self.scm.current_rev()
        if not rev_b: rev_b = self.scm.current_rev()

        revisions = self.scm.revisions(rev_a, rev_b)

        results = []

        for revision in revisions:
            results.append(self.run(revision))

        test_names = reduce(operator.add, [
            [
                (case_name, test_name)
                for
                    test_name, test_result
                in
                    case_result["tests"].iteritems()
            ]
            for
                case_name, case_result
            in
                results[0]["cases"].iteritems()
        ], [])

        MAXLEN = 20

        colfmt = "| %s "

        table = []

        if mode not in ("waterfall", "river"):
            mode = "waterfall"

            print "Unknown multicompare mode specified, defaulting to %s." % mode

        if mode == "waterfall":
            header = [ "Test" ]

            for result in results:
                header.append(result["revision"])

            table.append(header)

            for test_name in test_names:
                row_data = [ ".".join(test_name) ]

                for result in results:
                    test_result = result["cases"][test_name[0]]["tests"][test_name[1]]

                    if test_result["status"] != TEST_PASS:
                        value = "did not pass: %s" % (test_result["value"])
                    else:
                        value = "%s (%.4f)" % (test_result["value"], test_result["time"])

                    row_data.append(value)

                table.append(row_data)

        elif mode == "river":
            header = [ "Revision" ]

            for test_name in test_names:
                header.append(".".join(test_name))

            table.append(header)

            for result in results:
                row_data = [ result["revision"] ]

                for case_name, case_result in result["cases"].iteritems():
                    for test_name, test_result in case_result["tests"].iteritems():

                        if test_result["status"] != TEST_PASS:
                            value = "did not pass: %s" % (test_result["value"])
                        else:
                            value = "%s (%.4f)" % (test_result["value"], test_result["time"])

                        row_data.append(value)

                table.append(row_data)

        breaker = "=" * (len(colfmt % "".center(MAXLEN)) * len(table[0]) + 1)

        print breaker

        for row in table:
            for row_stuff in izip_longest(*[
                textwrap.wrap(col, MAXLEN, break_on_hyphens=False) for col in row
            ], fillvalue=""):
                row_output = ""

                for col in row_stuff:
                    row_output += colfmt % col.ljust(MAXLEN)

                row_output += "|"

                print row_output
            print breaker

    @dispatchable
    def compare(self, rev_a, rev_b=None):
        """
        Compare two revisions directly.

        Takes two arguments, second is optional and implies current revision.
        """
        results_a = self.run(rev_a)
        results_b = self.run(rev_b)

        for case_name, case_result in results_a["cases"].iteritems():
            case = filter(lambda case: case.__name__ == case_name, self.cases)[0]

            header = "Comparison of case %s" % case_name
            print header
            print "=" * len(header)

            for test_name, test_result in case_result["tests"].iteritems():
                test = filter(
                    lambda pair: pair[0] == "test_%s" % test_name,
                    inspect.getmembers(case)
                )[0][1]

                other_result = results_b["cases"][case_name]["tests"][test_name]

                if test_result["status"] != TEST_PASS or other_result["status"] != TEST_PASS:
                    print "%s cannot be compared as one of the revisions have not passed it." % test_name

                elif hasattr(test, "digress_comparer"):
                    try:
                        test.digress_comparer(test_result["value"], other_result["value"])
                    except ComparisonError, e:
                        print "%s differs: %s" % (test_name, e)
                    else:
                        print "%s does not differ." % test_name
                else:
                    print "%s has no comparer and therefore cannot be compared." % test_name

            print ""

    @dispatchable
    def list(self):
        """
        List all available test cases, excluding dependencies.
        """
        print "\nAvailable Test Cases"
        print "===================="
        for case in self.cases:
            print case.__name__

    def register_case(self, case):
        case.fixture = self
        self.cases.append(case)

class Case(object):
    depends = []
    fixture = None

    def _get_test_by_name(self, test_name):
        if not hasattr(self, "test_%s" % test_name):
            raise NoSuchTestError(test_name)
        return getattr(self, "test_%s" % test_name)

    def _run_test(self, test, results):
        test_name = test.__name__[5:]

        if test_name in results:
            raise AlreadyRunError

        if hasattr(test, "digress_depends"):
            for depend in test.digress_depends:
                if depend in results and results[depend]["status"] != TEST_PASS:
                    test = _skipped("failed dependency: %s" % depend)(test)

                dependtest = self._get_test_by_name(depend)

                try:
                    result = self._run_test(dependtest, results)
                except AlreadyRunError:
                    continue

                if result["status"] != TEST_PASS:
                    test = _skipped("failed dependency: %s" % depend)(test)

        start_time = time()
        run_time = None

        print "Running test %s..." % test_name,

        try:
            if not self.datastore:
                # XXX: this smells funny
                raise IOError

            with open(os.path.join(
                self.datastore,
                "%s.json" % sha1(test_name).hexdigest()
            ), "r") as f:
                result = json.load(f)

            value = str(result["value"])

            if result["status"] == TEST_DISABLED:
                status = "disabled"
            elif result["status"] == TEST_SKIPPED:
                status = "skipped"
            elif result["status"] == TEST_FAIL:
                status = "failed"
            elif result["status"] == TEST_PASS:
                status = "passed"
                value = "%s (in %.4f)" % (
                    result["value"] or "(no result)",
                    result["time"]
                )
            else:
                status = "???"

            print "%s (cached): %s" % (status, value)
        except IOError:
            try:
                value = test()
            except DisabledTestError, e:
                print "disabled: %s" % e
                status = TEST_DISABLED
                value = str(e)
            except SkippedTestError, e:
                print "skipped: %s" % e
                status = TEST_SKIPPED
                value = str(e)
            except FailedTestError, e:
                print "failed: %s" % e
                status = TEST_FAIL
                value = str(e)
            except Exception, e:
                print "failed with exception: %s" % e
                status = TEST_FAIL
                value = str(e)
            else:
                run_time = time() - start_time
                print "passed: %s (in %.4f)" % (
                    value or "(no result)",
                    run_time
                )
                status = TEST_PASS

            result = { "status" : status, "value" : value, "time" : run_time }

            if self.datastore:
                with open(os.path.join(
                    self.datastore,
                    "%s.json" % sha1(test_name).hexdigest()
                ), "w") as f:
                    json.dump(result, f)

        results[test_name] = result
        return result

    def run(self):
        print "Running case %s..." % self.__class__.__name__

        if self.fixture.datastore:
            self.datastore = os.path.join(
                self.fixture.datastore,
                sha1(self.__class__.__name__).hexdigest()
            )
            if not os.path.isdir(self.datastore):
                os.makedirs(self.datastore)
        else:
            self.datastore = None

        results = {}

        for name, meth in inspect.getmembers(self):
            if name[:5] == "test_":
                try:
                    self._run_test(meth, results)
                except AlreadyRunError:
                    continue

        total_time = reduce(operator.add, filter(
            None, [
                result["time"] for result in results.values()
            ]
        ), 0)

        overall_status = (
            TEST_FAIL in [ result["status"] for result in results.values() ]
        ) and CASE_FAIL or CASE_PASS

        print "Case %s in %.4f.\n" % (
            (overall_status == FIXTURE_PASS) and "passed" or "failed",
            total_time
        )

        return { "tests" : results, "time" : total_time, "status" : overall_status }
x264-snapshot-20120103-2245-stable/tools/digress/scm/0000755000175000001440000000000011700673342021253 5ustar  videolanusersx264-snapshot-20120103-2245-stable/tools/digress/scm/git.py0000644000175000001440000000524711700673342022420 0ustar  videolanusers"""
Git SCM backend for Digress.
"""

from subprocess import Popen, PIPE, STDOUT
import re

from digress.errors import SCMError

GIT_BRANCH_EXPR = re.compile("[*] (.*)")

def checkout(revision):
    """
    Checkout a revision from git.
    """
    proc = Popen([
        "git",
        "checkout",
        "-f",
        revision
    ], stdout=PIPE, stderr=STDOUT)

    output = proc.communicate()[0].strip()
    if proc.returncode != 0:
        raise SCMError("checkout error: %s" % output)

def rev_parse(ref):
    proc = Popen([
        "git",
        "rev-parse",
        ref
    ], stdout=PIPE, stderr=STDOUT)

    output = proc.communicate()[0].strip()
    if proc.returncode != 0:
        raise SCMError("rev-parse error: %s" % output)
    return output

def current_rev():
    """
    Get the current revision.
    """
    return rev_parse("HEAD")

def current_branch():
    """
    Get the current branch.
    """
    proc = Popen([
        "git",
        "branch",
        "--no-color"
    ], stdout=PIPE, stderr=STDOUT)

    output = proc.communicate()[0].strip()
    if proc.returncode != 0:
        raise SCMError("branch error: %s" % output)
    branch_name = GIT_BRANCH_EXPR.findall(output)[0]
    return branch_name != "(no branch)" and branch_name or None

def revisions(rev_a, rev_b):
    """
    Get a list of revisions from one to another.
    """
    proc = Popen([
        "git",
        "log",
        "--format=%H", ("%s...%s" % (rev_a, rev_b))
    ], stdout=PIPE, stderr=STDOUT)

    output = proc.communicate()[0].strip()
    if proc.returncode != 0:
        raise SCMError("log error: %s" % output)
    return output.split("\n")

def stash():
    """
    Stash the repository.
    """
    proc = Popen([
        "git",
        "stash",
        "save",
        "--keep-index"
    ], stdout=PIPE, stderr=STDOUT)

    output = proc.communicate()[0].strip()
    if proc.returncode != 0:
        raise SCMError("stash error: %s" % output)

def unstash():
    """
    Unstash the repository.
    """
    proc = Popen(["git", "stash", "pop"], stdout=PIPE, stderr=STDOUT)
    proc.communicate()

def bisect(*args):
    """
    Perform a bisection.
    """
    proc = Popen((["git", "bisect"] + list(args)), stdout=PIPE, stderr=STDOUT)
    output = proc.communicate()[0]
    if proc.returncode != 0:
        raise SCMError("bisect error: %s" % output)
    return output

def dirty():
    """
    Check if the working tree is dirty.
    """
    proc = Popen(["git", "status"], stdout=PIPE, stderr=STDOUT)
    output = proc.communicate()[0].strip()
    if proc.returncode != 0:
        raise SCMError("status error: %s" % output)
    if "modified:" in output:
        return True
    else:
        return False
x264-snapshot-20120103-2245-stable/tools/digress/scm/dummy.py0000644000175000001440000000115411700673342022761 0ustar  videolanusers"""
Dummy SCM backend for Digress.
"""

from random import random

def checkout(revision):
    """
    Checkout a revision.
    """
    pass

def current_rev():
    """
    Get the current revision.
    """
    return str(random())

def revisions(rev_a, rev_b):
    """
    Get a list of revisions from one to another.
    """
    pass

def stash():
    """
    Stash the repository.
    """
    pass

def unstash():
    """
    Unstash the repository.
    """
    pass

def bisect(command, revision):
    """
    Perform a bisection.
    """
    raise NotImplementedError("dummy SCM backend does not support bisection")
x264-snapshot-20120103-2245-stable/tools/digress/scm/__init__.py0000644000175000001440000000005511700673342023364 0ustar  videolanusers"""
Source control backends for Digress.
"""
x264-snapshot-20120103-2245-stable/tools/digress/errors.py0000644000175000001440000000205111700673342022355 0ustar  videolanusers"""
Digress errors.
"""

class DigressError(Exception):
    """
    Digress error base class.
    """

class NoSuchTestError(DigressError):
    """
    Raised when no such test exists.
    """

class DisabledTestError(DigressError):
    """
    Test is disabled.
    """

class SkippedTestError(DigressError):
    """
    Test is marked as skipped.
    """

class DisabledCaseError(DigressError):
    """
    Case is marked as disabled.
    """

class SkippedCaseError(DigressError):
    """
    Case is marked as skipped.
    """

class FailedTestError(DigressError):
    """
    Test failed.
    """

class ComparisonError(DigressError):
    """
    Comparison failed.
    """

class IncomparableError(DigressError):
    """
    Values cannot be compared.
    """

class AlreadyRunError(DigressError):
    """
    Test/case has already been run.
    """

class SCMError(DigressError):
    """
    Error occurred in SCM.
    """
    def __init__(self, message):
        self.message = message.replace("\n", " ")

    def __str__(self):
        return self.message
x264-snapshot-20120103-2245-stable/tools/digress/constants.py0000644000175000001440000000024411700673342023057 0ustar  videolanusers"""
All of Digress's constants.
"""

TEST_PASS = 0
TEST_FAIL = 1
TEST_DISABLED = 2
TEST_SKIPPED = 3

CASE_PASS = 0
CASE_FAIL = 1

FIXTURE_PASS = 0
FIXTURE_FAIL = 1
x264-snapshot-20120103-2245-stable/tools/digress/comparers.py0000644000175000001440000000351211700673342023037 0ustar  videolanusers"""
Digress comparers.
"""

from digress.errors import ComparisonError

import os
from itertools import imap, izip

def compare_direct(value_a, value_b):
    if value_a != value_b:
        raise ComparisonError("%s is not %s" % (value_a, value_b))

def compare_pass(value_a, value_b):
    """
    Always succeeds: any two values are considered equal.
    """

def compare_tolerance(tolerance):
    def _compare_tolerance(value_a, value_b):
        if abs(value_a - value_b) > tolerance:
            raise ComparisonError("%s is not %s (tolerance: %s)" % (
                value_a,
                value_b,
                tolerance
            ))
    return _compare_tolerance
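# usage example: compare_tolerance(0.5)(1.0, 1.4) passes silently,
# while compare_tolerance(0.5)(1.0, 1.6) raises ComparisonError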

def compare_files(file_a, file_b):
    size_a = os.path.getsize(file_a)
    size_b = os.path.getsize(file_b)

    print file_a, file_b

    if size_a != size_b:
        raise ComparisonError("%s is not the same size as %s" % (
            file_a,
            file_b
        ))

    BUFFER_SIZE = 8192

    offset = 0
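    # read both files in BUFFER_SIZE chunks; when a chunk differs, scan it
    # byte-by-byte so the error reports the exact differing offset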

    with open(file_a) as f_a:
        with open(file_b) as f_b:
            for chunk_a, chunk_b in izip(
                imap(
                    lambda i: f_a.read(BUFFER_SIZE),
                    xrange(size_a // BUFFER_SIZE + 1)
                ),
                imap(
                    lambda i: f_b.read(BUFFER_SIZE),
                    xrange(size_b // BUFFER_SIZE + 1)
                )
            ):
                chunk_size = len(chunk_a)

                if chunk_a != chunk_b:
                    for i in xrange(chunk_size):
                        if chunk_a[i] != chunk_b[i]:
                            raise ComparisonError("%s differs from %s at offset %d" % (
                                file_a,
                                file_b,
                                offset + i
                            ))

                offset += chunk_size
x264-snapshot-20120103-2245-stable/tools/digress/cli.py0000644000175000001440000001047511700673342021621 0ustar  videolanusers"""
Digress's CLI interface.
"""

import inspect
import sys
from optparse import OptionParser

import textwrap

from types import MethodType

from digress import __version__ as version

def dispatchable(func):
    """
    Mark a method as dispatchable.
    """
    func.digress_dispatchable = True
    return func
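# typical usage: decorate fixture methods with @dispatchable so that
# Dispatcher._get_commands() picks them up as CLI commands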

class Dispatcher(object):
    """
    Dispatcher for CLI commands.
    """
    def __init__(self, fixture):
        self.fixture = fixture
        fixture.dispatcher = self

    def _monkey_print_help(self, optparse, *args, **kwargs):
        # monkey-patches OptionParser.print_help to append the command list
        OptionParser.print_help(optparse, *args, **kwargs)

        print >>sys.stderr, "\nAvailable commands:"

        maxlen = max([ len(command_name) for command_name in self.commands ])

        descwidth = 80 - maxlen - 4

        for command_name, command_meth in self.commands.iteritems():
            print >>sys.stderr, "  %s %s\n" % (
                command_name.ljust(maxlen + 1),
                ("\n" + (maxlen + 4) * " ").join(
                    textwrap.wrap(" ".join(filter(
                            None,
                            command_meth.__doc__.strip().replace("\n", " ").split(" ")
                        )),
                        descwidth
                    )
                )
            )

    def _enable_flush(self):
        self.fixture.flush_before = True

    def _populate_parser(self):
        self.commands = self._get_commands()

        self.optparse = OptionParser(
            usage = "usage: %prog [options] command [args]",
            description = "Digress CLI frontend for %s." % self.fixture.__class__.__name__,
            version = "Digress %s" % version
        )

        self.optparse.print_help = MethodType(self._monkey_print_help, self.optparse, OptionParser)

        self.optparse.add_option(
            "-f",
            "--flush",
            action="callback",
            callback=lambda option, opt, value, parser: self._enable_flush(),
            help="flush existing data for a revision before testing"
        )

        self.optparse.add_option(
            "-c",
            "--cases",
            metavar="FOO,BAR",
            action="callback",
            dest="cases",
            type=str,
            callback=lambda option, opt, value, parser: self._select_cases(*value.split(",")),
            help="test cases to run, run with command list to see full list"
        )

    def _select_cases(self, *cases):
        self.fixture.cases = filter(lambda case: case.__name__ in cases, self.fixture.cases)

    def _get_commands(self):
        commands = {}

        for name, member in inspect.getmembers(self.fixture):
            if hasattr(member, "digress_dispatchable"):
                commands[name] = member

        return commands

    def _run_command(self, name, *args):
        if name not in self.commands:
            print >>sys.stderr, "error: %s is not a valid command\n" % name
            self.optparse.print_help()
            return

        command = self.commands[name]

        argspec = inspect.getargspec(command)

        max_arg_len = len(argspec.args) - 1
        min_arg_len = max_arg_len - ((argspec.defaults is not None) and len(argspec.defaults) or 0)
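        # positional parameters minus self give the maximum argument count;
        # parameters without defaults set the minimum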

        if len(args) < min_arg_len:
            print >>sys.stderr, "error: %s takes at least %d arguments\n" % (
                name,
                min_arg_len
            )
            print >>sys.stderr, "%s\n" % command.__doc__
            self.optparse.print_help()
            return

        if len(args) > max_arg_len:
            print >>sys.stderr, "error: %s takes at most %d arguments\n" % (
                name,
                max_arg_len
            )
            print >>sys.stderr, "%s\n" % command.__doc__
            self.optparse.print_help()
            return

        command(*args)

    def pre_dispatch(self):
        pass

    def dispatch(self):
        self._populate_parser()

        self.optparse.parse_args()
        self.pre_dispatch()
        args = self.optparse.parse_args()[1] # arguments may require reparsing after pre_dispatch; see test_x264.py

        if len(args) == 0:
            print >>sys.stderr, "error: no comamnd specified\n"
            self.optparse.print_help()
            return

        command = args[0]
        addenda = args[1:]

        self._run_command(command, *addenda)
x264-snapshot-20120103-2245-stable/tools/digress/__init__.py0000644000175000001440000000033411700673342022602 0ustar  videolanusers"""
Automated regression/unit testing suite.
"""

__version__ = '0.2'

def digress(fixture):
    """
    Command-line helper for Digress.
    """
    from digress.cli import Dispatcher
    Dispatcher(fixture).dispatch()
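    # typical usage (assumed from the dispatcher above): a test script
    # constructs its fixture and calls digress(fixture) as its entry point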
x264-snapshot-20120103-2245-stable/tools/countquant_x264.pl0000755000175000001440000000252411700673342022360 0ustar  videolanusers#!/bin/env perl
# countquant_x264.pl: displays statistics from x264 multipass logfiles
# by Loren Merritt, 2005-4-5

@size{I,P,B} = @n{I,P,B} = (0)x3;

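# proc_file tallies per-type frame counts, QPs and sizes; it expects stats
# lines containing e.g. "type:I q:20.00 tex:12345 mv:678 misc:90", where
# tex/mv/misc are bit counts (converted to bytes below)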
sub proc_file {
    my $fh = shift;
    while(<$fh>) {
        /type:(.) q:(\d+\.\d+) tex:(\d+) mv:(\d+) misc:(\d+)/ or next;
        $type = uc $1;
        $n{$type} ++;
        $q[int($2+.5)] ++;
        $avgq += $2;
        $avgq{$type} += $2;
        my $bytes = ($3+$4+$5)/8;
        $size{$type} += $bytes;
    }
    $size = $size{I} + $size{P} + $size{B};
    $n = $n{I} + $n{P} + $n{B};
    $n or die "unrecognized input\n";
}

if(@ARGV) {
    foreach(@ARGV) {
        open $fh, "<", $_ or die "can't open '$_': $!";
        proc_file($fh);
    }
} else {
    proc_file(STDIN);
}

for(0..51) {
    $q[$_] or next;
    printf "q%2d: %6d  %4.1f%%\n", $_, $q[$_], 100*$q[$_]/$n;
}
print "\n";
$digits = int(log($n+1)/log(10))+2;
printf "All: %${digits}d        %s  avgQP:%5.2f  avgBytes:%5d\n",
    $n, $n==$n{I}?" ":"", $avgq/$n, $size/$n;
foreach(qw(I P B S)) {
    $n{$_} or next;
    printf "%s:   %${digits}d (%4.1f%%)  avgQP:%5.2f  avgBytes:%5d\n",
        $_, $n{$_}, 100*$n{$_}/$n, $avgq{$_}/$n{$_}, $size{$_}/$n{$_};
}
print "\n";
printf "total size: $size B = %.2f KiB = %.2f MiB\n",
    $size/2**10, $size/2**20;
print "bitrate: ", join("\n       = ",
    map sprintf("%.2f kbps @ %s fps", $_*$size*8/1000/$n, $_),
    23.976, 25, 29.97), "\n";
x264-snapshot-20120103-2245-stable/tools/checkasm.c0000644000175000001440000027455411700673342020774 0ustar  videolanusers/*****************************************************************************
 * checkasm.c: assembly check tool
 *****************************************************************************
 * Copyright (C) 2003-2011 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *          Laurent Aimar <fenrir@via.ecp.fr>
 *          Jason Garrett-Glaser <darkshikari@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include <ctype.h>
#include "common/common.h"
#include "common/cpu.h"

// GCC doesn't align stack variables on ARM, so use .bss
#if ARCH_ARM
#undef ALIGNED_16
#define ALIGNED_16( var ) DECLARE_ALIGNED( static var, 16 )
#endif

/* buf1, buf2: initialised to random data; functions under test shouldn't write into them */
uint8_t *buf1, *buf2;
/* buf3, buf4: used to store output */
uint8_t *buf3, *buf4;
/* pbuf1, pbuf2: initialised to random pixel data; functions under test shouldn't write into them. */
pixel *pbuf1, *pbuf2;
/* pbuf3, pbuf4: point to buf3, buf4, just for type convenience */
pixel *pbuf3, *pbuf4;

int quiet = 0;

#define report( name ) { \
    if( used_asm && !quiet ) \
        fprintf( stderr, " - %-21s [%s]\n", name, ok ? "OK" : "FAILED" ); \
    if( !ok ) ret = -1; \
}

#define BENCH_RUNS 100  // tradeoff between accuracy and speed
#define BENCH_ALIGNS 16 // number of stack+heap data alignments (another accuracy vs speed tradeoff)
#define MAX_FUNCS 1000  // just has to be big enough to hold all the existing functions
#define MAX_CPUS 30     // number of different combinations of cpu flags

typedef struct
{
    void *pointer; // just for detecting duplicates
    uint32_t cpu;
    uint32_t cycles;
    uint32_t den;
} bench_t;

typedef struct
{
    char *name;
    bench_t vers[MAX_CPUS];
} bench_func_t;

int do_bench = 0;
int bench_pattern_len = 0;
const char *bench_pattern = "";
char func_name[100];
static bench_func_t benchs[MAX_FUNCS];

static const char *pixel_names[12] = { "16x16", "16x8", "8x16", "8x8", "8x4", "4x8", "4x4", "4x16", "4x2", "2x8", "2x4", "2x2" };
static const char *intra_predict_16x16_names[7] = { "v", "h", "dc", "p", "dcl", "dct", "dc8" };
static const char *intra_predict_8x8c_names[7] = { "dc", "h", "v", "p", "dcl", "dct", "dc8" };
static const char *intra_predict_4x4_names[12] = { "v", "h", "dc", "ddl", "ddr", "vr", "hd", "vl", "hu", "dcl", "dct", "dc8" };
static const char **intra_predict_8x8_names = intra_predict_4x4_names;
static const char **intra_predict_8x16c_names = intra_predict_8x8c_names;

#define set_func_name(...) snprintf( func_name, sizeof(func_name), __VA_ARGS__ )

static inline uint32_t read_time(void)
{
    uint32_t a = 0;
#if HAVE_X86_INLINE_ASM
    asm volatile( "rdtsc" :"=a"(a) ::"edx" );
#elif ARCH_PPC
    asm volatile( "mftb %0" : "=r" (a) );
#elif ARCH_ARM     // ARMv7 only
    asm volatile( "mrc p15, 0, %0, c9, c13, 0" : "=r"(a) );
#endif
    return a;
}

static bench_t* get_bench( const char *name, int cpu )
{
    int i, j;
    for( i = 0; benchs[i].name && strcmp(name, benchs[i].name); i++ )
        assert( i < MAX_FUNCS );
    if( !benchs[i].name )
        benchs[i].name = strdup( name );
    if( !cpu )
        return &benchs[i].vers[0];
    for( j = 1; benchs[i].vers[j].cpu && benchs[i].vers[j].cpu != cpu; j++ )
        assert( j < MAX_CPUS );
    benchs[i].vers[j].cpu = cpu;
    return &benchs[i].vers[j];
}

static int cmp_nop( const void *a, const void *b )
{
    return *(uint16_t*)a - *(uint16_t*)b;
}

static int cmp_bench( const void *a, const void *b )
{
    // asciibetical sort except preserving numbers
    const char *sa = ((bench_func_t*)a)->name;
    const char *sb = ((bench_func_t*)b)->name;
    for( ;; sa++, sb++ )
    {
        if( !*sa && !*sb )
            return 0;
        if( isdigit( *sa ) && isdigit( *sb ) && isdigit( sa[1] ) != isdigit( sb[1] ) )
            return isdigit( sa[1] ) - isdigit( sb[1] );
        if( *sa != *sb )
            return *sa - *sb;
    }
}

static void print_bench(void)
{
    uint16_t nops[10000] = {0};
    int nfuncs, nop_time=0;

    for( int i = 0; i < 10000; i++ )
    {
        int t = read_time();
        nops[i] = read_time() - t;
    }
    qsort( nops, 10000, sizeof(uint16_t), cmp_nop );
    for( int i = 500; i < 9500; i++ )
        nop_time += nops[i];
    nop_time /= 900;
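    /* summing the middle 9000 of 10000 samples and dividing by 900 leaves
     * nop_time scaled to 10x the average empty-timing overhead, matching the
     * 10*cycles/den scaling used when printing per-call times below */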
    printf( "nop: %d\n", nop_time );

    for( nfuncs = 0; nfuncs < MAX_FUNCS && benchs[nfuncs].name; nfuncs++ );
    qsort( benchs, nfuncs, sizeof(bench_func_t), cmp_bench );
    for( int i = 0; i < nfuncs; i++ )
        for( int j = 0; j < MAX_CPUS && (!j || benchs[i].vers[j].cpu); j++ )
        {
            int k;
            bench_t *b = &benchs[i].vers[j];
            if( !b->den )
                continue;
            for( k = 0; k < j && benchs[i].vers[k].pointer != b->pointer; k++ );
            if( k < j )
                continue;
            printf( "%s_%s%s: %"PRId64"\n", benchs[i].name,
                    b->cpu&X264_CPU_FMA4 ? "fma4" :
                    b->cpu&X264_CPU_XOP ? "xop" :
                    b->cpu&X264_CPU_AVX ? "avx" :
                    b->cpu&X264_CPU_SSE4 ? "sse4" :
                    b->cpu&X264_CPU_SSSE3 ? "ssse3" :
                    b->cpu&X264_CPU_SSE3 ? "sse3" :
                    /* print sse2slow only if there's also a sse2fast version of the same func */
                    b->cpu&X264_CPU_SSE2_IS_SLOW && j<MAX_CPUS-1 && b[1].cpu&X264_CPU_SSE2_IS_FAST && !(b[1].cpu&X264_CPU_SSE3) ? "sse2slow" :
                    b->cpu&X264_CPU_SSE2 ? "sse2" :
                    b->cpu&X264_CPU_MMX ? "mmx" :
                    b->cpu&X264_CPU_ALTIVEC ? "altivec" :
                    b->cpu&X264_CPU_NEON ? "neon" :
                    b->cpu&X264_CPU_ARMV6 ? "armv6" : "c",
                    b->cpu&X264_CPU_CACHELINE_32 ? "_c32" :
                    b->cpu&X264_CPU_CACHELINE_64 ? "_c64" :
                    b->cpu&X264_CPU_SHUFFLE_IS_FAST && !(b->cpu&X264_CPU_SSE4) ? "_fastshuffle" :
                    b->cpu&X264_CPU_SSE_MISALIGN ? "_misalign" :
                    b->cpu&X264_CPU_LZCNT ? "_lzcnt" :
                    b->cpu&X264_CPU_FAST_NEON_MRC ? "_fast_mrc" :
                    b->cpu&X264_CPU_SLOW_CTZ ? "_slow_ctz" :
                    b->cpu&X264_CPU_SLOW_ATOM ? "_slow_atom" : "",
                    ((int64_t)10*b->cycles/b->den - nop_time)/4 );
        }
}

#if ARCH_X86 || ARCH_X86_64
int x264_stack_pagealign( int (*func)(), int align );
#else
#define x264_stack_pagealign( func, align ) func()
#endif

#define call_c1(func,...) func(__VA_ARGS__)

#if ARCH_X86 || defined(_WIN64)
/* detect when callee-saved regs aren't saved.
 * needs an explicit asm check because it only sometimes crashes in normal use. */
intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... );
#define call_a1(func,...) x264_checkasm_call((intptr_t(*)())func, &ok, __VA_ARGS__)
#else
#define call_a1 call_c1
#endif

#define call_bench(func,cpu,...)\
    if( do_bench && !strncmp(func_name, bench_pattern, bench_pattern_len) )\
    {\
        uint32_t tsum = 0;\
        int tcount = 0;\
        call_a1(func, __VA_ARGS__);\
        for( int ti = 0; ti < (cpu?BENCH_RUNS:BENCH_RUNS/4); ti++ )\
        {\
            uint32_t t = read_time();\
            func(__VA_ARGS__);\
            func(__VA_ARGS__);\
            func(__VA_ARGS__);\
            func(__VA_ARGS__);\
            t = read_time() - t;\
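            /* keep a sample only after warmup (ti > 0) and only if it lies \
               within 4x the running average, rejecting interrupt-inflated \
               outliers */\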
            if( t*tcount <= tsum*4 && ti > 0 )\
            {\
                tsum += t;\
                tcount++;\
            }\
        }\
        bench_t *b = get_bench( func_name, cpu );\
        b->cycles += tsum;\
        b->den += tcount;\
        b->pointer = func;\
    }

/* for most functions, run benchmark and correctness test at the same time.
 * for those that modify their inputs, run the above macros separately */
#define call_a(func,...) ({ call_a2(func,__VA_ARGS__); call_a1(func,__VA_ARGS__); })
#define call_c(func,...) ({ call_c2(func,__VA_ARGS__); call_c1(func,__VA_ARGS__); })
#define call_a2(func,...) ({ call_bench(func,cpu_new,__VA_ARGS__); })
#define call_c2(func,...) ({ call_bench(func,0,__VA_ARGS__); })


static int check_pixel( int cpu_ref, int cpu_new )
{
    x264_pixel_function_t pixel_c;
    x264_pixel_function_t pixel_ref;
    x264_pixel_function_t pixel_asm;
    x264_predict_t predict_4x4[12];
    x264_predict8x8_t predict_8x8[12];
    x264_predict_8x8_filter_t predict_8x8_filter;
    ALIGNED_16( pixel edge[36] );
    uint16_t cost_mv[32];
    int ret = 0, ok, used_asm;

    x264_pixel_init( 0, &pixel_c );
    x264_pixel_init( cpu_ref, &pixel_ref );
    x264_pixel_init( cpu_new, &pixel_asm );
    x264_predict_4x4_init( 0, predict_4x4 );
    x264_predict_8x8_init( 0, predict_8x8, &predict_8x8_filter );
    predict_8x8_filter( pbuf2+40, edge, ALL_NEIGHBORS, ALL_NEIGHBORS );

    // maximize sum
    for( int i = 0; i < 256; i++ )
    {
        int z = i|(i>>4);
        z ^= z>>2;
        z ^= z>>1;
        pbuf4[i] = -(z&1) & PIXEL_MAX;
        pbuf3[i] = ~pbuf4[i] & PIXEL_MAX;
    }
    // random pattern made of maxed pixel differences, in case an intermediate value overflows
    for( int i = 256; i < 0x1000; i++ )
    {
        pbuf4[i] = -(pbuf1[i&~0x88]&1) & PIXEL_MAX;
        pbuf3[i] = ~(pbuf4[i]) & PIXEL_MAX;
    }

#define TEST_PIXEL( name, align ) \
    ok = 1, used_asm = 0; \
    for( int i = 0; i < 8; i++ ) \
    { \
        int res_c, res_asm; \
        if( pixel_asm.name[i] != pixel_ref.name[i] ) \
        { \
            set_func_name( "%s_%s", #name, pixel_names[i] ); \
            used_asm = 1; \
            for( int j = 0; j < 64; j++ ) \
            { \
                res_c   = call_c( pixel_c.name[i], pbuf1, 16, pbuf2+j*!align, 64 ); \
                res_asm = call_a( pixel_asm.name[i], pbuf1, 16, pbuf2+j*!align, 64 ); \
                if( res_c != res_asm ) \
                { \
                    ok = 0; \
                    fprintf( stderr, #name "[%d]: %d != %d [FAILED]\n", i, res_c, res_asm ); \
                    break; \
                } \
            } \
            for( int j = 0; j < 0x1000 && ok; j += 256 ) \
            { \
                res_c   = pixel_c  .name[i]( pbuf3+j, 16, pbuf4+j, 16 ); \
                res_asm = pixel_asm.name[i]( pbuf3+j, 16, pbuf4+j, 16 ); \
                if( res_c != res_asm ) \
                { \
                    ok = 0; \
                    fprintf( stderr, #name "[%d]: overflow %d != %d\n", i, res_c, res_asm ); \
                } \
            } \
        } \
    } \
    report( "pixel " #name " :" );

    TEST_PIXEL( sad, 0 );
    TEST_PIXEL( sad_aligned, 1 );
    TEST_PIXEL( ssd, 1 );
    TEST_PIXEL( satd, 0 );
    TEST_PIXEL( sa8d, 1 );

#define TEST_PIXEL_X( N ) \
    ok = 1; used_asm = 0; \
    for( int i = 0; i < 7; i++ ) \
    { \
        int res_c[4]={0}, res_asm[4]={0}; \
        if( pixel_asm.sad_x##N[i] && pixel_asm.sad_x##N[i] != pixel_ref.sad_x##N[i] ) \
        { \
            set_func_name( "sad_x%d_%s", N, pixel_names[i] ); \
            used_asm = 1; \
            for( int j = 0; j < 64; j++ ) \
            { \
                pixel *pix2 = pbuf2+j; \
                res_c[0] = pixel_c.sad[i]( pbuf1, 16, pix2, 64 ); \
                res_c[1] = pixel_c.sad[i]( pbuf1, 16, pix2+6, 64 ); \
                res_c[2] = pixel_c.sad[i]( pbuf1, 16, pix2+1, 64 ); \
                if( N == 4 ) \
                { \
                    res_c[3] = pixel_c.sad[i]( pbuf1, 16, pix2+10, 64 ); \
                    call_a( pixel_asm.sad_x4[i], pbuf1, pix2, pix2+6, pix2+1, pix2+10, 64, res_asm ); \
                } \
                else \
                    call_a( pixel_asm.sad_x3[i], pbuf1, pix2, pix2+6, pix2+1, 64, res_asm ); \
                if( memcmp(res_c, res_asm, sizeof(res_c)) ) \
                { \
                    ok = 0; \
                    fprintf( stderr, "sad_x"#N"[%d]: %d,%d,%d,%d != %d,%d,%d,%d [FAILED]\n", \
                             i, res_c[0], res_c[1], res_c[2], res_c[3], \
                             res_asm[0], res_asm[1], res_asm[2], res_asm[3] ); \
                } \
                if( N == 4 ) \
                    call_c2( pixel_c.sad_x4[i], pbuf1, pix2, pix2+6, pix2+1, pix2+10, 64, res_asm ); \
                else \
                    call_c2( pixel_c.sad_x3[i], pbuf1, pix2, pix2+6, pix2+1, 64, res_asm ); \
            } \
        } \
    } \
    report( "pixel sad_x"#N" :" );

    TEST_PIXEL_X(3);
    TEST_PIXEL_X(4);

#define TEST_PIXEL_VAR( i ) \
    if( pixel_asm.var[i] != pixel_ref.var[i] ) \
    { \
        set_func_name( "%s_%s", "var", pixel_names[i] ); \
        used_asm = 1; \
        /* abi-check wrapper can't return uint64_t, so separate it from return value check */ \
        call_c1( pixel_c.var[i], pbuf1, 16 ); \
        call_a1( pixel_asm.var[i], pbuf1, 16 ); \
        uint64_t res_c   = pixel_c.var[i]( pbuf1, 16 ); \
        uint64_t res_asm = pixel_asm.var[i]( pbuf1, 16 ); \
        if( res_c != res_asm ) \
        { \
            ok = 0; \
            fprintf( stderr, "var[%d]: %d %d != %d %d [FAILED]\n", i, (int)res_c, (int)(res_c>>32), (int)res_asm, (int)(res_asm>>32) ); \
        } \
        call_c2( pixel_c.var[i], pbuf1, 16 ); \
        call_a2( pixel_asm.var[i], pbuf1, 16 ); \
    }

    ok = 1; used_asm = 0;
    TEST_PIXEL_VAR( PIXEL_16x16 );
    TEST_PIXEL_VAR( PIXEL_8x16 );
    TEST_PIXEL_VAR( PIXEL_8x8 );
    report( "pixel var :" );

#define TEST_PIXEL_VAR2( i ) \
    if( pixel_asm.var2[i] != pixel_ref.var2[i] ) \
    { \
        int res_c, res_asm, ssd_c, ssd_asm; \
        set_func_name( "%s_%s", "var2", pixel_names[i] ); \
        used_asm = 1; \
        res_c   = call_c( pixel_c.var2[i], pbuf1, 16, pbuf2, 16, &ssd_c ); \
        res_asm = call_a( pixel_asm.var2[i], pbuf1, 16, pbuf2, 16, &ssd_asm ); \
        if( res_c != res_asm || ssd_c != ssd_asm ) \
        { \
            ok = 0; \
            fprintf( stderr, "var2[%d]: %d != %d or %d != %d [FAILED]\n", i, res_c, res_asm, ssd_c, ssd_asm ); \
        } \
    }

    ok = 1; used_asm = 0;
    TEST_PIXEL_VAR2( PIXEL_8x16 );
    TEST_PIXEL_VAR2( PIXEL_8x8 );
    report( "pixel var2 :" );

    ok = 1; used_asm = 0;
    for( int i = 0; i < 4; i++ )
        if( pixel_asm.hadamard_ac[i] != pixel_ref.hadamard_ac[i] )
        {
            set_func_name( "hadamard_ac_%s", pixel_names[i] );
            used_asm = 1;
            for( int j = 0; j < 32; j++ )
            {
                pixel *pix = (j&16 ? pbuf1 : pbuf3) + (j&15)*256;
                call_c1( pixel_c.hadamard_ac[i], pbuf1, 16 );
                call_a1( pixel_asm.hadamard_ac[i], pbuf1, 16 );
                uint64_t rc = pixel_c.hadamard_ac[i]( pix, 16 );
                uint64_t ra = pixel_asm.hadamard_ac[i]( pix, 16 );
                if( rc != ra )
                {
                    ok = 0;
                    fprintf( stderr, "hadamard_ac[%d]: %d,%d != %d,%d\n", i, (int)rc, (int)(rc>>32), (int)ra, (int)(ra>>32) );
                    break;
                }
            }
            call_c2( pixel_c.hadamard_ac[i], pbuf1, 16 );
            call_a2( pixel_asm.hadamard_ac[i], pbuf1, 16 );
        }
    report( "pixel hadamard_ac :" );

    ok = 1; used_asm = 0;
    if( pixel_asm.vsad != pixel_ref.vsad )
    {
        for( int h = 2; h <= 32; h += 2 )
        {
            int res_c, res_asm;
            set_func_name( "vsad" );
            used_asm = 1;
            res_c   = call_c( pixel_c.vsad,   pbuf1, 16, h );
            res_asm = call_a( pixel_asm.vsad, pbuf1, 16, h );
            if( res_c != res_asm )
            {
                ok = 0;
                fprintf( stderr, "vsad: height=%d, %d != %d\n", h, res_c, res_asm );
                break;
            }
        }
    }
    report( "pixel vsad :" );

#define TEST_INTRA_X3( name, i8x8, ... ) \
    if( pixel_asm.name && pixel_asm.name != pixel_ref.name ) \
    { \
        int res_c[3], res_asm[3]; \
        set_func_name( #name ); \
        used_asm = 1; \
        call_c( pixel_c.name, pbuf1+48, i8x8 ? edge : pbuf3+48, res_c ); \
        call_a( pixel_asm.name, pbuf1+48, i8x8 ? edge : pbuf3+48, res_asm ); \
        if( memcmp(res_c, res_asm, sizeof(res_c)) ) \
        { \
            ok = 0; \
            fprintf( stderr, #name": %d,%d,%d != %d,%d,%d [FAILED]\n", \
                     res_c[0], res_c[1], res_c[2], \
                     res_asm[0], res_asm[1], res_asm[2] ); \
        } \
    }

#define TEST_INTRA_X9( name, cmp ) \
    if( pixel_asm.name && pixel_asm.name != pixel_ref.name ) \
    { \
        set_func_name( #name ); \
        used_asm = 1; \
        ALIGNED_ARRAY_64( uint16_t, bitcosts,[17] ); \
        for( int i=0; i<17; i++ ) \
            bitcosts[i] = 9*(i!=8); \
        memcpy( pbuf3, pbuf2, 20*FDEC_STRIDE*sizeof(pixel) ); \
        memcpy( pbuf4, pbuf2, 20*FDEC_STRIDE*sizeof(pixel) ); \
        for( int i=0; i<32; i++ ) \
        { \
            pixel *fenc = pbuf1+48+i*12; \
            pixel *fdec1 = pbuf3+48+i*12; \
            pixel *fdec2 = pbuf4+48+i*12; \
            int pred_mode = i%9; \
            int res_c = INT_MAX; \
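            /* res_c packs the winning mode in the high 16 bits and its cost in the low 16 */ \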
            for( int j=0; j<9; j++ ) \
            { \
                predict_4x4[j]( fdec1 ); \
                int cost = pixel_c.cmp[PIXEL_4x4]( fenc, FENC_STRIDE, fdec1, FDEC_STRIDE ) + 9*(j!=pred_mode); \
                if( cost < (uint16_t)res_c ) \
                    res_c = cost + (j<<16); \
            } \
            predict_4x4[res_c>>16]( fdec1 ); \
            int res_a = call_a( pixel_asm.name, fenc, fdec2, bitcosts+8-pred_mode ); \
            if( res_c != res_a ) \
            { \
                ok = 0; \
                fprintf( stderr, #name": %d,%d != %d,%d [FAILED]\n", res_c>>16, res_c&0xffff, res_a>>16, res_a&0xffff ); \
                break; \
            } \
            if( memcmp(fdec1, fdec2, 4*FDEC_STRIDE*sizeof(pixel)) ) \
            { \
                ok = 0; \
                fprintf( stderr, #name" [FAILED]\n" ); \
                for( int j=0; j<16; j++ ) \
                    fprintf( stderr, "%02x ", fdec1[(j&3)+(j>>2)*FDEC_STRIDE] ); \
                fprintf( stderr, "\n" ); \
                for( int j=0; j<16; j++ ) \
                    fprintf( stderr, "%02x ", fdec2[(j&3)+(j>>2)*FDEC_STRIDE] ); \
                fprintf( stderr, "\n" ); \
                break; \
            } \
        } \
    }

#define TEST_INTRA8_X9( name, cmp ) \
    if( pixel_asm.name && pixel_asm.name != pixel_ref.name ) \
    { \
        set_func_name( #name ); \
        used_asm = 1; \
        ALIGNED_ARRAY_64( uint16_t, bitcosts,[17] ); \
        ALIGNED_ARRAY_16( uint16_t, satds_c,[16] ); \
        ALIGNED_ARRAY_16( uint16_t, satds_a,[16] ); \
        memset( satds_c, 0, 16 * sizeof(*satds_c) ); \
        memset( satds_a, 0, 16 * sizeof(*satds_a) ); \
        for( int i=0; i<17; i++ ) \
            bitcosts[i] = 9*(i!=8); \
        for( int i=0; i<32; i++ ) \
        { \
            pixel *fenc = pbuf1+48+i*12; \
            pixel *fdec1 = pbuf3+48+i*12; \
            pixel *fdec2 = pbuf4+48+i*12; \
            int pred_mode = i%9; \
            int res_c = INT_MAX; \
            predict_8x8_filter( fdec1, edge, ALL_NEIGHBORS, ALL_NEIGHBORS ); \
            for( int j=0; j<9; j++ ) \
            { \
                predict_8x8[j]( fdec1, edge ); \
                satds_c[j] = pixel_c.cmp[PIXEL_8x8]( fenc, FENC_STRIDE, fdec1, FDEC_STRIDE ) + 9*(j!=pred_mode); \
                if( satds_c[j] < (uint16_t)res_c ) \
                    res_c = satds_c[j] + (j<<16); \
            } \
            predict_8x8[res_c>>16]( fdec1, edge ); \
            int res_a = call_a( pixel_asm.name, fenc, fdec2, edge, bitcosts+8-pred_mode, satds_a ); \
            if( res_c != res_a || memcmp(satds_c, satds_a, sizeof(satds_c)) ) \
            { \
                ok = 0; \
                fprintf( stderr, #name": %d,%d != %d,%d [FAILED]\n", res_c>>16, res_c&0xffff, res_a>>16, res_a&0xffff ); \
                for( int j = 0; j < 9; j++ ) \
                    fprintf( stderr, "%5d ", satds_c[j]); \
                fprintf( stderr, "\n" ); \
                for( int j = 0; j < 9; j++ ) \
                    fprintf( stderr, "%5d ", satds_a[j]); \
                fprintf( stderr, "\n" ); \
                break; \
            } \
            for( int j=0; j<8; j++ ) \
                if( memcmp(fdec1+j*FDEC_STRIDE, fdec2+j*FDEC_STRIDE, 8*sizeof(pixel)) ) \
                    ok = 0; \
            if( !ok ) \
            { \
                fprintf( stderr, #name" [FAILED]\n" ); \
                for( int j=0; j<8; j++ ) \
                { \
                    for( int k=0; k<8; k++ ) \
                        fprintf( stderr, "%02x ", fdec1[k+j*FDEC_STRIDE] ); \
                    fprintf( stderr, "\n" ); \
                } \
                fprintf( stderr, "\n" ); \
                for( int j=0; j<8; j++ ) \
                { \
                    for( int k=0; k<8; k++ ) \
                        fprintf( stderr, "%02x ", fdec2[k+j*FDEC_STRIDE] ); \
                    fprintf( stderr, "\n" ); \
                } \
                fprintf( stderr, "\n" ); \
                break; \
            } \
        } \
    }

    memcpy( pbuf3, pbuf2, 20*FDEC_STRIDE*sizeof(pixel) );
    ok = 1; used_asm = 0;
    TEST_INTRA_X3( intra_satd_x3_16x16, 0 );
    TEST_INTRA_X3( intra_satd_x3_8x16c, 0 );
    TEST_INTRA_X3( intra_satd_x3_8x8c, 0 );
    TEST_INTRA_X3( intra_sa8d_x3_8x8, 1, edge );
    TEST_INTRA_X3( intra_satd_x3_4x4, 0 );
    report( "intra satd_x3 :" );
    ok = 1; used_asm = 0;
    TEST_INTRA_X3( intra_sad_x3_16x16, 0 );
    TEST_INTRA_X3( intra_sad_x3_8x16c, 0 );
    TEST_INTRA_X3( intra_sad_x3_8x8c, 0 );
    TEST_INTRA_X3( intra_sad_x3_8x8, 1, edge );
    TEST_INTRA_X3( intra_sad_x3_4x4, 0 );
    report( "intra sad_x3 :" );
    ok = 1; used_asm = 0;
    TEST_INTRA_X9( intra_satd_x9_4x4, satd );
    TEST_INTRA8_X9( intra_sa8d_x9_8x8, sa8d );
    report( "intra satd_x9 :" );
    ok = 1; used_asm = 0;
    TEST_INTRA_X9( intra_sad_x9_4x4, sad );
    TEST_INTRA8_X9( intra_sad_x9_8x8, sad );
    report( "intra sad_x9 :" );

    ok = 1; used_asm = 0;
    if( pixel_asm.ssd_nv12_core != pixel_ref.ssd_nv12_core )
    {
        used_asm = 1;
        set_func_name( "ssd_nv12" );
        uint64_t res_u_c, res_v_c, res_u_a, res_v_a;
        pixel_c.ssd_nv12_core(   pbuf1, 368, pbuf2, 368, 360, 8, &res_u_c, &res_v_c );
        pixel_asm.ssd_nv12_core( pbuf1, 368, pbuf2, 368, 360, 8, &res_u_a, &res_v_a );
        if( res_u_c != res_u_a || res_v_c != res_v_a )
        {
            ok = 0;
            fprintf( stderr, "ssd_nv12: %"PRIu64",%"PRIu64" != %"PRIu64",%"PRIu64"\n",
                     res_u_c, res_v_c, res_u_a, res_v_a );
        }
        call_c( pixel_c.ssd_nv12_core,   pbuf1, 368, pbuf2, 368, 360, 8, &res_u_c, &res_v_c );
        call_a( pixel_asm.ssd_nv12_core, pbuf1, 368, pbuf2, 368, 360, 8, &res_u_a, &res_v_a );
    }
    report( "ssd_nv12 :" );

    if( pixel_asm.ssim_4x4x2_core != pixel_ref.ssim_4x4x2_core ||
        pixel_asm.ssim_end4 != pixel_ref.ssim_end4 )
    {
        int cnt;
        float res_c, res_a;
        ALIGNED_16( int sums[5][4] ) = {{0}};
        used_asm = ok = 1;
        x264_emms();
        res_c = x264_pixel_ssim_wxh( &pixel_c,   pbuf1+2, 32, pbuf2+2, 32, 32, 28, pbuf3, &cnt );
        res_a = x264_pixel_ssim_wxh( &pixel_asm, pbuf1+2, 32, pbuf2+2, 32, 32, 28, pbuf3, &cnt );
        if( fabs( res_c - res_a ) > 1e-6 )
        {
            ok = 0;
            fprintf( stderr, "ssim: %.7f != %.7f [FAILED]\n", res_c, res_a );
        }
        set_func_name( "ssim_core" );
        call_c2( pixel_c.ssim_4x4x2_core,   pbuf1+2, 32, pbuf2+2, 32, sums );
        call_a2( pixel_asm.ssim_4x4x2_core, pbuf1+2, 32, pbuf2+2, 32, sums );
        set_func_name( "ssim_end" );
        call_c2( pixel_c.ssim_end4,   sums, sums, 4 );
        call_a2( pixel_asm.ssim_end4, sums, sums, 4 );
        report( "ssim :" );
    }

    ok = 1; used_asm = 0;
    for( int i = 0; i < 32; i++ )
        cost_mv[i] = i*10;
    for( int i = 0; i < 100 && ok; i++ )
        if( pixel_asm.ads[i&3] != pixel_ref.ads[i&3] )
        {
            ALIGNED_16( uint16_t sums[72] );
            ALIGNED_16( int dc[4] );
            ALIGNED_16( int16_t mvs_a[32] );
            ALIGNED_16( int16_t mvs_c[32] );
            int mvn_a, mvn_c;
            int thresh = rand() & 0x3fff;
            set_func_name( "esa_ads" );
            for( int j = 0; j < 72; j++ )
                sums[j] = rand() & 0x3fff;
            for( int j = 0; j < 4; j++ )
                dc[j] = rand() & 0x3fff;
            used_asm = 1;
            mvn_c = call_c( pixel_c.ads[i&3], dc, sums, 32, cost_mv, mvs_c, 28, thresh );
            mvn_a = call_a( pixel_asm.ads[i&3], dc, sums, 32, cost_mv, mvs_a, 28, thresh );
            if( mvn_c != mvn_a || memcmp( mvs_c, mvs_a, mvn_c*sizeof(*mvs_c) ) )
            {
                ok = 0;
                printf( "c%d: ", i&3 );
                for( int j = 0; j < mvn_c; j++ )
                    printf( "%d ", mvs_c[j] );
                printf( "\na%d: ", i&3 );
                for( int j = 0; j < mvn_a; j++ )
                    printf( "%d ", mvs_a[j] );
                printf( "\n\n" );
            }
        }
    report( "esa ads:" );

    return ret;
}

static int check_dct( int cpu_ref, int cpu_new )
{
    x264_dct_function_t dct_c;
    x264_dct_function_t dct_ref;
    x264_dct_function_t dct_asm;
    x264_quant_function_t qf;
    int ret = 0, ok, used_asm, interlace = 0;
    ALIGNED_16( dctcoef dct1[16][16] );
    ALIGNED_16( dctcoef dct2[16][16] );
    ALIGNED_16( dctcoef dct4[16][16] );
    ALIGNED_16( dctcoef dct8[4][64] );
    ALIGNED_16( dctcoef dctdc[2][8] );
    x264_t h_buf;
    x264_t *h = &h_buf;

    x264_dct_init( 0, &dct_c );
    x264_dct_init( cpu_ref, &dct_ref);
    x264_dct_init( cpu_new, &dct_asm );

    memset( h, 0, sizeof(*h) );
    x264_param_default( &h->param );
    h->sps->i_chroma_format_idc = 1;
    h->chroma_qp_table = i_chroma_qp_table + 12;
    h->param.analyse.i_luma_deadzone[0] = 0;
    h->param.analyse.i_luma_deadzone[1] = 0;
    h->param.analyse.b_transform_8x8 = 1;
    for( int i = 0; i < 6; i++ )
        h->pps->scaling_list[i] = x264_cqm_flat16;
    x264_cqm_init( h );
    x264_quant_init( h, 0, &qf );

    /* overflow test cases */
    for( int i = 0; i < 5; i++ )
    {
        pixel *enc = &pbuf3[16*i*FENC_STRIDE];
        pixel *dec = &pbuf4[16*i*FDEC_STRIDE];

        for( int j = 0; j < 16; j++ )
        {
            int cond_a = (i < 2) ? 1 : ((j&3) == 0 || (j&3) == (i-1));
            int cond_b = (i == 0) ? 1 : !cond_a;
            enc[0] = enc[1] = cond_a ? PIXEL_MAX : 0;
            enc[2] = enc[3] = cond_b ? PIXEL_MAX : 0;

            for( int k = 0; k < 4; k++ )
                dec[k] = PIXEL_MAX - enc[k];

            enc += FENC_STRIDE;
            dec += FDEC_STRIDE;
        }
    }

#define TEST_DCT( name, t1, t2, size ) \
    if( dct_asm.name != dct_ref.name ) \
    { \
        set_func_name( #name ); \
        used_asm = 1; \
        pixel *enc = pbuf3; \
        pixel *dec = pbuf4; \
        for( int j = 0; j < 5; j++) \
        { \
            call_c( dct_c.name, t1, &pbuf1[j*64], &pbuf2[j*64] ); \
            call_a( dct_asm.name, t2, &pbuf1[j*64], &pbuf2[j*64] ); \
            if( memcmp( t1, t2, size*sizeof(dctcoef) ) ) \
            { \
                ok = 0; \
                fprintf( stderr, #name " [FAILED]\n" ); \
                break; \
            } \
            call_c( dct_c.name, t1, enc, dec ); \
            call_a( dct_asm.name, t2, enc, dec ); \
            if( memcmp( t1, t2, size*sizeof(dctcoef) ) ) \
            { \
                ok = 0; \
                fprintf( stderr, #name " [FAILED] (overflow)\n" ); \
                break; \
            } \
            enc += 16*FENC_STRIDE; \
            dec += 16*FDEC_STRIDE; \
        } \
    }
    ok = 1; used_asm = 0;
    TEST_DCT( sub4x4_dct, dct1[0], dct2[0], 16 );
    TEST_DCT( sub8x8_dct, dct1, dct2, 16*4 );
    TEST_DCT( sub8x8_dct_dc, dctdc[0], dctdc[1], 4 );
    TEST_DCT( sub8x16_dct_dc, dctdc[0], dctdc[1], 8 );
    TEST_DCT( sub16x16_dct, dct1, dct2, 16*16 );
    report( "sub_dct4 :" );

    ok = 1; used_asm = 0;
    TEST_DCT( sub8x8_dct8, (void*)dct1[0], (void*)dct2[0], 64 );
    TEST_DCT( sub16x16_dct8, (void*)dct1, (void*)dct2, 64*4 );
    report( "sub_dct8 :" );
#undef TEST_DCT

    // fdct and idct are denormalized by different factors, so quant/dequant
    // is needed to force the coefs into the right range.
    dct_c.sub16x16_dct( dct4, pbuf1, pbuf2 );
    dct_c.sub16x16_dct8( dct8, pbuf1, pbuf2 );
    for( int i = 0; i < 16; i++ )
    {
        qf.quant_4x4( dct4[i], h->quant4_mf[CQM_4IY][20], h->quant4_bias[CQM_4IY][20] );
        qf.dequant_4x4( dct4[i], h->dequant4_mf[CQM_4IY], 20 );
    }
    for( int i = 0; i < 4; i++ )
    {
        qf.quant_8x8( dct8[i], h->quant8_mf[CQM_8IY][20], h->quant8_bias[CQM_8IY][20] );
        qf.dequant_8x8( dct8[i], h->dequant8_mf[CQM_8IY], 20 );
    }
    x264_cqm_delete( h );

#define TEST_IDCT( name, src ) \
    if( dct_asm.name != dct_ref.name ) \
    { \
        set_func_name( #name ); \
        used_asm = 1; \
        memcpy( pbuf3, pbuf1, 32*32 * sizeof(pixel) ); \
        memcpy( pbuf4, pbuf1, 32*32 * sizeof(pixel) ); \
        memcpy( dct1, src, 256 * sizeof(dctcoef) ); \
        memcpy( dct2, src, 256 * sizeof(dctcoef) ); \
        call_c1( dct_c.name, pbuf3, (void*)dct1 ); \
        call_a1( dct_asm.name, pbuf4, (void*)dct2 ); \
        if( memcmp( pbuf3, pbuf4, 32*32 * sizeof(pixel) ) ) \
        { \
            ok = 0; \
            fprintf( stderr, #name " [FAILED]\n" ); \
        } \
        call_c2( dct_c.name, pbuf3, (void*)dct1 ); \
        call_a2( dct_asm.name, pbuf4, (void*)dct2 ); \
    }
    ok = 1; used_asm = 0;
    TEST_IDCT( add4x4_idct, dct4 );
    TEST_IDCT( add8x8_idct, dct4 );
    TEST_IDCT( add8x8_idct_dc, dct4 );
    TEST_IDCT( add16x16_idct, dct4 );
    TEST_IDCT( add16x16_idct_dc, dct4 );
    report( "add_idct4 :" );

    ok = 1; used_asm = 0;
    TEST_IDCT( add8x8_idct8, dct8 );
    TEST_IDCT( add16x16_idct8, dct8 );
    report( "add_idct8 :" );
#undef TEST_IDCT

#define TEST_DCTDC( name )\
    ok = 1; used_asm = 0;\
    if( dct_asm.name != dct_ref.name )\
    {\
        set_func_name( #name );\
        used_asm = 1;\
        uint16_t *p = (uint16_t*)buf1;\
        for( int i = 0; i < 16 && ok; i++ )\
        {\
            for( int j = 0; j < 16; j++ )\
                dct1[0][j] = !i ? (j^j>>1^j>>2^j>>3)&1 ? PIXEL_MAX*16 : -PIXEL_MAX*16 /* max dc */\
                           : i<8 ? (*p++)&1 ? PIXEL_MAX*16 : -PIXEL_MAX*16 /* max elements */\
                           : ((*p++)&0x1fff)-0x1000; /* general case */\
            memcpy( dct2, dct1, 16 * sizeof(dctcoef) );\
            call_c1( dct_c.name, dct1[0] );\
            call_a1( dct_asm.name, dct2[0] );\
            if( memcmp( dct1, dct2, 16 * sizeof(dctcoef) ) )\
                ok = 0;\
        }\
        call_c2( dct_c.name, dct1[0] );\
        call_a2( dct_asm.name, dct2[0] );\
    }\
    report( #name " :" );

    TEST_DCTDC(  dct4x4dc );
    TEST_DCTDC( idct4x4dc );
#undef TEST_DCTDC

#define TEST_DCTDC_CHROMA( name )\
    ok = 1; used_asm = 0;\
    if( dct_asm.name != dct_ref.name )\
    {\
        set_func_name( #name );\
        used_asm = 1;\
        uint16_t *p = (uint16_t*)buf1;\
        for( int i = 0; i < 16 && ok; i++ )\
        {\
            for( int j = 0; j < 8; j++ )\
                dct1[j][0] = !i ? (j^j>>1^j>>2)&1 ? PIXEL_MAX*16 : -PIXEL_MAX*16 /* max dc */\
                           : i<8 ? (*p++)&1 ? PIXEL_MAX*16 : -PIXEL_MAX*16 /* max elements */\
                           : ((*p++)&0x1fff)-0x1000; /* general case */\
            memcpy( dct2, dct1, 8*16 * sizeof(dctcoef) );\
            call_c1( dct_c.name, dctdc[0], dct1 );\
            call_a1( dct_asm.name, dctdc[1], dct2 );\
            if( memcmp( dctdc[0], dctdc[1], 8 * sizeof(dctcoef) ) || memcmp( dct1, dct2, 8*16 * sizeof(dctcoef) ) )\
            {\
                ok = 0;\
                fprintf( stderr, #name " [FAILED]\n" ); \
            }\
        }\
        call_c2( dct_c.name, dctdc[0], dct1 );\
        call_a2( dct_asm.name, dctdc[1], dct2 );\
    }\
    report( #name " :" );

    TEST_DCTDC_CHROMA( dct2x4dc );
#undef TEST_DCTDC_CHROMA

    x264_zigzag_function_t zigzag_c[2];
    x264_zigzag_function_t zigzag_ref[2];
    x264_zigzag_function_t zigzag_asm[2];

    ALIGNED_16( dctcoef level1[64] );
    ALIGNED_16( dctcoef level2[64] );

#define TEST_ZIGZAG_SCAN( name, t1, t2, dct, size ) \
    if( zigzag_asm[interlace].name != zigzag_ref[interlace].name ) \
    { \
        set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" ); \
        used_asm = 1; \
        memcpy(dct, buf1, size*sizeof(dctcoef)); \
        call_c( zigzag_c[interlace].name, t1, dct ); \
        call_a( zigzag_asm[interlace].name, t2, dct ); \
        if( memcmp( t1, t2, size*sizeof(dctcoef) ) ) \
        { \
            ok = 0; \
            fprintf( stderr, #name " [FAILED]\n" ); \
        } \
    }

#define TEST_ZIGZAG_SUB( name, t1, t2, size ) \
    if( zigzag_asm[interlace].name != zigzag_ref[interlace].name ) \
    { \
        int nz_a, nz_c; \
        set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" ); \
        used_asm = 1; \
        memcpy( pbuf3, pbuf1, 16*FDEC_STRIDE * sizeof(pixel) ); \
        memcpy( pbuf4, pbuf1, 16*FDEC_STRIDE * sizeof(pixel) ); \
        nz_c = call_c1( zigzag_c[interlace].name, t1, pbuf2, pbuf3 ); \
        nz_a = call_a1( zigzag_asm[interlace].name, t2, pbuf2, pbuf4 ); \
        if( memcmp( t1, t2, size*sizeof(dctcoef) ) || memcmp( pbuf3, pbuf4, 16*FDEC_STRIDE*sizeof(pixel) ) || nz_c != nz_a ) \
        { \
            ok = 0; \
            fprintf( stderr, #name " [FAILED]\n" ); \
        } \
        call_c2( zigzag_c[interlace].name, t1, pbuf2, pbuf3 ); \
        call_a2( zigzag_asm[interlace].name, t2, pbuf2, pbuf4 ); \
    }

#define TEST_ZIGZAG_SUBAC( name, t1, t2 ) \
    if( zigzag_asm[interlace].name != zigzag_ref[interlace].name ) \
    { \
        int nz_a, nz_c; \
        dctcoef dc_a, dc_c; \
        set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" ); \
        used_asm = 1; \
        for( int i = 0; i < 2; i++ ) \
        { \
            memcpy( pbuf3, pbuf2, 16*FDEC_STRIDE * sizeof(pixel) ); \
            memcpy( pbuf4, pbuf2, 16*FDEC_STRIDE * sizeof(pixel) ); \
            for( int j = 0; j < 4; j++ ) \
            { \
                memcpy( pbuf3 + j*FDEC_STRIDE, (i?pbuf1:pbuf2) + j*FENC_STRIDE, 4 * sizeof(pixel) ); \
                memcpy( pbuf4 + j*FDEC_STRIDE, (i?pbuf1:pbuf2) + j*FENC_STRIDE, 4 * sizeof(pixel) ); \
            } \
            nz_c = call_c1( zigzag_c[interlace].name, t1, pbuf2, pbuf3, &dc_c ); \
            nz_a = call_a1( zigzag_asm[interlace].name, t2, pbuf2, pbuf4, &dc_a ); \
            if( memcmp( t1+1, t2+1, 15*sizeof(dctcoef) ) || memcmp( pbuf3, pbuf4, 16*FDEC_STRIDE * sizeof(pixel) ) || nz_c != nz_a || dc_c != dc_a ) \
            { \
                ok = 0; \
                fprintf( stderr, #name " [FAILED]\n" ); \
                break; \
            } \
        } \
        call_c2( zigzag_c[interlace].name, t1, pbuf2, pbuf3, &dc_c ); \
        call_a2( zigzag_asm[interlace].name, t2, pbuf2, pbuf4, &dc_a ); \
    }

#define TEST_INTERLEAVE( name, t1, t2, dct, size ) \
    if( zigzag_asm[interlace].name != zigzag_ref[interlace].name ) \
    { \
        for( int j = 0; j < 100; j++ ) \
        { \
            set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" ); \
            used_asm = 1; \
            memcpy(dct, buf1, size*sizeof(dctcoef)); \
            for( int i = 0; i < size; i++ ) \
                dct[i] = rand()&0x1F ? 0 : dct[i]; \
            memcpy(buf3, buf4, 10); \
            call_c( zigzag_c[interlace].name, t1, dct, buf3 ); \
            call_a( zigzag_asm[interlace].name, t2, dct, buf4 ); \
            if( memcmp( t1, t2, size*sizeof(dctcoef) ) || memcmp( buf3, buf4, 10 ) ) \
            { \
                ok = 0; \
            } \
        } \
    }

    x264_zigzag_init( 0, &zigzag_c[0], &zigzag_c[1] );
    x264_zigzag_init( cpu_ref, &zigzag_ref[0], &zigzag_ref[1] );
    x264_zigzag_init( cpu_new, &zigzag_asm[0], &zigzag_asm[1] );

    ok = 1; used_asm = 0;
    TEST_INTERLEAVE( interleave_8x8_cavlc, level1, level2, dct1[0], 64 );
    report( "zigzag_interleave :" );

    for( interlace = 0; interlace <= 1; interlace++ )
    {
        ok = 1; used_asm = 0;
        TEST_ZIGZAG_SCAN( scan_8x8, level1, level2, (void*)dct1, 64 );
        TEST_ZIGZAG_SCAN( scan_4x4, level1, level2, dct1[0], 16  );
        TEST_ZIGZAG_SUB( sub_4x4, level1, level2, 16 );
        TEST_ZIGZAG_SUBAC( sub_4x4ac, level1, level2 );
        report( interlace ? "zigzag_field :" : "zigzag_frame :" );
    }
#undef TEST_ZIGZAG_SCAN
#undef TEST_ZIGZAG_SUB

    return ret;
}

static int check_mc( int cpu_ref, int cpu_new )
{
    x264_mc_functions_t mc_c;
    x264_mc_functions_t mc_ref;
    x264_mc_functions_t mc_a;
    x264_pixel_function_t pixf;

    pixel *src     = &(pbuf1)[2*64+2];
    pixel *src2[4] = { &(pbuf1)[3*64+2], &(pbuf1)[5*64+2],
                       &(pbuf1)[7*64+2], &(pbuf1)[9*64+2] };
    pixel *dst1    = pbuf3;
    pixel *dst2    = pbuf4;

    int ret = 0, ok, used_asm;

    x264_mc_init( 0, &mc_c );
    x264_mc_init( cpu_ref, &mc_ref );
    x264_mc_init( cpu_new, &mc_a );
    x264_pixel_init( 0, &pixf );

#define MC_TEST_LUMA( w, h ) \
        if( mc_a.mc_luma != mc_ref.mc_luma && !(w&(w-1)) && h<=16 ) \
        { \
            const x264_weight_t *weight = x264_weight_none; \
            set_func_name( "mc_luma_%dx%d", w, h ); \
            used_asm = 1; \
            for( int i = 0; i < 1024; i++ ) \
                pbuf3[i] = pbuf4[i] = 0xCD; \
            call_c( mc_c.mc_luma, dst1, 32, src2, 64, dx, dy, w, h, weight ); \
            call_a( mc_a.mc_luma, dst2, 32, src2, 64, dx, dy, w, h, weight ); \
            if( memcmp( pbuf3, pbuf4, 1024 * sizeof(pixel) ) ) \
            { \
                fprintf( stderr, "mc_luma[mv(%d,%d) %2dx%-2d]     [FAILED]\n", dx, dy, w, h ); \
                ok = 0; \
            } \
        } \
        if( mc_a.get_ref != mc_ref.get_ref ) \
        { \
            pixel *ref = dst2; \
            int ref_stride = 32; \
            int w_checked = ( ( sizeof(pixel) == 2 && (w == 12 || w == 20)) ? w-2 : w ); \
            const x264_weight_t *weight = x264_weight_none; \
            set_func_name( "get_ref_%dx%d", w_checked, h ); \
            used_asm = 1; \
            for( int i = 0; i < 1024; i++ ) \
                pbuf3[i] = pbuf4[i] = 0xCD; \
            call_c( mc_c.mc_luma, dst1, 32, src2, 64, dx, dy, w, h, weight ); \
            ref = (pixel*)call_a( mc_a.get_ref, ref, &ref_stride, src2, 64, dx, dy, w, h, weight ); \
            for( int i = 0; i < h; i++ ) \
                if( memcmp( dst1+i*32, ref+i*ref_stride, w_checked * sizeof(pixel) ) ) \
                { \
                    fprintf( stderr, "get_ref[mv(%d,%d) %2dx%-2d]     [FAILED]\n", dx, dy, w_checked, h ); \
                    ok = 0; \
                    break; \
                } \
        }

#define MC_TEST_CHROMA( w, h ) \
        if( mc_a.mc_chroma != mc_ref.mc_chroma ) \
        { \
            set_func_name( "mc_chroma_%dx%d", w, h ); \
            used_asm = 1; \
            for( int i = 0; i < 1024; i++ ) \
                pbuf3[i] = pbuf4[i] = 0xCD; \
            call_c( mc_c.mc_chroma, dst1, dst1+8, 16, src, 64, dx, dy, w, h ); \
            call_a( mc_a.mc_chroma, dst2, dst2+8, 16, src, 64, dx, dy, w, h ); \
            /* mc_chroma width=2 may write garbage to the right of dst. ignore that. */ \
            for( int j = 0; j < h; j++ ) \
                for( int i = w; i < 8; i++ ) \
                { \
                    dst2[i+j*16+8] = dst1[i+j*16+8]; \
                    dst2[i+j*16] = dst1[i+j*16]; \
                } \
            if( memcmp( pbuf3, pbuf4, 1024 * sizeof(pixel) ) ) \
            { \
                fprintf( stderr, "mc_chroma[mv(%d,%d) %2dx%-2d]     [FAILED]\n", dx, dy, w, h ); \
                ok = 0; \
            } \
        }
    ok = 1; used_asm = 0;
    for( int dy = -8; dy < 8; dy++ )
        for( int dx = -128; dx < 128; dx++ )
        {
            if( rand()&15 ) continue; // running all of them is too slow
            MC_TEST_LUMA( 20, 18 );
            MC_TEST_LUMA( 16, 16 );
            MC_TEST_LUMA( 16, 8 );
            MC_TEST_LUMA( 12, 10 );
            MC_TEST_LUMA( 8, 16 );
            MC_TEST_LUMA( 8, 8 );
            MC_TEST_LUMA( 8, 4 );
            MC_TEST_LUMA( 4, 8 );
            MC_TEST_LUMA( 4, 4 );
        }
    report( "mc luma :" );

    ok = 1; used_asm = 0;
    for( int dy = -1; dy < 9; dy++ )
        for( int dx = -128; dx < 128; dx++ )
        {
            if( rand()&15 ) continue;
            MC_TEST_CHROMA( 8, 8 );
            MC_TEST_CHROMA( 8, 4 );
            MC_TEST_CHROMA( 4, 8 );
            MC_TEST_CHROMA( 4, 4 );
            MC_TEST_CHROMA( 4, 2 );
            MC_TEST_CHROMA( 2, 4 );
            MC_TEST_CHROMA( 2, 2 );
        }
    report( "mc chroma :" );
#undef MC_TEST_LUMA
#undef MC_TEST_CHROMA

#define MC_TEST_AVG( name, weight ) \
{ \
    ok = 1, used_asm = 0; \
    for( int i = 0; i < 12; i++ ) \
    { \
        memcpy( pbuf3, pbuf1+320, 320 * sizeof(pixel) ); \
        memcpy( pbuf4, pbuf1+320, 320 * sizeof(pixel) ); \
        if( mc_a.name[i] != mc_ref.name[i] ) \
        { \
            set_func_name( "%s_%s", #name, pixel_names[i] ); \
            used_asm = 1; \
            call_c1( mc_c.name[i], pbuf3, 16, pbuf2+1, 16, pbuf1+18, 16, weight ); \
            call_a1( mc_a.name[i], pbuf4, 16, pbuf2+1, 16, pbuf1+18, 16, weight ); \
            if( memcmp( pbuf3, pbuf4, 320 * sizeof(pixel) ) ) \
            { \
                ok = 0; \
                fprintf( stderr, #name "[%d]: [FAILED]\n", i ); \
            } \
            call_c2( mc_c.name[i], pbuf3, 16, pbuf2+1, 16, pbuf1+18, 16, weight ); \
            call_a2( mc_a.name[i], pbuf4, 16, pbuf2+1, 16, pbuf1+18, 16, weight ); \
        } \
    } \
}

    for( int w = -63; w <= 127 && ok; w++ )
        MC_TEST_AVG( avg, w );
    report( "mc wpredb :" );

#define MC_TEST_WEIGHT( name, weight, aligned ) \
    int align_off = (aligned ? 0 : rand()%16); \
    ok = 1, used_asm = 0; \
    for( int i = 1; i <= 5; i++ ) \
    { \
        ALIGNED_16( pixel buffC[640] ); \
        ALIGNED_16( pixel buffA[640] ); \
        int j = X264_MAX( i*4, 2 ); \
        memset( buffC, 0, 640 * sizeof(pixel) ); \
        memset( buffA, 0, 640 * sizeof(pixel) ); \
        x264_t ha; \
        ha.mc = mc_a; \
        /* w12 is the same as w16 in some cases */ \
        if( i == 3 && mc_a.name[i] == mc_a.name[i+1] ) \
            continue; \
        if( mc_a.name[i] != mc_ref.name[i] ) \
        { \
            set_func_name( "%s_w%d", #name, j ); \
            used_asm = 1; \
            call_c1( mc_c.weight[i], buffC, 32, pbuf2+align_off, 32, &weight, 16 ); \
            mc_a.weight_cache(&ha, &weight); \
            call_a1( weight.weightfn[i], buffA, 32, pbuf2+align_off, 32, &weight, 16 ); \
            for( int k = 0; k < 16; k++ ) \
                if( memcmp( &buffC[k*32], &buffA[k*32], j * sizeof(pixel) ) ) \
                { \
                    ok = 0; \
                    fprintf( stderr, #name "[%d]: [FAILED] s:%d o:%d d%d\n", i, s, o, d ); \
                    break; \
                } \
            call_c2( mc_c.weight[i], buffC, 32, pbuf2+align_off, 32, &weight, 16 ); \
            call_a2( weight.weightfn[i], buffA, 32, pbuf2+align_off, 32, &weight, 16 ); \
        } \
    }

    ok = 1; used_asm = 0;

    int align_cnt = 0;
    for( int s = 0; s <= 127 && ok; s++ )
    {
        for( int o = -128; o <= 127 && ok; o++ )
        {
            if( rand() & 2047 ) continue;
            for( int d = 0; d <= 7 && ok; d++ )
            {
                if( s == 1<<d )
                    continue;
                x264_weight_t weight = { .i_scale = s, .i_denom = d, .i_offset = o };
                MC_TEST_WEIGHT( weight, weight, (align_cnt++ % 4) );
            }
        }

    }
    report( "mc weight :" );

    ok = 1; used_asm = 0;
    for( int o = 0; o <= 127 && ok; o++ )
    {
        int s = 1, d = 0;
        if( rand() & 15 ) continue;
        x264_weight_t weight = { .i_scale = 1, .i_denom = 0, .i_offset = o };
        MC_TEST_WEIGHT( offsetadd, weight, (align_cnt++ % 4) );
    }
    report( "mc offsetadd :" );
    ok = 1; used_asm = 0;
    for( int o = -128; o < 0 && ok; o++ )
    {
        int s = 1, d = 0;
        if( rand() & 15 ) continue;
        x264_weight_t weight = { .i_scale = 1, .i_denom = 0, .i_offset = o };
        MC_TEST_WEIGHT( offsetsub, weight, (align_cnt++ % 4) );
    }
    report( "mc offsetsub :" );

    ok = 1; used_asm = 0;
    for( int height = 8; height <= 16; height += 8 )
    {
        if( mc_a.store_interleave_chroma != mc_ref.store_interleave_chroma )
        {
            set_func_name( "store_interleave_chroma" );
            used_asm = 1;
            memset( pbuf3, 0, 64*height );
            memset( pbuf4, 0, 64*height );
            call_c( mc_c.store_interleave_chroma, pbuf3, 64, pbuf1, pbuf1+16, height );
            call_a( mc_a.store_interleave_chroma, pbuf4, 64, pbuf1, pbuf1+16, height );
            if( memcmp( pbuf3, pbuf4, 64*height ) )
            {
                ok = 0;
                fprintf( stderr, "store_interleave_chroma FAILED: h=%d\n", height );
                break;
            }
        }
        if( mc_a.load_deinterleave_chroma_fenc != mc_ref.load_deinterleave_chroma_fenc )
        {
            set_func_name( "load_deinterleave_chroma_fenc" );
            used_asm = 1;
            call_c( mc_c.load_deinterleave_chroma_fenc, pbuf3, pbuf1, 64, height );
            call_a( mc_a.load_deinterleave_chroma_fenc, pbuf4, pbuf1, 64, height );
            if( memcmp( pbuf3, pbuf4, FENC_STRIDE*height ) )
            {
                ok = 0;
                fprintf( stderr, "load_deinterleave_chroma_fenc FAILED: h=%d\n", height );
                break;
            }
        }
        if( mc_a.load_deinterleave_chroma_fdec != mc_ref.load_deinterleave_chroma_fdec )
        {
            set_func_name( "load_deinterleave_chroma_fdec" );
            used_asm = 1;
            call_c( mc_c.load_deinterleave_chroma_fdec, pbuf3, pbuf1, 64, height );
            call_a( mc_a.load_deinterleave_chroma_fdec, pbuf4, pbuf1, 64, height );
            if( memcmp( pbuf3, pbuf4, FDEC_STRIDE*height ) )
            {
                ok = 0;
                fprintf( stderr, "load_deinterleave_chroma_fdec FAILED: h=%d\n", height );
                break;
            }
        }
    }
    report( "store_interleave :" );

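    /* plane_copy tests: each spec is { width, height, source stride }.  Zero
     * and negative strides are included on purpose; src1 is offset to the
     * last row whenever the stride walks backwards, so all reads stay inside
     * pbuf1. */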
    struct plane_spec {
        int w, h, src_stride;
    } plane_specs[] = { {2,2,2}, {8,6,8}, {20,31,24}, {32,8,40}, {256,10,272}, {504,7,505}, {528,6,528}, {256,10,-256}, {263,9,-264}, {1904,1,0} };
    ok = 1; used_asm = 0;
    if( mc_a.plane_copy != mc_ref.plane_copy )
    {
        set_func_name( "plane_copy" );
        used_asm = 1;
        for( int i = 0; i < sizeof(plane_specs)/sizeof(*plane_specs); i++ )
        {
            int w = plane_specs[i].w;
            int h = plane_specs[i].h;
            int src_stride = plane_specs[i].src_stride;
            int dst_stride = (w + 127) & ~63;
            assert( dst_stride * h <= 0x1000 );
            pixel *src1 = pbuf1 + X264_MAX(0, -src_stride) * (h-1);
            memset( pbuf3, 0, 0x1000*sizeof(pixel) );
            memset( pbuf4, 0, 0x1000*sizeof(pixel) );
            call_c( mc_c.plane_copy, pbuf3, dst_stride, src1, src_stride, w, h );
            call_a( mc_a.plane_copy, pbuf4, dst_stride, src1, src_stride, w, h );
            for( int y = 0; y < h; y++ )
                if( memcmp( pbuf3+y*dst_stride, pbuf4+y*dst_stride, w*sizeof(pixel) ) )
                {
                    ok = 0;
                    fprintf( stderr, "plane_copy FAILED: w=%d h=%d stride=%d\n", w, h, src_stride );
                    break;
                }
        }
    }

    if( mc_a.plane_copy_interleave != mc_ref.plane_copy_interleave )
    {
        set_func_name( "plane_copy_interleave" );
        used_asm = 1;
        for( int i = 0; i < sizeof(plane_specs)/sizeof(*plane_specs); i++ )
        {
            int w = (plane_specs[i].w + 1) >> 1;
            int h = plane_specs[i].h;
            int src_stride = (plane_specs[i].src_stride + 1) >> 1;
            int dst_stride = (2*w + 127) & ~63;
            assert( dst_stride * h <= 0x1000 );
            pixel *src1 = pbuf1 + X264_MAX(0, -src_stride) * (h-1);
            memset( pbuf3, 0, 0x1000*sizeof(pixel) );
            memset( pbuf4, 0, 0x1000*sizeof(pixel) );
            call_c( mc_c.plane_copy_interleave, pbuf3, dst_stride, src1, src_stride, src1+1024, src_stride+16, w, h );
            call_a( mc_a.plane_copy_interleave, pbuf4, dst_stride, src1, src_stride, src1+1024, src_stride+16, w, h );
            for( int y = 0; y < h; y++ )
                if( memcmp( pbuf3+y*dst_stride, pbuf4+y*dst_stride, 2*w*sizeof(pixel) ) )
                {
                    ok = 0;
                    fprintf( stderr, "plane_copy_interleave FAILED: w=%d h=%d stride=%d\n", w, h, src_stride );
                    break;
                }
        }
    }

    if( mc_a.plane_copy_deinterleave != mc_ref.plane_copy_deinterleave )
    {
        set_func_name( "plane_copy_deinterleave" );
        used_asm = 1;
        for( int i = 0; i < sizeof(plane_specs)/sizeof(*plane_specs); i++ )
        {
            int w = (plane_specs[i].w + 1) >> 1;
            int h = plane_specs[i].h;
            int dst_stride = w;
            int src_stride = (2*w + 127) & ~63;
            int offv = (dst_stride*h + 31) & ~15;
            memset( pbuf3, 0, 0x1000 );
            memset( pbuf4, 0, 0x1000 );
            call_c( mc_c.plane_copy_deinterleave, pbuf3, dst_stride, pbuf3+offv, dst_stride, pbuf1, src_stride, w, h );
            call_a( mc_a.plane_copy_deinterleave, pbuf4, dst_stride, pbuf4+offv, dst_stride, pbuf1, src_stride, w, h );
            for( int y = 0; y < h; y++ )
                if( memcmp( pbuf3+y*dst_stride,      pbuf4+y*dst_stride, w ) ||
                    memcmp( pbuf3+y*dst_stride+offv, pbuf4+y*dst_stride+offv, w ) )
                {
                    ok = 0;
                    fprintf( stderr, "plane_copy_deinterleave FAILED: w=%d h=%d stride=%d\n", w, h, src_stride );
                    break;
                }
        }
    }
    report( "plane_copy :" );

    if( mc_a.hpel_filter != mc_ref.hpel_filter )
    {
        pixel *srchpel = pbuf1+8+2*64;
        pixel *dstc[3] = { pbuf3+8, pbuf3+8+16*64, pbuf3+8+32*64 };
        pixel *dsta[3] = { pbuf4+8, pbuf4+8+16*64, pbuf4+8+32*64 };
        void *tmp = pbuf3+49*64;
        set_func_name( "hpel_filter" );
        ok = 1; used_asm = 1;
        memset( pbuf3, 0, 4096 * sizeof(pixel) );
        memset( pbuf4, 0, 4096 * sizeof(pixel) );
        call_c( mc_c.hpel_filter, dstc[0], dstc[1], dstc[2], srchpel, 64, 48, 10, tmp );
        call_a( mc_a.hpel_filter, dsta[0], dsta[1], dsta[2], srchpel, 64, 48, 10, tmp );
        for( int i = 0; i < 3; i++ )
            for( int j = 0; j < 10; j++ )
                //FIXME ideally the first pixels would match too, but they aren't actually used
                if( memcmp( dstc[i]+j*64+2, dsta[i]+j*64+2, 43 * sizeof(pixel) ) )
                {
                    ok = 0;
                    fprintf( stderr, "hpel filter differs at plane %c line %d\n", "hvc"[i], j );
                    for( int k = 0; k < 48; k++ )
                        printf( "%02x%s", dstc[i][j*64+k], (k+1)&3 ? "" : " " );
                    printf( "\n" );
                    for( int k = 0; k < 48; k++ )
                        printf( "%02x%s", dsta[i][j*64+k], (k+1)&3 ? "" : " " );
                    printf( "\n" );
                    break;
                }
        report( "hpel filter :" );
    }

    if( mc_a.frame_init_lowres_core != mc_ref.frame_init_lowres_core )
    {
        pixel *dstc[4] = { pbuf3, pbuf3+1024, pbuf3+2048, pbuf3+3072 };
        pixel *dsta[4] = { pbuf4, pbuf4+1024, pbuf4+2048, pbuf4+3072 };
        set_func_name( "lowres_init" );
        ok = 1; used_asm = 1;
        for( int w = 40; w <= 48; w += 8 )
        {
            int stride = (w+8)&~15;
            call_c( mc_c.frame_init_lowres_core, pbuf1, dstc[0], dstc[1], dstc[2], dstc[3], w*2, stride, w, 16 );
            call_a( mc_a.frame_init_lowres_core, pbuf1, dsta[0], dsta[1], dsta[2], dsta[3], w*2, stride, w, 16 );
            for( int i = 0; i < 16; i++ )
            {
                for( int j = 0; j < 4; j++ )
                    if( memcmp( dstc[j]+i*stride, dsta[j]+i*stride, w * sizeof(pixel) ) )
                    {
                        ok = 0;
                        fprintf( stderr, "frame_init_lowres differs at plane %d line %d\n", j, i );
                        for( int k = 0; k < w; k++ )
                            printf( "%d ", dstc[j][k+i*stride] );
                        printf( "\n" );
                        for( int k = 0; k < w; k++ )
                            printf( "%d ", dsta[j][k+i*stride] );
                        printf( "\n" );
                        break;
                    }
            }
        }
        report( "lowres init :" );
    }

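/* INTEGRAL_INIT: verify the integral-image (summed-area table) row setup.
 * buf3 and buf4 start as identical copies of buf1, so any divergence in the
 * compared rows must come from the function under test. */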
#define INTEGRAL_INIT( name, size, ... )\
    if( mc_a.name != mc_ref.name )\
    {\
        int stride = 80;\
        set_func_name( #name );\
        used_asm = 1;\
        memcpy( buf3, buf1, size*2*stride );\
        memcpy( buf4, buf1, size*2*stride );\
        uint16_t *sum = (uint16_t*)buf3;\
        call_c1( mc_c.name, __VA_ARGS__ );\
        sum = (uint16_t*)buf4;\
        call_a1( mc_a.name, __VA_ARGS__ );\
        if( memcmp( buf3, buf4, (stride-8)*2 ) \
            || (size>9 && memcmp( buf3+18*stride, buf4+18*stride, (stride-8)*2 )))\
            ok = 0;\
        call_c2( mc_c.name, __VA_ARGS__ );\
        call_a2( mc_a.name, __VA_ARGS__ );\
    }
    ok = 1; used_asm = 0;
    INTEGRAL_INIT( integral_init4h, 2, sum+stride, pbuf2, stride );
    INTEGRAL_INIT( integral_init8h, 2, sum+stride, pbuf2, stride );
    INTEGRAL_INIT( integral_init4v, 14, sum, sum+9*stride, stride );
    INTEGRAL_INIT( integral_init8v, 9, sum, stride );
    report( "integral init :" );

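    /* mbtree_propagate_cost does float math; x264_emms() clears any leftover
     * MMX state first so the FPU/SSE float path behaves correctly. */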
    if( mc_a.mbtree_propagate_cost != mc_ref.mbtree_propagate_cost )
    {
        x264_emms();
        for( int i = 0; i < 10; i++ )
        {
            float fps_factor = (rand()&65535) / 256.;
            ok = 1; used_asm = 1;
            set_func_name( "mbtree_propagate" );
            int *dsta = (int*)buf3;
            int *dstc = dsta+400;
            uint16_t *prop = (uint16_t*)buf1;
            uint16_t *intra = (uint16_t*)buf4;
            uint16_t *inter = intra+128;
            uint16_t *qscale = inter+128;
            uint16_t *rnd = (uint16_t*)buf2;
            x264_emms();
            for( int j = 0; j < 100; j++ )
            {
                intra[j]  = *rnd++ & 0x7fff;
                intra[j] += !intra[j];
                inter[j]  = *rnd++ & 0x7fff;
                qscale[j] = *rnd++ & 0x7fff;
            }
            call_c( mc_c.mbtree_propagate_cost, dstc, prop, intra, inter, qscale, &fps_factor, 100 );
            call_a( mc_a.mbtree_propagate_cost, dsta, prop, intra, inter, qscale, &fps_factor, 100 );
            // I don't care about exact rounding, this is just how close the floating-point implementation happens to be
            x264_emms();
            for( int j = 0; j < 100 && ok; j++ )
            {
                ok &= abs( dstc[j]-dsta[j] ) <= 1 || fabs( (double)dstc[j]/dsta[j]-1 ) < 1e-4;
                if( !ok )
                    fprintf( stderr, "mbtree_propagate FAILED: %f !~= %f\n", (double)dstc[j], (double)dsta[j] );
            }
        }
        report( "mbtree propagate :" );
    }

    if( mc_a.memcpy_aligned != mc_ref.memcpy_aligned )
    {
        set_func_name( "memcpy_aligned" );
        ok = 1; used_asm = 1;
        for( int size = 16; size < 256; size += 16 )
        {
            memset( buf4, 0xAA, size + 1 );
            call_c( mc_c.memcpy_aligned, buf3, buf1, size );
            call_a( mc_a.memcpy_aligned, buf4, buf1, size );
            if( memcmp( buf3, buf4, size ) || buf4[size] != 0xAA )
            {
                ok = 0;
                fprintf( stderr, "memcpy_aligned FAILED: size=%d\n", size );
                break;
            }
        }
        report( "memcpy aligned :" );
    }

    if( mc_a.memzero_aligned != mc_ref.memzero_aligned )
    {
        set_func_name( "memzero_aligned" );
        ok = 1; used_asm = 1;
        for( int size = 128; size < 1024; size += 128 )
        {
            memset( buf4, 0xAA, size + 1 );
            call_c( mc_c.memzero_aligned, buf3, size );
            call_a( mc_a.memzero_aligned, buf4, size );
            if( memcmp( buf3, buf4, size ) || buf4[size] != 0xAA )
            {
                ok = 0;
                fprintf( stderr, "memzero_aligned FAILED: size=%d\n", size );
                break;
            }
        }
        report( "memzero aligned :" );
    }

    return ret;
}

static int check_deblock( int cpu_ref, int cpu_new )
{
    x264_deblock_function_t db_c;
    x264_deblock_function_t db_ref;
    x264_deblock_function_t db_a;
    int ret = 0, ok = 1, used_asm = 0;
    int alphas[36], betas[36];
    int8_t tcs[36][4];

    x264_deblock_init( 0, &db_c, 0 );
    x264_deblock_init( cpu_ref, &db_ref, 0 );
    x264_deblock_init( cpu_new, &db_a, 0 );

    /* not exactly the real values of a,b,tc but close enough */
    for( int i = 35, a = 255, c = 250; i >= 0; i-- )
    {
        alphas[i] = a << (BIT_DEPTH-8);
        betas[i] = (i+1)/2 << (BIT_DEPTH-8);
        tcs[i][0] = tcs[i][3] = (c+6)/10 << (BIT_DEPTH-8);
        tcs[i][1] = (c+7)/15 << (BIT_DEPTH-8);
        tcs[i][2] = (c+9)/20 << (BIT_DEPTH-8);
        a = a*9/10;
        c = c*9/10;
    }

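/* TEST_DEBLOCK: run each loop filter over 36 (alpha,beta,tc) strength levels
 * and two pixel distributions (low-range vs full-range, alternating on i),
 * comparing the whole 1024-pixel buffer so out-of-bounds writes are caught. */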
#define TEST_DEBLOCK( name, align, ... ) \
    for( int i = 0; i < 36; i++ ) \
    { \
        int off = 8*32 + (i&15)*4*!align; /* benchmark various alignments of h filter */ \
        for( int j = 0; j < 1024; j++ ) \
            /* two distributions of random data to exercise different failure modes */ \
            pbuf3[j] = rand() & (i&1 ? 0xf : PIXEL_MAX ); \
        memcpy( pbuf4, pbuf3, 1024 * sizeof(pixel) ); \
        if( db_a.name != db_ref.name ) \
        { \
            set_func_name( #name ); \
            used_asm = 1; \
            call_c1( db_c.name, pbuf3+off, 32, alphas[i], betas[i], ##__VA_ARGS__ ); \
            call_a1( db_a.name, pbuf4+off, 32, alphas[i], betas[i], ##__VA_ARGS__ ); \
            if( memcmp( pbuf3, pbuf4, 1024 * sizeof(pixel) ) ) \
            { \
                ok = 0; \
                fprintf( stderr, #name "(a=%d, b=%d): [FAILED]\n", alphas[i], betas[i] ); \
                break; \
            } \
            call_c2( db_c.name, pbuf3+off, 32, alphas[i], betas[i], ##__VA_ARGS__ ); \
            call_a2( db_a.name, pbuf4+off, 32, alphas[i], betas[i], ##__VA_ARGS__ ); \
        } \
    }

    TEST_DEBLOCK( deblock_luma[0], 0, tcs[i] );
    TEST_DEBLOCK( deblock_luma[1], 1, tcs[i] );
    TEST_DEBLOCK( deblock_h_chroma_420, 0, tcs[i] );
    TEST_DEBLOCK( deblock_h_chroma_422, 0, tcs[i] );
    TEST_DEBLOCK( deblock_chroma[1], 1, tcs[i] );
    TEST_DEBLOCK( deblock_luma_intra[0], 0 );
    TEST_DEBLOCK( deblock_luma_intra[1], 1 );
    TEST_DEBLOCK( deblock_h_chroma_420_intra, 0 );
    TEST_DEBLOCK( deblock_h_chroma_422_intra, 0 );
    TEST_DEBLOCK( deblock_chroma_intra[1], 1 );

    if( db_a.deblock_strength != db_ref.deblock_strength )
    {
        for( int i = 0; i < 100; i++ )
        {
            ALIGNED_ARRAY_16( uint8_t, nnz, [X264_SCAN8_SIZE] );
            ALIGNED_4( int8_t ref[2][X264_SCAN8_LUMA_SIZE] );
            ALIGNED_ARRAY_16( int16_t, mv, [2],[X264_SCAN8_LUMA_SIZE][2] );
            ALIGNED_ARRAY_16( uint8_t, bs, [2],[2][8][4] );
            memset( bs, 99, sizeof(bs) );
            for( int j = 0; j < X264_SCAN8_SIZE; j++ )
                nnz[j] = ((rand()&7) == 7) * rand() & 0xf;
            for( int j = 0; j < 2; j++ )
                for( int k = 0; k < X264_SCAN8_LUMA_SIZE; k++ )
                {
                    ref[j][k] = ((rand()&3) != 3) ? 0 : (rand() & 31) - 2;
                    for( int l = 0; l < 2; l++ )
                        mv[j][k][l] = ((rand()&7) != 7) ? (rand()&7) - 3 : (rand()&1023) - 512;
                }
            set_func_name( "deblock_strength" );
            call_c( db_c.deblock_strength, nnz, ref, mv, bs[0], 2<<(i&1), ((i>>1)&1) );
            call_a( db_a.deblock_strength, nnz, ref, mv, bs[1], 2<<(i&1), ((i>>1)&1) );
            if( memcmp( bs[0], bs[1], sizeof(bs[0]) ) )
            {
                ok = 0;
                fprintf( stderr, "deblock_strength: [FAILED]\n" );
                for( int j = 0; j < 2; j++ )
                {
                    for( int k = 0; k < 2; k++ )
                        for( int l = 0; l < 4; l++ )
                        {
                            for( int m = 0; m < 4; m++ )
                                printf("%d ",bs[j][k][l][m]);
                            printf("\n");
                        }
                    printf("\n");
                }
                break;
            }
        }
    }

    report( "deblock :" );

    return ret;
}

static int check_quant( int cpu_ref, int cpu_new )
{
    x264_quant_function_t qf_c;
    x264_quant_function_t qf_ref;
    x264_quant_function_t qf_a;
    ALIGNED_16( dctcoef dct1[64] );
    ALIGNED_16( dctcoef dct2[64] );
    ALIGNED_16( dctcoef dct3[8][16] );
    ALIGNED_16( dctcoef dct4[8][16] );
    ALIGNED_16( uint8_t cqm_buf[64] );
    int ret = 0, ok, used_asm;
    int oks[3] = {1,1,1}, used_asms[3] = {0,0,0};
    x264_t h_buf;
    x264_t *h = &h_buf;
    memset( h, 0, sizeof(*h) );
    h->sps->i_chroma_format_idc = 1;
    x264_param_default( &h->param );
    h->chroma_qp_table = i_chroma_qp_table + 12;
    h->param.analyse.b_transform_8x8 = 1;

    for( int i_cqm = 0; i_cqm < 4; i_cqm++ )
    {
        if( i_cqm == 0 )
        {
            for( int i = 0; i < 6; i++ )
                h->pps->scaling_list[i] = x264_cqm_flat16;
            h->param.i_cqm_preset = h->pps->i_cqm_preset = X264_CQM_FLAT;
        }
        else if( i_cqm == 1 )
        {
            for( int i = 0; i < 6; i++ )
                h->pps->scaling_list[i] = x264_cqm_jvt[i];
            h->param.i_cqm_preset = h->pps->i_cqm_preset = X264_CQM_JVT;
        }
        else
        {
            int max_scale = BIT_DEPTH < 10 ? 255 : 228;
            if( i_cqm == 2 )
                for( int i = 0; i < 64; i++ )
                    cqm_buf[i] = 10 + rand() % (max_scale - 9);
            else
                for( int i = 0; i < 64; i++ )
                    cqm_buf[i] = 1;
            for( int i = 0; i < 6; i++ )
                h->pps->scaling_list[i] = cqm_buf;
            h->param.i_cqm_preset = h->pps->i_cqm_preset = X264_CQM_CUSTOM;
        }

        h->param.rc.i_qp_min = 0;
        h->param.rc.i_qp_max = QP_MAX;
        x264_cqm_init( h );
        x264_quant_init( h, 0, &qf_c );
        x264_quant_init( h, cpu_ref, &qf_ref );
        x264_quant_init( h, cpu_new, &qf_a );

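/* INIT_QUANT4/8 fill the DCT buffers with coefficients bounded by roughly the
 * largest magnitude each position can take for real pixel input (scale1d
 * approximates the per-axis 1D DCT gain), so the quantizer's saturation
 * behavior gets exercised as well as the common case. */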
#define INIT_QUANT8(j) \
        { \
            static const int scale1d[8] = {32,31,24,31,32,31,24,31}; \
            for( int i = 0; i < 64; i++ ) \
            { \
                unsigned int scale = (255*scale1d[i>>3]*scale1d[i&7])/16; \
                dct1[i] = dct2[i] = j ? (rand()%(2*scale+1))-scale : 0; \
            } \
        }

#define INIT_QUANT4(j) \
        { \
            static const int scale1d[4] = {4,6,4,6}; \
            for( int i = 0; i < 16; i++ ) \
            { \
                unsigned int scale = 255*scale1d[i>>2]*scale1d[i&3]; \
                dct1[i] = dct2[i] = j ? (rand()%(2*scale+1))-scale : 0; \
            } \
        }

#define TEST_QUANT_DC( name, cqm ) \
        if( qf_a.name != qf_ref.name ) \
        { \
            set_func_name( #name ); \
            used_asms[0] = 1; \
            for( int qp = h->param.rc.i_qp_max; qp >= h->param.rc.i_qp_min; qp-- ) \
            { \
                for( int j = 0; j < 2; j++ ) \
                { \
                    int result_c, result_a; \
                    for( int i = 0; i < 16; i++ ) \
                        dct1[i] = dct2[i] = j ? (rand() & 0x1fff) - 0xfff : 0; \
                    result_c = call_c1( qf_c.name, dct1, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
                    result_a = call_a1( qf_a.name, dct2, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
                    if( memcmp( dct1, dct2, 16*sizeof(dctcoef) ) || result_c != result_a ) \
                    { \
                        oks[0] = 0; \
                        fprintf( stderr, #name "(cqm=%d): [FAILED]\n", i_cqm ); \
                        break; \
                    } \
                    call_c2( qf_c.name, dct1, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
                    call_a2( qf_a.name, dct2, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
                } \
            } \
        }

#define TEST_QUANT( qname, block, w ) \
        if( qf_a.qname != qf_ref.qname ) \
        { \
            set_func_name( #qname ); \
            used_asms[0] = 1; \
            for( int qp = h->param.rc.i_qp_max; qp >= h->param.rc.i_qp_min; qp-- ) \
            { \
                for( int j = 0; j < 2; j++ ) \
                { \
                    INIT_QUANT##w(j) \
                    int result_c = call_c1( qf_c.qname, dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
                    int result_a = call_a1( qf_a.qname, dct2, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
                    if( memcmp( dct1, dct2, w*w*sizeof(dctcoef) ) || result_c != result_a ) \
                    { \
                        oks[0] = 0; \
                        fprintf( stderr, #qname "(qp=%d, cqm=%d, block="#block"): [FAILED]\n", qp, i_cqm ); \
                        break; \
                    } \
                    call_c2( qf_c.qname, dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
                    call_a2( qf_a.qname, dct2, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
                } \
            } \
        }

        TEST_QUANT( quant_8x8, CQM_8IY, 8 );
        TEST_QUANT( quant_8x8, CQM_8PY, 8 );
        TEST_QUANT( quant_4x4, CQM_4IY, 4 );
        TEST_QUANT( quant_4x4, CQM_4PY, 4 );
        TEST_QUANT_DC( quant_4x4_dc, **h->quant4_mf[CQM_4IY] );
        TEST_QUANT_DC( quant_2x2_dc, **h->quant4_mf[CQM_4IC] );

#define TEST_DEQUANT( qname, dqname, block, w ) \
        if( qf_a.dqname != qf_ref.dqname ) \
        { \
            set_func_name( "%s_%s", #dqname, i_cqm?"cqm":"flat" ); \
            used_asms[1] = 1; \
            for( int qp = h->param.rc.i_qp_max; qp >= h->param.rc.i_qp_min; qp-- ) \
            { \
                INIT_QUANT##w(1) \
                qf_c.qname( dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
                memcpy( dct2, dct1, w*w*sizeof(dctcoef) ); \
                call_c1( qf_c.dqname, dct1, h->dequant##w##_mf[block], qp ); \
                call_a1( qf_a.dqname, dct2, h->dequant##w##_mf[block], qp ); \
                if( memcmp( dct1, dct2, w*w*sizeof(dctcoef) ) ) \
                { \
                    oks[1] = 0; \
                    fprintf( stderr, #dqname "(qp=%d, cqm=%d, block="#block"): [FAILED]\n", qp, i_cqm ); \
                    break; \
                } \
                call_c2( qf_c.dqname, dct1, h->dequant##w##_mf[block], qp ); \
                call_a2( qf_a.dqname, dct2, h->dequant##w##_mf[block], qp ); \
            } \
        }

        TEST_DEQUANT( quant_8x8, dequant_8x8, CQM_8IY, 8 );
        TEST_DEQUANT( quant_8x8, dequant_8x8, CQM_8PY, 8 );
        TEST_DEQUANT( quant_4x4, dequant_4x4, CQM_4IY, 4 );
        TEST_DEQUANT( quant_4x4, dequant_4x4, CQM_4PY, 4 );

#define TEST_DEQUANT_DC( qname, dqname, block, w ) \
        if( qf_a.dqname != qf_ref.dqname ) \
        { \
            set_func_name( "%s_%s", #dqname, i_cqm?"cqm":"flat" ); \
            used_asms[1] = 1; \
            for( int qp = h->param.rc.i_qp_max; qp >= h->param.rc.i_qp_min; qp-- ) \
            { \
                for( int i = 0; i < 16; i++ ) \
                    dct1[i] = rand()%(PIXEL_MAX*16*2+1) - PIXEL_MAX*16; \
                qf_c.qname( dct1, h->quant##w##_mf[block][qp][0]>>1, h->quant##w##_bias[block][qp][0]>>1 ); \
                memcpy( dct2, dct1, w*w*sizeof(dctcoef) ); \
                call_c1( qf_c.dqname, dct1, h->dequant##w##_mf[block], qp ); \
                call_a1( qf_a.dqname, dct2, h->dequant##w##_mf[block], qp ); \
                if( memcmp( dct1, dct2, w*w*sizeof(dctcoef) ) ) \
                { \
                    oks[1] = 0; \
                    fprintf( stderr, #dqname "(qp=%d, cqm=%d, block="#block"): [FAILED]\n", qp, i_cqm ); \
                } \
                call_c2( qf_c.dqname, dct1, h->dequant##w##_mf[block], qp ); \
                call_a2( qf_a.dqname, dct2, h->dequant##w##_mf[block], qp ); \
            } \
        }

        TEST_DEQUANT_DC( quant_4x4_dc, dequant_4x4_dc, CQM_4IY, 4 );

        if( qf_a.idct_dequant_2x4_dc != qf_ref.idct_dequant_2x4_dc )
        {
            set_func_name( "idct_dequant_2x4_dc_%s", i_cqm?"cqm":"flat" );
            used_asms[1] = 1;
            for( int qp = h->param.rc.i_qp_max; qp >= h->param.rc.i_qp_min; qp-- )
            {
                for( int i = 0; i < 8; i++ )
                    dct1[i] = rand()%(PIXEL_MAX*16*2+1) - PIXEL_MAX*16;
                qf_c.quant_2x2_dc( &dct1[0], h->quant4_mf[CQM_4IC][qp+3][0]>>1, h->quant4_bias[CQM_4IC][qp+3][0]>>1 );
                qf_c.quant_2x2_dc( &dct1[4], h->quant4_mf[CQM_4IC][qp+3][0]>>1, h->quant4_bias[CQM_4IC][qp+3][0]>>1 );
                call_c( qf_c.idct_dequant_2x4_dc, dct1, dct3, h->dequant4_mf[CQM_4IC], qp+3 );
                call_a( qf_a.idct_dequant_2x4_dc, dct1, dct4, h->dequant4_mf[CQM_4IC], qp+3 );
                for( int i = 0; i < 8; i++ )
                    if( dct3[i][0] != dct4[i][0] )
                    {
                        oks[1] = 0;
                        fprintf( stderr, "idct_dequant_2x4_dc (qp=%d, cqm=%d): [FAILED]\n", qp, i_cqm );
                        break;
                    }
            }
        }

        if( qf_a.idct_dequant_2x4_dconly != qf_ref.idct_dequant_2x4_dconly )
        {
            set_func_name( "idct_dequant_2x4_dc_%s", i_cqm?"cqm":"flat" );
            used_asms[1] = 1;
            for( int qp = h->param.rc.i_qp_max; qp >= h->param.rc.i_qp_min; qp-- )
            {
                for( int i = 0; i < 8; i++ )
                    dct1[i] = rand()%(PIXEL_MAX*16*2+1) - PIXEL_MAX*16;
                qf_c.quant_2x2_dc( &dct1[0], h->quant4_mf[CQM_4IC][qp+3][0]>>1, h->quant4_bias[CQM_4IC][qp+3][0]>>1 );
                qf_c.quant_2x2_dc( &dct1[4], h->quant4_mf[CQM_4IC][qp+3][0]>>1, h->quant4_bias[CQM_4IC][qp+3][0]>>1 );
                memcpy( dct2, dct1, 8*sizeof(dctcoef) );
                call_c1( qf_c.idct_dequant_2x4_dconly, dct1, h->dequant4_mf[CQM_4IC], qp+3 );
                call_a1( qf_a.idct_dequant_2x4_dconly, dct2, h->dequant4_mf[CQM_4IC], qp+3 );
                if( memcmp( dct1, dct2, 8*sizeof(dctcoef) ) )
                {
                    oks[1] = 0;
                    fprintf( stderr, "idct_dequant_2x4_dconly (qp=%d, cqm=%d): [FAILED]\n", qp, i_cqm );
                    break;
                }
                call_c2( qf_c.idct_dequant_2x4_dconly, dct1, h->dequant4_mf[CQM_4IC], qp+3 );
                call_a2( qf_a.idct_dequant_2x4_dconly, dct2, h->dequant4_mf[CQM_4IC], qp+3 );
            }
        }

#define TEST_OPTIMIZE_CHROMA_DC( optname, size ) \
        if( qf_a.optname != qf_ref.optname ) \
        { \
            set_func_name( #optname ); \
            used_asms[2] = 1; \
            for( int qp = h->param.rc.i_qp_max; qp >= h->param.rc.i_qp_min; qp-- ) \
            { \
                int qpdc = qp + (size == 8 ? 3 : 0); \
                int dmf = h->dequant4_mf[CQM_4IC][qpdc%6][0] << qpdc/6; \
                if( dmf > 32*64 ) \
                    continue; \
                for( int i = 16; ; i <<= 1 ) \
                { \
                    int res_c, res_asm; \
                    int max = X264_MIN( i, PIXEL_MAX*16 ); \
                    for( int j = 0; j < size; j++ ) \
                        dct1[j] = rand()%(max*2+1) - max; \
                    for( int j = 0; j < size; j += 4 ) \
                        qf_c.quant_2x2_dc( &dct1[j], h->quant4_mf[CQM_4IC][qpdc][0]>>1, h->quant4_bias[CQM_4IC][qpdc][0]>>1 ); \
                    memcpy( dct2, dct1, size*sizeof(dctcoef) ); \
                    res_c   = call_c1( qf_c.optname, dct1, dmf ); \
                    res_asm = call_a1( qf_a.optname, dct2, dmf ); \
                    if( res_c != res_asm || memcmp( dct1, dct2, size*sizeof(dctcoef) ) ) \
                    { \
                        oks[2] = 0; \
                        fprintf( stderr, #optname "(qp=%d, res_c=%d, res_asm=%d): [FAILED]\n", qp, res_c, res_asm ); \
                    } \
                    call_c2( qf_c.optname, dct1, dmf ); \
                    call_a2( qf_a.optname, dct2, dmf ); \
                    if( i >= PIXEL_MAX*16 ) \
                        break; \
                } \
            } \
        }

        TEST_OPTIMIZE_CHROMA_DC( optimize_chroma_2x2_dc, 4 );
        TEST_OPTIMIZE_CHROMA_DC( optimize_chroma_2x4_dc, 8 );

        x264_cqm_delete( h );
    }

    ok = oks[0]; used_asm = used_asms[0];
    report( "quant :" );

    ok = oks[1]; used_asm = used_asms[1];
    report( "dequant :" );

    ok = oks[2]; used_asm = used_asms[2];
    report( "optimize chroma dc :" );

    ok = 1; used_asm = 0;
    if( qf_a.denoise_dct != qf_ref.denoise_dct )
    {
        used_asm = 1;
        for( int size = 16; size <= 64; size += 48 )
        {
            set_func_name( "denoise_dct" );
            memcpy( dct1, buf1, size*sizeof(dctcoef) );
            memcpy( dct2, buf1, size*sizeof(dctcoef) );
            memcpy( buf3+256, buf3, 256 );
            call_c1( qf_c.denoise_dct, dct1, (uint32_t*)buf3, (udctcoef*)buf2, size );
            call_a1( qf_a.denoise_dct, dct2, (uint32_t*)(buf3+256), (udctcoef*)buf2, size );
            if( memcmp( dct1, dct2, size*sizeof(dctcoef) ) || memcmp( buf3+4, buf3+256+4, (size-1)*sizeof(uint32_t) ) )
                ok = 0;
            call_c2( qf_c.denoise_dct, dct1, (uint32_t*)buf3, (udctcoef*)buf2, size );
            call_a2( qf_a.denoise_dct, dct2, (uint32_t*)(buf3+256), (udctcoef*)buf2, size );
        }
    }
    report( "denoise dct :" );

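/* TEST_DECIMATE: decimate scores only matter up to the decision threshold, so
 * both results are clamped with X264_MIN() before comparison -- an
 * implementation may legitimately early-out once the threshold is exceeded. */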
#define TEST_DECIMATE( decname, w, ac, thresh ) \
    if( qf_a.decname != qf_ref.decname ) \
    { \
        set_func_name( #decname ); \
        used_asm = 1; \
        for( int i = 0; i < 100; i++ ) \
        { \
            static const int distrib[16] = {1,1,1,1,1,1,1,1,1,1,1,1,2,3,4};\
            static const int zerorate_lut[4] = {3,7,15,31};\
            int zero_rate = zerorate_lut[i&3];\
            for( int idx = 0; idx < w*w; idx++ ) \
            { \
                int sign = (rand()&1) ? -1 : 1; \
                int abs_level = distrib[rand()&15]; \
                if( abs_level == 4 ) abs_level = rand()&0x3fff; \
                int zero = !(rand()&zero_rate); \
                dct1[idx] = zero * abs_level * sign; \
            } \
            if( ac ) \
                dct1[0] = 0; \
            int result_c = call_c( qf_c.decname, dct1 ); \
            int result_a = call_a( qf_a.decname, dct1 ); \
            if( X264_MIN(result_c,thresh) != X264_MIN(result_a,thresh) ) \
            { \
                ok = 0; \
                fprintf( stderr, #decname ": [FAILED]\n" ); \
                break; \
            } \
        } \
    }

    ok = 1; used_asm = 0;
    TEST_DECIMATE( decimate_score64, 8, 0, 6 );
    TEST_DECIMATE( decimate_score16, 4, 0, 6 );
    TEST_DECIMATE( decimate_score15, 4, 1, 7 );
    report( "decimate_score :" );

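/* TEST_LAST: locate the last nonzero coefficient.  `ac` skips the DC
 * position, matching the 15-coefficient luma AC scan, and at least one
 * nonzero coefficient is always planted so the result is well defined. */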
#define TEST_LAST( last, lastname, size, ac ) \
    if( qf_a.last != qf_ref.last ) \
    { \
        set_func_name( #lastname ); \
        used_asm = 1; \
        for( int i = 0; i < 100; i++ ) \
        { \
            int nnz = 0; \
            int max = rand() & (size-1); \
            memset( dct1, 0, size*sizeof(dctcoef) ); \
            for( int idx = ac; idx < max; idx++ ) \
                nnz |= dct1[idx] = !(rand()&3) + (!(rand()&15))*rand(); \
            if( !nnz ) \
                dct1[ac] = 1; \
            int result_c = call_c( qf_c.last, dct1+ac ); \
            int result_a = call_a( qf_a.last, dct1+ac ); \
            if( result_c != result_a ) \
            { \
                ok = 0; \
                fprintf( stderr, #lastname ": [FAILED]\n" ); \
                break; \
            } \
        } \
    }

    ok = 1; used_asm = 0;
    TEST_LAST( coeff_last4              , coeff_last4,   4, 0 );
    TEST_LAST( coeff_last8              , coeff_last8,   8, 0 );
    TEST_LAST( coeff_last[  DCT_LUMA_AC], coeff_last15, 16, 1 );
    TEST_LAST( coeff_last[ DCT_LUMA_4x4], coeff_last16, 16, 0 );
    TEST_LAST( coeff_last[ DCT_LUMA_8x8], coeff_last64, 64, 0 );
    report( "coeff_last :" );

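/* TEST_LEVELRUN: like TEST_LAST, but also checks the run/level arrays used by
 * CAVLC.  Both runlevel structs are seeded with identical garbage so fields
 * that neither implementation writes cannot produce false mismatches. */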
#define TEST_LEVELRUN( lastname, name, size, ac ) \
    if( qf_a.lastname != qf_ref.lastname ) \
    { \
        set_func_name( #name ); \
        used_asm = 1; \
        for( int i = 0; i < 100; i++ ) \
        { \
            x264_run_level_t runlevel_c, runlevel_a; \
            int nnz = 0; \
            int max = rand() & (size-1); \
            memset( dct1, 0, size*sizeof(dctcoef) ); \
            memcpy( &runlevel_a, buf1+i, sizeof(x264_run_level_t) ); \
            memcpy( &runlevel_c, buf1+i, sizeof(x264_run_level_t) ); \
            for( int idx = ac; idx < max; idx++ ) \
                nnz |= dct1[idx] = !(rand()&3) + (!(rand()&15))*rand(); \
            if( !nnz ) \
                dct1[ac] = 1; \
            int result_c = call_c( qf_c.lastname, dct1+ac, &runlevel_c ); \
            int result_a = call_a( qf_a.lastname, dct1+ac, &runlevel_a ); \
            if( result_c != result_a || runlevel_c.last != runlevel_a.last || \
                memcmp(runlevel_c.level, runlevel_a.level, sizeof(dctcoef)*result_c) || \
                memcmp(runlevel_c.run, runlevel_a.run, sizeof(uint8_t)*(result_c-1)) ) \
            { \
                ok = 0; \
                fprintf( stderr, #name ": [FAILED]\n" ); \
                break; \
            } \
        } \
    }

    ok = 1; used_asm = 0;
    TEST_LEVELRUN( coeff_level_run4              , coeff_level_run4,   4, 0 );
    TEST_LEVELRUN( coeff_level_run8              , coeff_level_run8,   8, 0 );
    TEST_LEVELRUN( coeff_level_run[  DCT_LUMA_AC], coeff_level_run15, 16, 1 );
    TEST_LEVELRUN( coeff_level_run[ DCT_LUMA_4x4], coeff_level_run16, 16, 0 );
    report( "coeff_level_run :" );

    return ret;
}

static int check_intra( int cpu_ref, int cpu_new )
{
    int ret = 0, ok = 1, used_asm = 0;
    ALIGNED_ARRAY_32( pixel, edge,[36] );
    ALIGNED_ARRAY_32( pixel, edge2,[36] );
    ALIGNED_16( pixel fdec[FDEC_STRIDE*20] );
    struct
    {
        x264_predict_t      predict_16x16[4+3];
        x264_predict_t      predict_8x8c[4+3];
        x264_predict_t      predict_8x16c[4+3];
        x264_predict8x8_t   predict_8x8[9+3];
        x264_predict_t      predict_4x4[9+3];
        x264_predict_8x8_filter_t predict_8x8_filter;
    } ip_c, ip_ref, ip_a;

    x264_predict_16x16_init( 0, ip_c.predict_16x16 );
    x264_predict_8x8c_init( 0, ip_c.predict_8x8c );
    x264_predict_8x16c_init( 0, ip_c.predict_8x16c );
    x264_predict_8x8_init( 0, ip_c.predict_8x8, &ip_c.predict_8x8_filter );
    x264_predict_4x4_init( 0, ip_c.predict_4x4 );

    x264_predict_16x16_init( cpu_ref, ip_ref.predict_16x16 );
    x264_predict_8x8c_init( cpu_ref, ip_ref.predict_8x8c );
    x264_predict_8x16c_init( cpu_ref, ip_ref.predict_8x16c );
    x264_predict_8x8_init( cpu_ref, ip_ref.predict_8x8, &ip_ref.predict_8x8_filter );
    x264_predict_4x4_init( cpu_ref, ip_ref.predict_4x4 );

    x264_predict_16x16_init( cpu_new, ip_a.predict_16x16 );
    x264_predict_8x8c_init( cpu_new, ip_a.predict_8x8c );
    x264_predict_8x16c_init( cpu_new, ip_a.predict_8x16c );
    x264_predict_8x8_init( cpu_new, ip_a.predict_8x8, &ip_a.predict_8x8_filter );
    x264_predict_4x4_init( cpu_new, ip_a.predict_4x4 );

    memcpy( fdec, pbuf1, FDEC_STRIDE*20 * sizeof(pixel) );

    ip_c.predict_8x8_filter( fdec+48, edge, ALL_NEIGHBORS, ALL_NEIGHBORS );

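/* INTRA_TEST: run one prediction mode into identical copies of fdec and
 * compare the entire FDEC_STRIDE*20 window, so stray writes outside the
 * predicted block are caught too; on failure, dump the edge pixels and both
 * outputs. */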
#define INTRA_TEST( name, dir, w, h, align, bench, ... )\
    if( ip_a.name[dir] != ip_ref.name[dir] )\
    {\
        set_func_name( "intra_%s_%s", #name, intra_##name##_names[dir] );\
        used_asm = 1;\
        memcpy( pbuf3, fdec, FDEC_STRIDE*20 * sizeof(pixel) );\
        memcpy( pbuf4, fdec, FDEC_STRIDE*20 * sizeof(pixel) );\
        for( int a = 0; a < (do_bench ? 64/sizeof(pixel) : 1); a += align )\
        {\
            call_c##bench( ip_c.name[dir], pbuf3+48+a, ##__VA_ARGS__ );\
            call_a##bench( ip_a.name[dir], pbuf4+48+a, ##__VA_ARGS__ );\
            if( memcmp( pbuf3, pbuf4, FDEC_STRIDE*20 * sizeof(pixel) ) )\
            {\
                fprintf( stderr, #name "[%d] :  [FAILED]\n", dir );\
                ok = 0;\
                for( int k = -1; k < 16; k++ )\
                    printf( "%2x ", edge[16+k] );\
                printf( "\n" );\
                for( int j = 0; j < h; j++ )\
                {\
                    printf( "%2x ", edge[14-j] );\
                    for( int k = 0; k < w; k++ )\
                        printf( "%2x ", pbuf4[48+k+j*FDEC_STRIDE] );\
                    printf( "\n" );\
                }\
                printf( "\n" );\
                for( int j = 0; j < h; j++ )\
                {\
                    printf( "   " );\
                    for( int k = 0; k < w; k++ )\
                        printf( "%2x ", pbuf3[48+k+j*FDEC_STRIDE] );\
                    printf( "\n" );\
                }\
                break;\
            }\
        }\
    }

    for( int i = 0; i < 12; i++ )
        INTRA_TEST(   predict_4x4, i,  4,  4,  4, );
    for( int i = 0; i < 7; i++ )
        INTRA_TEST(  predict_8x8c, i,  8,  8, 16, );
    for( int i = 0; i < 7; i++ )
        INTRA_TEST( predict_8x16c, i,  8, 16, 16, );
    for( int i = 0; i < 7; i++ )
        INTRA_TEST( predict_16x16, i, 16, 16, 16, );
    for( int i = 0; i < 12; i++ )
        INTRA_TEST(   predict_8x8, i,  8,  8,  8, , edge );

    set_func_name("intra_predict_8x8_filter");
    if( ip_a.predict_8x8_filter != ip_ref.predict_8x8_filter )
    {
        used_asm = 1;
        for( int i = 0; i < 32; i++ )
        {
            if( !(i&7) || ((i&MB_TOPRIGHT) && !(i&MB_TOP)) )
                continue;
            int neighbor = (i&24)>>1;
            memset( edge,  0, 36*sizeof(pixel) );
            memset( edge2, 0, 36*sizeof(pixel) );
            call_c( ip_c.predict_8x8_filter, pbuf1+48, edge,  neighbor, i&7 );
            call_a( ip_a.predict_8x8_filter, pbuf1+48, edge2, neighbor, i&7 );
            if( !(neighbor&MB_TOPLEFT) )
                edge[15] = edge2[15] = 0;
            if( memcmp( edge+7, edge2+7, (i&MB_TOPRIGHT ? 26 : i&MB_TOP ? 17 : 8) * sizeof(pixel) ) )
            {
                fprintf( stderr, "predict_8x8_filter :  [FAILED] %d %d\n", (i&24)>>1, i&7);
                ok = 0;
            }
        }
    }

#define EXTREMAL_PLANE( w, h ) \
    { \
        int max[7]; \
        for( int j = 0; j < 7; j++ ) \
            max[j] = test ? rand()&PIXEL_MAX : PIXEL_MAX; \
        fdec[48-1-FDEC_STRIDE] = (i&1)*max[0]; \
        for( int j = 0; j < w/2; j++ ) \
            fdec[48+j-FDEC_STRIDE] = (!!(i&2))*max[1]; \
        for( int j = w/2; j < w-1; j++ ) \
            fdec[48+j-FDEC_STRIDE] = (!!(i&4))*max[2]; \
        fdec[48+(w-1)-FDEC_STRIDE] = (!!(i&8))*max[3]; \
        for( int j = 0; j < h/2; j++ ) \
            fdec[48+j*FDEC_STRIDE-1] = (!!(i&16))*max[4]; \
        for( int j = h/2; j < h-1; j++ ) \
            fdec[48+j*FDEC_STRIDE-1] = (!!(i&32))*max[5]; \
        fdec[48+(h-1)*FDEC_STRIDE-1] = (!!(i&64))*max[6]; \
    }
    /* Extremal test case for planar prediction. */
    for( int test = 0; test < 100 && ok; test++ )
        for( int i = 0; i < 128 && ok; i++ )
        {
            EXTREMAL_PLANE(  8,  8 );
            INTRA_TEST(  predict_8x8c, I_PRED_CHROMA_P,  8,  8, 64, 1 );
            EXTREMAL_PLANE(  8, 16 );
            INTRA_TEST( predict_8x16c, I_PRED_CHROMA_P,  8, 16, 64, 1 );
            EXTREMAL_PLANE( 16, 16 );
            INTRA_TEST( predict_16x16,  I_PRED_16x16_P, 16, 16, 64, 1 );
        }
    report( "intra pred :" );
    return ret;
}

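/* The CABAC encode primitives are inlined rather than dispatched through a
 * cpu function table, so comparable C/asm driver loops are generated here;
 * each encodes 0x1000 symbols, driven by buf1 where input bits are needed. */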
#define DECL_CABAC(cpu) \
static void run_cabac_decision_##cpu( x264_t *h, uint8_t *dst )\
{\
    x264_cabac_t cb;\
    x264_cabac_context_init( h, &cb, SLICE_TYPE_P, 26, 0 );\
    x264_cabac_encode_init( &cb, dst, dst+0xff0 );\
    for( int i = 0; i < 0x1000; i++ )\
        x264_cabac_encode_decision_##cpu( &cb, buf1[i]>>1, buf1[i]&1 );\
}\
static void run_cabac_bypass_##cpu( x264_t *h, uint8_t *dst )\
{\
    x264_cabac_t cb;\
    x264_cabac_context_init( h, &cb, SLICE_TYPE_P, 26, 0 );\
    x264_cabac_encode_init( &cb, dst, dst+0xff0 );\
    for( int i = 0; i < 0x1000; i++ )\
        x264_cabac_encode_bypass_##cpu( &cb, buf1[i]&1 );\
}\
static void run_cabac_terminal_##cpu( x264_t *h, uint8_t *dst )\
{\
    x264_cabac_t cb;\
    x264_cabac_context_init( h, &cb, SLICE_TYPE_P, 26, 0 );\
    x264_cabac_encode_init( &cb, dst, dst+0xff0 );\
    for( int i = 0; i < 0x1000; i++ )\
        x264_cabac_encode_terminal_##cpu( &cb );\
}
DECL_CABAC(c)
#if HAVE_MMX
DECL_CABAC(asm)
#else
#define run_cabac_decision_asm run_cabac_decision_c
#define run_cabac_bypass_asm run_cabac_bypass_c
#define run_cabac_terminal_asm run_cabac_terminal_c
#endif

static int check_cabac( int cpu_ref, int cpu_new )
{
    int ret = 0, ok, used_asm = 1;
    x264_t h;
    h.sps->i_chroma_format_idc = 3;
    if( cpu_ref || run_cabac_decision_c == run_cabac_decision_asm )
        return 0;
    x264_cabac_init( &h );

    set_func_name( "cabac_encode_decision" );
    memcpy( buf4, buf3, 0x1000 );
    call_c( run_cabac_decision_c, &h, buf3 );
    call_a( run_cabac_decision_asm, &h, buf4 );
    ok = !memcmp( buf3, buf4, 0x1000 );
    report( "cabac decision:" );

    set_func_name( "cabac_encode_bypass" );
    memcpy( buf4, buf3, 0x1000 );
    call_c( run_cabac_bypass_c, &h, buf3 );
    call_a( run_cabac_bypass_asm, &h, buf4 );
    ok = !memcmp( buf3, buf4, 0x1000 );
    report( "cabac bypass:" );

    set_func_name( "cabac_encode_terminal" );
    memcpy( buf4, buf3, 0x1000 );
    call_c( run_cabac_terminal_c, &h, buf3 );
    call_a( run_cabac_terminal_asm, &h, buf4 );
    ok = !memcmp( buf3, buf4, 0x1000 );
    report( "cabac terminal:" );

    return ret;
}

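/* nal_escape inserts H.264 emulation-prevention bytes (0x03 after two
 * consecutive zero bytes), so the test biases its random input toward runs of
 * zeros to hit the escape path often. */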
static int check_bitstream( int cpu_ref, int cpu_new )
{
    x264_bitstream_function_t bs_c;
    x264_bitstream_function_t bs_ref;
    x264_bitstream_function_t bs_a;

    int ret = 0, ok = 1, used_asm = 0;

    x264_bitstream_init( 0, &bs_c );
    x264_bitstream_init( cpu_ref, &bs_ref );
    x264_bitstream_init( cpu_new, &bs_a );
    if( bs_a.nal_escape != bs_ref.nal_escape )
    {
        int size = 0x4000;
        uint8_t *input = malloc(size+100);
        uint8_t *output1 = malloc(size*2);
        uint8_t *output2 = malloc(size*2);
        used_asm = 1;
        set_func_name( "nal_escape" );
        for( int i = 0; i < 100; i++ )
        {
            /* Test corner-case sizes */
            int test_size = i < 10 ? i+1 : rand() & 0x3fff;
            /* Test 8 different probability distributions of zeros */
            for( int j = 0; j < test_size+32; j++ )
                input[j] = (rand()&((1 << ((i&7)+1)) - 1)) * rand();
            uint8_t *end_c = (uint8_t*)call_c1( bs_c.nal_escape, output1, input, input+test_size );
            uint8_t *end_a = (uint8_t*)call_a1( bs_a.nal_escape, output2, input, input+test_size );
            int size_c = end_c-output1;
            int size_a = end_a-output2;
            if( size_c != size_a || memcmp( output1, output2, size_c ) )
            {
                fprintf( stderr, "nal_escape :  [FAILED] %d %d\n", size_c, size_a );
                ok = 0;
                break;
            }
        }
        for( int j = 0; j < size+32; j++ )
            input[j] = rand();
        call_c2( bs_c.nal_escape, output1, input, input+size );
        call_a2( bs_a.nal_escape, output2, input, input+size );
        free(input);
        free(output1);
        free(output2);
    }
    report( "nal escape:" );

    return ret;
}

static int check_all_funcs( int cpu_ref, int cpu_new )
{
    return check_pixel( cpu_ref, cpu_new )
         + check_dct( cpu_ref, cpu_new )
         + check_mc( cpu_ref, cpu_new )
         + check_intra( cpu_ref, cpu_new )
         + check_deblock( cpu_ref, cpu_new )
         + check_quant( cpu_ref, cpu_new )
         + check_cabac( cpu_ref, cpu_new )
         + check_bitstream( cpu_ref, cpu_new );
}

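/* add_flags() widens the CPU flag set one step at a time and retests against
 * the previous set, so each function is only re-verified (and benchmarked) at
 * the point where its implementation actually changes. */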
static int add_flags( int *cpu_ref, int *cpu_new, int flags, const char *name )
{
    *cpu_ref = *cpu_new;
    *cpu_new |= flags;
#if BROKEN_STACK_ALIGNMENT
    *cpu_new |= X264_CPU_STACK_MOD4;
#endif
    if( *cpu_new & X264_CPU_SSE2_IS_FAST )
        *cpu_new &= ~X264_CPU_SSE2_IS_SLOW;
    if( !quiet )
        fprintf( stderr, "x264: %s\n", name );
    return check_all_funcs( *cpu_ref, *cpu_new );
}

static int check_all_flags( void )
{
    int ret = 0;
    int cpu0 = 0, cpu1 = 0;
#if HAVE_MMX
    if( x264_cpu_detect() & X264_CPU_MMX2 )
    {
        ret |= add_flags( &cpu0, &cpu1, X264_CPU_MMX | X264_CPU_MMX2, "MMX" );
        ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "MMX Cache64" );
        cpu1 &= ~X264_CPU_CACHELINE_64;
#if ARCH_X86
        ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_32, "MMX Cache32" );
        cpu1 &= ~X264_CPU_CACHELINE_32;
#endif
        if( x264_cpu_detect() & X264_CPU_LZCNT )
        {
            ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "MMX_LZCNT" );
            cpu1 &= ~X264_CPU_LZCNT;
        }
        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "MMX SlowCTZ" );
        cpu1 &= ~X264_CPU_SLOW_CTZ;
    }
    if( x264_cpu_detect() & X264_CPU_SSE2 )
    {
        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE | X264_CPU_SSE2 | X264_CPU_SSE2_IS_SLOW, "SSE2Slow" );
        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE2_IS_FAST, "SSE2Fast" );
        ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSE2Fast Cache64" );
        cpu1 &= ~X264_CPU_CACHELINE_64;
        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SHUFFLE_IS_FAST, "SSE2 FastShuffle" );
        cpu1 &= ~X264_CPU_SHUFFLE_IS_FAST;
        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSE2 SlowCTZ" );
        cpu1 &= ~X264_CPU_SLOW_CTZ;
        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_ATOM, "SSE2 SlowAtom" );
        cpu1 &= ~X264_CPU_SLOW_ATOM;
    }
    if( x264_cpu_detect() & X264_CPU_SSE_MISALIGN )
    {
        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE_MISALIGN, "SSE_Misalign" );
        cpu1 &= ~X264_CPU_SSE_MISALIGN;
    }
    if( x264_cpu_detect() & X264_CPU_LZCNT )
    {
        ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "SSE_LZCNT" );
        cpu1 &= ~X264_CPU_LZCNT;
    }
    if( x264_cpu_detect() & X264_CPU_SSE3 )
    {
        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE3 | X264_CPU_CACHELINE_64, "SSE3" );
        cpu1 &= ~X264_CPU_CACHELINE_64;
    }
    if( x264_cpu_detect() & X264_CPU_SSSE3 )
    {
        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSSE3, "SSSE3" );
        ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSSE3 Cache64" );
        cpu1 &= ~X264_CPU_CACHELINE_64;
        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SHUFFLE_IS_FAST, "SSSE3 FastShuffle" );
        cpu1 &= ~X264_CPU_SHUFFLE_IS_FAST;
        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSSE3 SlowCTZ" );
        cpu1 &= ~X264_CPU_SLOW_CTZ;
        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_ATOM, "SSSE3 SlowAtom" );
        cpu1 &= ~X264_CPU_SLOW_ATOM;
    }
    if( x264_cpu_detect() & X264_CPU_SSE4 )
        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE4 | X264_CPU_SHUFFLE_IS_FAST, "SSE4" );
    if( x264_cpu_detect() & X264_CPU_AVX )
        ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX, "AVX" );
    if( x264_cpu_detect() & X264_CPU_XOP )
        ret |= add_flags( &cpu0, &cpu1, X264_CPU_XOP, "XOP" );
    if( x264_cpu_detect() & X264_CPU_FMA4 )
        ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA4, "FMA4" );
#elif ARCH_PPC
    if( x264_cpu_detect() & X264_CPU_ALTIVEC )
    {
        fprintf( stderr, "x264: ALTIVEC against C\n" );
        ret = check_all_funcs( 0, X264_CPU_ALTIVEC );
    }
#elif ARCH_ARM
    if( x264_cpu_detect() & X264_CPU_ARMV6 )
        ret |= add_flags( &cpu0, &cpu1, X264_CPU_ARMV6, "ARMv6" );
    if( x264_cpu_detect() & X264_CPU_NEON )
        ret |= add_flags( &cpu0, &cpu1, X264_CPU_NEON, "NEON" );
    if( x264_cpu_detect() & X264_CPU_FAST_NEON_MRC )
        ret |= add_flags( &cpu0, &cpu1, X264_CPU_FAST_NEON_MRC, "Fast NEON MRC" );
#endif
    return ret;
}

int main(int argc, char *argv[])
{
    int ret = 0;

    if( argc > 1 && !strncmp( argv[1], "--bench", 7 ) )
    {
#if !ARCH_X86 && !ARCH_X86_64 && !ARCH_PPC && !ARCH_ARM
        fprintf( stderr, "no --bench for your cpu until you port rdtsc\n" );
        return 1;
#endif
        do_bench = 1;
        if( argv[1][7] == '=' )
        {
            bench_pattern = argv[1]+8;
            bench_pattern_len = strlen(bench_pattern);
        }
        argc--;
        argv++;
    }

    int seed = ( argc > 1 ) ? atoi(argv[1]) : x264_mdate();
    fprintf( stderr, "x264: using random seed %u\n", seed );
    srand( seed );

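    /* buf1 holds byte-oriented test data and pbuf1 pixel-oriented data
     * (16-bit in high-bit-depth builds); the extra 16*BENCH_ALIGNS bytes let
     * the benchmark loop below re-run everything at each 16-byte offset. */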
    buf1 = x264_malloc( 0x1e00 + 0x2000*sizeof(pixel) + 16*BENCH_ALIGNS );
    pbuf1 = x264_malloc( 0x1e00*sizeof(pixel) + 16*BENCH_ALIGNS );
    if( !buf1 || !pbuf1 )
    {
        fprintf( stderr, "malloc failed, unable to initiate tests!\n" );
        return -1;
    }
#define INIT_POINTER_OFFSETS\
    buf2 = buf1 + 0xf00;\
    buf3 = buf2 + 0xf00;\
    buf4 = buf3 + 0x1000*sizeof(pixel);\
    pbuf2 = pbuf1 + 0xf00;\
    pbuf3 = (pixel*)buf3;\
    pbuf4 = (pixel*)buf4;
    INIT_POINTER_OFFSETS;
    for( int i = 0; i < 0x1e00; i++ )
    {
        buf1[i] = rand() & 0xFF;
        pbuf1[i] = rand() & PIXEL_MAX;
    }
    memset( buf1+0x1e00, 0, 0x2000*sizeof(pixel) );

    /* 16-byte alignment is guaranteed whenever it's useful, but some functions also vary in speed depending on %64 */
    if( do_bench )
        for( int i = 0; i < BENCH_ALIGNS && !ret; i++ )
        {
            INIT_POINTER_OFFSETS;
            ret |= x264_stack_pagealign( check_all_flags, i*16 );
            buf1 += 16;
            pbuf1 += 16;
            quiet = 1;
            fprintf( stderr, "%d/%d\r", i+1, BENCH_ALIGNS );
        }
    else
        ret = check_all_flags();

    if( ret )
    {
        fprintf( stderr, "x264: at least one test has failed. Go and fix that Right Now!\n" );
        return -1;
    }
    fprintf( stderr, "x264: All tests passed Yeah :)\n" );
    if( do_bench )
        print_bench();
    return 0;
}

x264-snapshot-20120103-2245-stable/tools/checkasm-a.asm0000644000175000001440000001070211700673342021527 0ustar  videolanusers;*****************************************************************************
;* checkasm-a.asm: assembly check tool
;*****************************************************************************
;* Copyright (C) 2008-2011 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************

%include "x86inc.asm"

SECTION_RODATA

error_message: db "failed to preserve register", 0

%ifdef WIN64
; just random numbers to reduce the chance of incidental match
ALIGN 16
n4:   dq 0xa77809bf11b239d1
n5:   dq 0x2ba9bf3d2f05b389
x6:  ddq 0x79445c159ce790641a1b2550a612b48c
x7:  ddq 0x86b2536fcd8cf6362eed899d5a28ddcd
x8:  ddq 0x3f2bf84fc0fcca4eb0856806085e7943
x9:  ddq 0xd229e1f5b281303facbd382dcf5b8de2
x10: ddq 0xab63e2e11fa38ed971aeaff20b095fd9
x11: ddq 0x77d410d5c42c882d89b0c0765892729a
x12: ddq 0x24b3c1d2a024048bc45ea11a955d8dd5
x13: ddq 0xdd7b8919edd427862e8ec680de14b47c
x14: ddq 0x11e53e2b2ac655ef135ce6888fa02cbf
x15: ddq 0x6de8f4c914c334d5011ff554472a7a10
%endif

SECTION .text

cextern_naked puts

; max number of args used by any x264 asm function.
; (max_args % 4) must equal 3 for stack alignment
%define max_args 11

%ifdef WIN64

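; Register-preservation check: a pair of callee-saved GPRs plus xmm6-xmm15
; (all callee-saved under the WIN64 ABI) are loaded with the random constants
; above before calling the function under test, then XORed against the same
; constants afterwards.  Any nonzero result means a register was clobbered:
; error_message is printed and *ok is cleared.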
;-----------------------------------------------------------------------------
; intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... )
;-----------------------------------------------------------------------------
INIT_XMM
cglobal checkasm_call, 4,7,16
    sub  rsp, max_args*8
    %assign stack_offset stack_offset+max_args*8
    mov  r6, r0
    mov  [rsp+stack_offset+16], r1
    mov  r0, r2
    mov  r1, r3
    mov r2d, r4m ; FIXME truncates pointer
    mov r3d, r5m ; FIXME truncates pointer
%assign i 4
%rep max_args-4
    mov  r4, [rsp+stack_offset+8+(i+2)*8]
    mov  [rsp+i*8], r4
    %assign i i+1
%endrep
%assign i 6
%rep 16-6
    movdqa xmm %+ i, [x %+ i]
    %assign i i+1
%endrep
    mov  r4, [n4]
    mov  r5, [n5]
    call r6
    xor  r4, [n4]
    xor  r5, [n5]
    or   r4, r5
    pxor xmm5, xmm5
%assign i 6
%rep 16-6
    pxor xmm %+ i, [x %+ i]
    por  xmm5, xmm %+ i
    %assign i i+1
%endrep
    packsswb xmm5, xmm5
    movq r5, xmm5
    or   r4, r5
    jz .ok
    mov  r4, rax
    lea  r0, [error_message]
    call puts
    mov  r1, [rsp+stack_offset+16]
    mov  dword [r1], 0
    mov  rax, r4
.ok:
    add  rsp, max_args*8
    %assign stack_offset stack_offset-max_args*8
    RET

%elifndef ARCH_X86_64

; just random numbers to reduce the chance of incidental match
%define n3 dword 0x6549315c
%define n4 dword 0xe02f3e23
%define n5 dword 0xb78d0d1d
%define n6 dword 0x33627ba7

;-----------------------------------------------------------------------------
; intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... )
;-----------------------------------------------------------------------------
cglobal checkasm_call, 1,7
    mov  r3, n3
    mov  r4, n4
    mov  r5, n5
    mov  r6, n6
%rep max_args
    push dword [esp+24+max_args*4]
%endrep
    call r0
    add  esp, max_args*4
    xor  r3, n3
    xor  r4, n4
    xor  r5, n5
    xor  r6, n6
    or   r3, r4
    or   r5, r6
    or   r3, r5
    jz .ok
    mov  r3, eax
    lea  r1, [error_message]
    push r1
    call puts
    add  esp, 4
    mov  r1, r1m
    mov  dword [r1], 0
    mov  eax, r3
.ok:
    RET

%endif ; ARCH_X86_64

;-----------------------------------------------------------------------------
; int x264_stack_pagealign( int (*func)(), int align )
;-----------------------------------------------------------------------------
cglobal stack_pagealign, 2,2
    push rbp
    mov  rbp, rsp
    and  rsp, ~0xfff
    sub  rsp, r1
    call r0
    leave
    RET

x264-snapshot-20120103-2245-stable/output/0000755000175000001440000000000011700673342017231 5ustar  videolanusersx264-snapshot-20120103-2245-stable/output/raw.c0000644000175000001440000000441611700673342020173 0ustar  videolanusers/*****************************************************************************
 * raw.c: raw muxer
 *****************************************************************************
 * Copyright (C) 2003-2011 x264 project
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "output.h"

static int open_file( char *psz_filename, hnd_t *p_handle, cli_output_opt_t *opt )
{
    if( !strcmp( psz_filename, "-" ) )
        *p_handle = stdout;
    else if( !(*p_handle = fopen( psz_filename, "w+b" )) )
        return -1;

    return 0;
}

static int set_param( hnd_t handle, x264_param_t *p_param )
{
    return 0;
}

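/* The three header NALs (SPS, PPS, SEI) are laid out back-to-back in one
 * buffer as returned by the encoder, so a single fwrite from the first
 * payload covers all of them. */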
static int write_headers( hnd_t handle, x264_nal_t *p_nal )
{
    int size = p_nal[0].i_payload + p_nal[1].i_payload + p_nal[2].i_payload;

    if( fwrite( p_nal[0].p_payload, size, 1, (FILE*)handle ) )
        return size;
    return -1;
}

static int write_frame( hnd_t handle, uint8_t *p_nalu, int i_size, x264_picture_t *p_picture )
{
    if( fwrite( p_nalu, i_size, 1, (FILE*)handle ) )
        return i_size;
    return -1;
}

static int close_file( hnd_t handle, int64_t largest_pts, int64_t second_largest_pts )
{
    if( !handle || handle == stdout )
        return 0;

    return fclose( (FILE*)handle );
}

const cli_output_t raw_output = { open_file, set_param, write_headers, write_frame, close_file };

x264-snapshot-20120103-2245-stable/output/output.h
/*****************************************************************************
 * output.h: x264 file output modules
 *****************************************************************************
 * Copyright (C) 2003-2011 x264 project
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#ifndef X264_OUTPUT_H
#define X264_OUTPUT_H

#include "x264cli.h"

typedef struct
{
    int use_dts_compress;
} cli_output_opt_t;

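/* Muxer entry points, driven by the CLI in order: open_file once, set_param
 * once with the encoding parameters, write_headers once with the SPS/PPS/SEI
 * NALs, write_frame per encoded frame, and close_file with the two largest
 * PTS values so the duration of the final frame can be derived. */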
typedef struct
{
    int (*open_file)( char *psz_filename, hnd_t *p_handle, cli_output_opt_t *opt );
    int (*set_param)( hnd_t handle, x264_param_t *p_param );
    int (*write_headers)( hnd_t handle, x264_nal_t *p_nal );
    int (*write_frame)( hnd_t handle, uint8_t *p_nal, int i_size, x264_picture_t *p_picture );
    int (*close_file)( hnd_t handle, int64_t largest_pts, int64_t second_largest_pts );
} cli_output_t;

extern const cli_output_t raw_output;
extern const cli_output_t mkv_output;
extern const cli_output_t mp4_output;
extern const cli_output_t flv_output;

#endif
x264-snapshot-20120103-2245-stable/output/mp4.c
/*****************************************************************************
 * mp4.c: mp4 muxer
 *****************************************************************************
 * Copyright (C) 2003-2011 x264 project
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "output.h"
#include <gpac/isomedia.h>

#if HAVE_GF_MALLOC
#undef malloc
#undef free
#undef realloc
#define malloc gf_malloc
#define free gf_free
#define realloc gf_realloc
#endif

typedef struct
{
    GF_ISOFile *p_file;
    GF_AVCConfig *p_config;
    GF_ISOSample *p_sample;
    int i_track;
    uint32_t i_descidx;
    uint64_t i_time_res;
    int64_t i_time_inc;
    int64_t i_delay_time;
    int64_t i_init_delta;
    int i_numframe;
    int i_delay_frames;
    int b_dts_compress;
    int i_dts_compress_multiplier;
    int i_data_size;
} mp4_hnd_t;

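/* Walks every sample of the finished track to fill in the ESDS bitrate
 * fields: avgBitrate is total payload bytes over the media duration, and
 * maxBitrate is the peak byte count seen within any one-second window of
 * DTS time. */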
static void recompute_bitrate_mp4( GF_ISOFile *p_file, int i_track )
{
    u32 count, di, timescale, time_wnd, rate;
    u64 offset;
    Double br;
    GF_ESD *esd;

    esd = gf_isom_get_esd( p_file, i_track, 1 );
    if( !esd )
        return;

    esd->decoderConfig->avgBitrate = 0;
    esd->decoderConfig->maxBitrate = 0;
    rate = time_wnd = 0;

    timescale = gf_isom_get_media_timescale( p_file, i_track );
    count = gf_isom_get_sample_count( p_file, i_track );
    for( u32 i = 0; i < count; i++ )
    {
        GF_ISOSample *samp = gf_isom_get_sample_info( p_file, i_track, i+1, &di, &offset );
        if( !samp )
        {
            x264_cli_log( "mp4", X264_LOG_ERROR, "failure reading back frame %u\n", i );
            break;
        }

        if( esd->decoderConfig->bufferSizeDB < samp->dataLength )
            esd->decoderConfig->bufferSizeDB = samp->dataLength;

        esd->decoderConfig->avgBitrate += samp->dataLength;
        rate += samp->dataLength;
        if( samp->DTS > time_wnd + timescale )
        {
            if( rate > esd->decoderConfig->maxBitrate )
                esd->decoderConfig->maxBitrate = rate;
            time_wnd = samp->DTS;
            rate = 0;
        }

        gf_isom_sample_del( &samp );
    }

    br = (Double)(s64)gf_isom_get_media_duration( p_file, i_track );
    br /= timescale;
    esd->decoderConfig->avgBitrate = (u32)(esd->decoderConfig->avgBitrate / br);
    /*move to bps*/
    esd->decoderConfig->avgBitrate *= 8;
    esd->decoderConfig->maxBitrate *= 8;

    gf_isom_change_mpeg4_description( p_file, i_track, 1, esd );
    gf_odf_desc_del( (GF_Descriptor*)esd );
}

static int close_file( hnd_t handle, int64_t largest_pts, int64_t second_largest_pts )
{
    mp4_hnd_t *p_mp4 = handle;

    if( !p_mp4 )
        return 0;

    if( p_mp4->p_config )
        gf_odf_avc_cfg_del( p_mp4->p_config );

    if( p_mp4->p_sample )
    {
        if( p_mp4->p_sample->data )
            free( p_mp4->p_sample->data );

        p_mp4->p_sample->dataLength = 0;
        gf_isom_sample_del( &p_mp4->p_sample );
    }

    if( p_mp4->p_file )
    {
        if( p_mp4->i_track )
        {
            /* The mdhd duration is defined as CTS[final] - CTS[0] + the duration of the last frame.
             * It may legitimately exceed the tkhd duration, since the track's timeline is managed by an edit list (edts).
             * So, if the mdhd duration is less than or equal to the last DTS, use the last composition time delta as the
             * last sample's duration (the end of the last frame is estimated as largest_pts + (largest_pts - second_largest_pts)).
             * The updated mdhd duration still does not reflect the actual playback duration;
             * the tkhd duration is the actual track duration. */
            uint64_t mdhd_duration = (2 * largest_pts - second_largest_pts) * p_mp4->i_time_inc;
            if( mdhd_duration != gf_isom_get_media_duration( p_mp4->p_file, p_mp4->i_track ) )
            {
                uint64_t last_dts = gf_isom_get_sample_dts( p_mp4->p_file, p_mp4->i_track, p_mp4->i_numframe );
                uint32_t last_duration = (uint32_t)( mdhd_duration > last_dts ? mdhd_duration - last_dts : (largest_pts - second_largest_pts) * p_mp4->i_time_inc );
                gf_isom_set_last_sample_duration( p_mp4->p_file, p_mp4->i_track, last_duration );
            }

            /* Write an Edit Box if the first CTS offset is positive.
             * A media_time is given by not the mvhd timescale but rather the mdhd timescale.
             * The reason is that an Edit Box maps the presentation time-line to the media time-line.
             * Any demuxers should follow the Edit Box if it exists. */
            GF_ISOSample *sample = gf_isom_get_sample_info( p_mp4->p_file, p_mp4->i_track, 1, NULL, NULL );
            if( sample && sample->CTS_Offset > 0 )
            {
                uint32_t mvhd_timescale = gf_isom_get_timescale( p_mp4->p_file );
                uint64_t tkhd_duration = (uint64_t)( mdhd_duration * ( (double)mvhd_timescale / p_mp4->i_time_res ) );
                gf_isom_append_edit_segment( p_mp4->p_file, p_mp4->i_track, tkhd_duration, sample->CTS_Offset, GF_ISOM_EDIT_NORMAL );
            }
            gf_isom_sample_del( &sample );

            recompute_bitrate_mp4( p_mp4->p_file, p_mp4->i_track );
        }
        gf_isom_set_pl_indication( p_mp4->p_file, GF_ISOM_PL_VISUAL, 0x15 );
        gf_isom_set_storage_mode( p_mp4->p_file, GF_ISOM_STORE_FLAT );
        gf_isom_close( p_mp4->p_file );
    }

    free( p_mp4 );

    return 0;
}

static int open_file( char *psz_filename, hnd_t *p_handle, cli_output_opt_t *opt )
{
    mp4_hnd_t *p_mp4;

    *p_handle = NULL;
    FILE *fh = fopen( psz_filename, "w" );
    if( !fh )
        return -1;
    FAIL_IF_ERR( !x264_is_regular_file( fh ), "mp4", "MP4 output is incompatible with non-regular file `%s'\n", psz_filename )
    fclose( fh );

    if( !(p_mp4 = malloc( sizeof(mp4_hnd_t) )) )
        return -1;

    memset( p_mp4, 0, sizeof(mp4_hnd_t) );
    p_mp4->p_file = gf_isom_open( psz_filename, GF_ISOM_OPEN_WRITE, NULL );

    p_mp4->b_dts_compress = opt->use_dts_compress;

    if( !(p_mp4->p_sample = gf_isom_sample_new()) )
    {
        close_file( p_mp4, 0, 0 );
        return -1;
    }

    gf_isom_set_brand_info( p_mp4->p_file, GF_ISOM_BRAND_AVC1, 0 );

    *p_handle = p_mp4;

    return 0;
}

static int set_param( hnd_t handle, x264_param_t *p_param )
{
    mp4_hnd_t *p_mp4 = handle;

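    /* With B-frames, the first few DTS values would be nonpositive; i_delay_frames
     * is the DTS lag in frames (2 with B-pyramid, 1 otherwise).  DTS compression
     * scales the timescale by that factor so the leading samples can be given
     * small positive durations instead of shifting the whole track (see write_frame). */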
    p_mp4->i_delay_frames = p_param->i_bframe ? (p_param->i_bframe_pyramid ? 2 : 1) : 0;
    p_mp4->i_dts_compress_multiplier = p_mp4->b_dts_compress * p_mp4->i_delay_frames + 1;

    p_mp4->i_time_res = p_param->i_timebase_den * p_mp4->i_dts_compress_multiplier;
    p_mp4->i_time_inc = p_param->i_timebase_num * p_mp4->i_dts_compress_multiplier;
    FAIL_IF_ERR( p_mp4->i_time_res > UINT32_MAX, "mp4", "MP4 media timescale %"PRIu64" exceeds maximum\n", p_mp4->i_time_res )

    p_mp4->i_track = gf_isom_new_track( p_mp4->p_file, 0, GF_ISOM_MEDIA_VISUAL,
                                        p_mp4->i_time_res );

    p_mp4->p_config = gf_odf_avc_cfg_new();
    gf_isom_avc_config_new( p_mp4->p_file, p_mp4->i_track, p_mp4->p_config,
                            NULL, NULL, &p_mp4->i_descidx );

    gf_isom_set_track_enabled( p_mp4->p_file, p_mp4->i_track, 1 );

    gf_isom_set_visual_info( p_mp4->p_file, p_mp4->i_track, p_mp4->i_descidx,
                             p_param->i_width, p_param->i_height );

    if( p_param->vui.i_sar_width && p_param->vui.i_sar_height )
    {
        uint64_t dw = p_param->i_width << 16;
        uint64_t dh = p_param->i_height << 16;
        double sar = (double)p_param->vui.i_sar_width / p_param->vui.i_sar_height;
        if( sar > 1.0 )
            dw *= sar;
        else
            dh /= sar;
        gf_isom_set_pixel_aspect_ratio( p_mp4->p_file, p_mp4->i_track, p_mp4->i_descidx, p_param->vui.i_sar_width, p_param->vui.i_sar_height );
        gf_isom_set_track_layout_info( p_mp4->p_file, p_mp4->i_track, dw, dh, 0, 0, 0 );
    }

    p_mp4->i_data_size = p_param->i_width * p_param->i_height * 3 / 2;
    p_mp4->p_sample->data = malloc( p_mp4->i_data_size );
    if( !p_mp4->p_sample->data )
    {
        p_mp4->i_data_size = 0;
        return -1;
    }

    return 0;
}

static int check_buffer( mp4_hnd_t *p_mp4, int needed_size )
{
    if( needed_size > p_mp4->i_data_size )
    {
        void *ptr = realloc( p_mp4->p_sample->data, needed_size );
        if( !ptr )
            return -1;
        p_mp4->p_sample->data = ptr;
        p_mp4->i_data_size = needed_size;
    }
    return 0;
}

static int write_headers( hnd_t handle, x264_nal_t *p_nal )
{
    mp4_hnd_t *p_mp4 = handle;
    GF_AVCConfigSlot *p_slot;

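    /* The NALs arrive in Annex B form; the 4-byte start code (00 00 00 01) on
     * the SPS and PPS is skipped because avcC stores parameter sets as raw
     * length-prefixed payloads. */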
    int sps_size = p_nal[0].i_payload - 4;
    int pps_size = p_nal[1].i_payload - 4;
    int sei_size = p_nal[2].i_payload;

    uint8_t *sps = p_nal[0].p_payload + 4;
    uint8_t *pps = p_nal[1].p_payload + 4;
    uint8_t *sei = p_nal[2].p_payload;

    // SPS

    p_mp4->p_config->configurationVersion = 1;
    p_mp4->p_config->AVCProfileIndication = sps[1];
    p_mp4->p_config->profile_compatibility = sps[2];
    p_mp4->p_config->AVCLevelIndication = sps[3];
    p_slot = malloc( sizeof(GF_AVCConfigSlot) );
    if( !p_slot )
        return -1;
    p_slot->size = sps_size;
    p_slot->data = malloc( p_slot->size );
    if( !p_slot->data )
        return -1;
    memcpy( p_slot->data, sps, sps_size );
    gf_list_add( p_mp4->p_config->sequenceParameterSets, p_slot );

    // PPS

    p_slot = malloc( sizeof(GF_AVCConfigSlot) );
    if( !p_slot )
        return -1;
    p_slot->size = pps_size;
    p_slot->data = malloc( p_slot->size );
    if( !p_slot->data )
        return -1;
    memcpy( p_slot->data, pps, pps_size );
    gf_list_add( p_mp4->p_config->pictureParameterSets, p_slot );
    gf_isom_avc_config_update( p_mp4->p_file, p_mp4->i_track, 1, p_mp4->p_config );

    // SEI

    if( check_buffer( p_mp4, p_mp4->p_sample->dataLength + sei_size ) )
        return -1;
    memcpy( p_mp4->p_sample->data + p_mp4->p_sample->dataLength, sei, sei_size );
    p_mp4->p_sample->dataLength += sei_size;

    return sei_size + sps_size + pps_size;
}

static int write_frame( hnd_t handle, uint8_t *p_nalu, int i_size, x264_picture_t *p_picture )
{
    mp4_hnd_t *p_mp4 = handle;
    int64_t dts;
    int64_t cts;

    if( check_buffer( p_mp4, p_mp4->p_sample->dataLength + i_size ) )
        return -1;
    memcpy( p_mp4->p_sample->data + p_mp4->p_sample->dataLength, p_nalu, i_size );
    p_mp4->p_sample->dataLength += i_size;

    if( !p_mp4->i_numframe )
        p_mp4->i_delay_time = p_picture->i_dts * -1;

    if( p_mp4->b_dts_compress )
    {
        if( p_mp4->i_numframe == 1 )
            p_mp4->i_init_delta = (p_picture->i_dts + p_mp4->i_delay_time) * p_mp4->i_time_inc;
        dts = p_mp4->i_numframe > p_mp4->i_delay_frames
            ? p_picture->i_dts * p_mp4->i_time_inc
            : p_mp4->i_numframe * (p_mp4->i_init_delta / p_mp4->i_dts_compress_multiplier);
        cts = p_picture->i_pts * p_mp4->i_time_inc;
    }
    else
    {
        dts = (p_picture->i_dts + p_mp4->i_delay_time) * p_mp4->i_time_inc;
        cts = (p_picture->i_pts + p_mp4->i_delay_time) * p_mp4->i_time_inc;
    }

    p_mp4->p_sample->IsRAP = p_picture->b_keyframe;
    p_mp4->p_sample->DTS = dts;
    p_mp4->p_sample->CTS_Offset = (uint32_t)(cts - dts);
    gf_isom_add_sample( p_mp4->p_file, p_mp4->i_track, p_mp4->i_descidx, p_mp4->p_sample );

    p_mp4->p_sample->dataLength = 0;
    p_mp4->i_numframe++;

    return i_size;
}

const cli_output_t mp4_output = { open_file, set_param, write_headers, write_frame, close_file };
x264-snapshot-20120103-2245-stable/output/matroska_ebml.h
/*****************************************************************************
 * matroska_ebml.h: matroska muxer utilities
 *****************************************************************************
 * Copyright (C) 2005-2011 x264 project
 *
 * Authors: Mike Matsnev <mike@haali.su>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#ifndef X264_MATROSKA_EBML_H
#define X264_MATROSKA_EBML_H

/* Matroska display size units from the spec */
#define DS_PIXELS        0
#define DS_CM            1
#define DS_INCHES        2
#define DS_ASPECT_RATIO  3

typedef struct mk_writer mk_writer;

mk_writer *mk_create_writer( const char *filename );

int mk_write_header( mk_writer *w, const char *writing_app,
                     const char *codec_id,
                     const void *codec_private, unsigned codec_private_size,
                     int64_t default_frame_duration,
                     int64_t timescale,
                     unsigned width, unsigned height,
                     unsigned d_width, unsigned d_height, int display_size_units );

int mk_start_frame( mk_writer *w );
int mk_add_frame_data( mk_writer *w, const void *data, unsigned size );
int mk_set_frame_flags( mk_writer *w, int64_t timestamp, int keyframe, int skippable );
int mk_close( mk_writer *w, int64_t last_delta );

#endif
x264-snapshot-20120103-2245-stable/output/matroska_ebml.c
/*****************************************************************************
 * matroska_ebml.c: matroska muxer utilities
 *****************************************************************************
 * Copyright (C) 2005-2011 x264 project
 *
 * Authors: Mike Matsnev <mike@haali.su>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "output.h"
#include "matroska_ebml.h"

#define CLSIZE 1048576
#define CHECK(x)\
do {\
    if( (x) < 0 )\
        return -1;\
} while( 0 )

struct mk_context
{
    struct mk_context *next, **prev, *parent;
    mk_writer *owner;
    unsigned id;

    void *data;
    unsigned d_cur, d_max;
};

typedef struct mk_context mk_context;

struct mk_writer
{
    FILE *fp;

    unsigned duration_ptr;

    mk_context *root, *cluster, *frame;
    mk_context *freelist;
    mk_context *actlist;

    int64_t def_duration;
    int64_t timescale;
    int64_t cluster_tc_scaled;
    int64_t frame_tc, max_frame_tc;

    char wrote_header, in_frame, keyframe, skippable;
};

static mk_context *mk_create_context( mk_writer *w, mk_context *parent, unsigned id )
{
    mk_context *c;

    if( w->freelist )
    {
        c = w->freelist;
        w->freelist = w->freelist->next;
    }
    else
    {
        c = malloc( sizeof(*c) );
        if( !c )
            return NULL;
        memset( c, 0, sizeof(*c) );
    }

    c->parent = parent;
    c->owner = w;
    c->id = id;

    if( c->owner->actlist )
        c->owner->actlist->prev = &c->next;
    c->next = c->owner->actlist;
    c->prev = &c->owner->actlist;
    c->owner->actlist = c;

    return c;
}

static int mk_append_context_data( mk_context *c, const void *data, unsigned size )
{
    unsigned ns = c->d_cur + size;

    if( ns > c->d_max )
    {
        void *dp;
        unsigned dn = c->d_max ? c->d_max << 1 : 16;
        while( ns > dn )
            dn <<= 1;

        dp = realloc( c->data, dn );
        if( !dp )
            return -1;

        c->data = dp;
        c->d_max = dn;
    }

    memcpy( (char*)c->data + c->d_cur, data, size );

    c->d_cur = ns;

    return 0;
}

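/* EBML ID constants already include their length-marker bits, so an ID is
 * written by simply trimming leading zero bytes.  Sizes use a vint coding
 * where the position of the first set bit encodes the byte count: e.g.
 * size 5 is written as 0x85 (one byte) and size 0x100 as 0x41 0x00 (two). */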
static int mk_write_id( mk_context *c, unsigned id )
{
    unsigned char c_id[4] = { id >> 24, id >> 16, id >> 8, id };

    if( c_id[0] )
        return mk_append_context_data( c, c_id, 4 );
    if( c_id[1] )
        return mk_append_context_data( c, c_id+1, 3 );
    if( c_id[2] )
        return mk_append_context_data( c, c_id+2, 2 );
    return mk_append_context_data( c, c_id+3, 1 );
}

static int mk_write_size( mk_context *c, unsigned size )
{
    unsigned char c_size[5] = { 0x08, size >> 24, size >> 16, size >> 8, size };

    if( size < 0x7f )
    {
        c_size[4] |= 0x80;
        return mk_append_context_data( c, c_size+4, 1 );
    }
    if( size < 0x3fff )
    {
        c_size[3] |= 0x40;
        return mk_append_context_data( c, c_size+3, 2 );
    }
    if( size < 0x1fffff )
    {
        c_size[2] |= 0x20;
        return mk_append_context_data( c, c_size+2, 3 );
    }
    if( size < 0x0fffffff )
    {
        c_size[1] |= 0x10;
        return mk_append_context_data( c, c_size+1, 4 );
    }
    return mk_append_context_data( c, c_size, 5 );
}

static int mk_flush_context_id( mk_context *c )
{
    unsigned char ff = 0xff;

    if( !c->id )
        return 0;

    CHECK( mk_write_id( c->parent, c->id ) );
    CHECK( mk_append_context_data( c->parent, &ff, 1 ) );

    c->id = 0;

    return 0;
}

static int mk_flush_context_data( mk_context *c )
{
    if( !c->d_cur )
        return 0;

    if( c->parent )
        CHECK( mk_append_context_data( c->parent, c->data, c->d_cur ) );
    else if( fwrite( c->data, c->d_cur, 1, c->owner->fp ) != 1 )
        return -1;

    c->d_cur = 0;

    return 0;
}

static int mk_close_context( mk_context *c, unsigned *off )
{
    if( c->id )
    {
        CHECK( mk_write_id( c->parent, c->id ) );
        CHECK( mk_write_size( c->parent, c->d_cur ) );
    }

    if( c->parent && off )
        *off += c->parent->d_cur;

    CHECK( mk_flush_context_data( c ) );

    if( c->next )
        c->next->prev = c->prev;
    *(c->prev) = c->next;
    c->next = c->owner->freelist;
    c->owner->freelist = c;

    return 0;
}

static void mk_destroy_contexts( mk_writer *w )
{
    mk_context *next;

    for( mk_context *cur = w->freelist; cur; cur = next )
    {
        next = cur->next;
        free( cur->data );
        free( cur );
    }

    for( mk_context *cur = w->actlist; cur; cur = next )
    {
        next = cur->next;
        free( cur->data );
        free( cur );
    }

    w->freelist = w->actlist = w->root = NULL;
}

static int mk_write_string( mk_context *c, unsigned id, const char *str )
{
    size_t len = strlen( str );

    CHECK( mk_write_id( c, id ) );
    CHECK( mk_write_size( c, len ) );
    CHECK( mk_append_context_data( c, str, len ) );
    return 0;
}

static int mk_write_bin( mk_context *c, unsigned id, const void *data, unsigned size )
{
    CHECK( mk_write_id( c, id ) );
    CHECK( mk_write_size( c, size ) );
    CHECK( mk_append_context_data( c, data, size ) ) ;
    return 0;
}

static int mk_write_uint( mk_context *c, unsigned id, int64_t ui )
{
    unsigned char c_ui[8] = { ui >> 56, ui >> 48, ui >> 40, ui >> 32, ui >> 24, ui >> 16, ui >> 8, ui };
    unsigned i = 0;

    CHECK( mk_write_id( c, id ) );
    while( i < 7 && !c_ui[i] )
        ++i;
    CHECK( mk_write_size( c, 8 - i ) );
    CHECK( mk_append_context_data( c, c_ui+i, 8 - i ) );
    return 0;
}

static int mk_write_float_raw( mk_context *c, float f )
{
    union
    {
        float f;
        unsigned u;
    } u;
    unsigned char c_f[4];

    u.f = f;
    c_f[0] = u.u >> 24;
    c_f[1] = u.u >> 16;
    c_f[2] = u.u >> 8;
    c_f[3] = u.u;

    return mk_append_context_data( c, c_f, 4 );
}

static int mk_write_float( mk_context *c, unsigned id, float f )
{
    CHECK( mk_write_id( c, id ) );
    CHECK( mk_write_size( c, 4 ) );
    CHECK( mk_write_float_raw( c, f ) );
    return 0;
}

mk_writer *mk_create_writer( const char *filename )
{
    mk_writer *w = malloc( sizeof(*w) );
    if( !w )
        return NULL;

    memset( w, 0, sizeof(*w) );

    w->root = mk_create_context( w, NULL, 0 );
    if( !w->root )
    {
        free( w );
        return NULL;
    }

    if( !strcmp( filename, "-" ) )
        w->fp = stdout;
    else
        w->fp = fopen( filename, "wb" );
    if( !w->fp )
    {
        mk_destroy_contexts( w );
        free( w );
        return NULL;
    }

    w->timescale = 1000000;

    return w;
}

int mk_write_header( mk_writer *w, const char *writing_app,
                     const char *codec_id,
                     const void *codec_private, unsigned codec_private_size,
                     int64_t default_frame_duration,
                     int64_t timescale,
                     unsigned width, unsigned height,
                     unsigned d_width, unsigned d_height, int display_size_units )
{
    mk_context  *c, *ti, *v;

    if( w->wrote_header )
        return -1;

    w->timescale = timescale;
    w->def_duration = default_frame_duration;

    if( !(c = mk_create_context( w, w->root, 0x1a45dfa3 )) ) // EBML
        return -1;
    CHECK( mk_write_uint( c, 0x4286, 1 ) ); // EBMLVersion
    CHECK( mk_write_uint( c, 0x42f7, 1 ) ); // EBMLReadVersion
    CHECK( mk_write_uint( c, 0x42f2, 4 ) ); // EBMLMaxIDLength
    CHECK( mk_write_uint( c, 0x42f3, 8 ) ); // EBMLMaxSizeLength
    CHECK( mk_write_string( c, 0x4282, "matroska") ); // DocType
    CHECK( mk_write_uint( c, 0x4287, 2 ) ); // DocTypeVersion
    CHECK( mk_write_uint( c, 0x4285, 2 ) ); // DocTypeReadversion
    CHECK( mk_close_context( c, 0 ) );

    if( !(c = mk_create_context( w, w->root, 0x18538067 )) ) // Segment
        return -1;
    CHECK( mk_flush_context_id( c ) );
    CHECK( mk_close_context( c, 0 ) );

    if( !(c = mk_create_context( w, w->root, 0x1549a966 )) ) // SegmentInfo
        return -1;
    CHECK( mk_write_string( c, 0x4d80, "Haali Matroska Writer b0" ) );
    CHECK( mk_write_string( c, 0x5741, writing_app ) );
    CHECK( mk_write_uint( c, 0x2ad7b1, w->timescale ) );
    CHECK( mk_write_float( c, 0x4489, 0) );
    w->duration_ptr = c->d_cur - 4;
    CHECK( mk_close_context( c, &w->duration_ptr ) );

    if( !(c = mk_create_context( w, w->root, 0x1654ae6b )) ) // tracks
        return -1;
    if( !(ti = mk_create_context( w, c, 0xae )) ) // TrackEntry
        return -1;
    CHECK( mk_write_uint( ti, 0xd7, 1 ) ); // TrackNumber
    CHECK( mk_write_uint( ti, 0x73c5, 1 ) ); // TrackUID
    CHECK( mk_write_uint( ti, 0x83, 1 ) ); // TrackType
    CHECK( mk_write_uint( ti, 0x9c, 0 ) ); // FlagLacing
    CHECK( mk_write_string( ti, 0x86, codec_id ) ); // codec_id
    if( codec_private_size )
        CHECK( mk_write_bin( ti, 0x63a2, codec_private, codec_private_size ) ); // codec_private
    if( default_frame_duration )
        CHECK( mk_write_uint( ti, 0x23e383, default_frame_duration ) ); // DefaultDuration

    if( !(v = mk_create_context( w, ti, 0xe0 ) ) ) // Video
        return -1;
    CHECK( mk_write_uint( v, 0xb0, width ) );
    CHECK( mk_write_uint( v, 0xba, height ) );
    CHECK( mk_write_uint( v, 0x54b2, display_size_units ) );
    CHECK( mk_write_uint( v, 0x54b0, d_width ) );
    CHECK( mk_write_uint( v, 0x54ba, d_height ) );
    CHECK( mk_close_context( v, 0 ) );

    CHECK( mk_close_context( ti, 0 ) );

    CHECK( mk_close_context( c, 0 ) );

    CHECK( mk_flush_context_data( w->root ) );

    w->wrote_header = 1;

    return 0;
}

static int mk_close_cluster( mk_writer *w )
{
    if( w->cluster == NULL )
        return 0;
    CHECK( mk_close_context( w->cluster, 0 ) );
    w->cluster = NULL;
    CHECK( mk_flush_context_data( w->root ) );
    return 0;
}

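/* Emits the pending frame as a SimpleBlock: a vint track number (always 1
 * here), a signed 16-bit timecode relative to the enclosing cluster, and a
 * flags byte (bit 7 = keyframe, bit 0 = discardable), followed by the frame
 * data.  A new cluster is opened whenever the relative timecode would
 * overflow 16 bits, and closed once it grows past CLSIZE bytes. */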
static int mk_flush_frame( mk_writer *w )
{
    int64_t delta;
    unsigned fsize;
    unsigned char c_delta_flags[3];

    if( !w->in_frame )
        return 0;

    delta = w->frame_tc/w->timescale - w->cluster_tc_scaled;
    if( delta > 32767ll || delta < -32768ll )
        CHECK( mk_close_cluster( w ) );

    if( !w->cluster )
    {
        w->cluster_tc_scaled = w->frame_tc / w->timescale;
        w->cluster = mk_create_context( w, w->root, 0x1f43b675 ); // Cluster
        if( !w->cluster )
            return -1;

        CHECK( mk_write_uint( w->cluster, 0xe7, w->cluster_tc_scaled ) ); // Timecode

        delta = 0;
    }

    fsize = w->frame ? w->frame->d_cur : 0;

    CHECK( mk_write_id( w->cluster, 0xa3 ) ); // SimpleBlock
    CHECK( mk_write_size( w->cluster, fsize + 4 ) );
    CHECK( mk_write_size( w->cluster, 1 ) ); // track number

    c_delta_flags[0] = delta >> 8;
    c_delta_flags[1] = delta;
    c_delta_flags[2] = (w->keyframe << 7) | w->skippable;
    CHECK( mk_append_context_data( w->cluster, c_delta_flags, 3 ) );
    if( w->frame )
    {
        CHECK( mk_append_context_data( w->cluster, w->frame->data, w->frame->d_cur ) );
        w->frame->d_cur = 0;
    }

    w->in_frame = 0;

    if( w->cluster->d_cur > CLSIZE )
        CHECK( mk_close_cluster( w ) );

    return 0;
}

int mk_start_frame( mk_writer *w )
{
    if( mk_flush_frame( w ) < 0 )
        return -1;

    w->in_frame  = 1;
    w->keyframe  = 0;
    w->skippable = 0;

    return 0;
}

int mk_set_frame_flags( mk_writer *w, int64_t timestamp, int keyframe, int skippable )
{
    if( !w->in_frame )
        return -1;

    w->frame_tc  = timestamp;
    w->keyframe  = keyframe  != 0;
    w->skippable = skippable != 0;

    if( w->max_frame_tc < timestamp )
        w->max_frame_tc = timestamp;

    return 0;
}

int mk_add_frame_data( mk_writer *w, const void *data, unsigned size )
{
    if( !w->in_frame )
        return -1;

    if( !w->frame )
        if( !(w->frame = mk_create_context( w, NULL, 0 )) )
            return -1;

    return mk_append_context_data( w->frame, data, size );
}

int mk_close( mk_writer *w, int64_t last_delta )
{
    int ret = 0;
    if( mk_flush_frame( w ) < 0 || mk_close_cluster( w ) < 0 )
        ret = -1;
    if( w->wrote_header && x264_is_regular_file( w->fp ) )
    {
        fseek( w->fp, w->duration_ptr, SEEK_SET );
        int64_t last_frametime = w->def_duration ? w->def_duration : last_delta;
        int64_t total_duration = w->max_frame_tc+last_frametime;
        if( mk_write_float_raw( w->root, (float)((double)total_duration / w->timescale) ) < 0 ||
            mk_flush_context_data( w->root ) < 0 )
            ret = -1;
    }
    mk_destroy_contexts( w );
    fclose( w->fp );
    free( w );
    return ret;
}
x264-snapshot-20120103-2245-stable/output/matroska.c
/*****************************************************************************
 * matroska.c: matroska muxer
 *****************************************************************************
 * Copyright (C) 2005-2011 x264 project
 *
 * Authors: Mike Matsnev <mike@haali.su>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "output.h"
#include "matroska_ebml.h"

typedef struct
{
    mk_writer *w;

    int width, height, d_width, d_height;

    int display_size_units;

    int64_t frame_duration;

    char b_writing_frame;
    uint32_t i_timebase_num;
    uint32_t i_timebase_den;

} mkv_hnd_t;

static int open_file( char *psz_filename, hnd_t *p_handle, cli_output_opt_t *opt )
{
    mkv_hnd_t *p_mkv;

    *p_handle = NULL;

    p_mkv  = malloc( sizeof(*p_mkv) );
    if( !p_mkv )
        return -1;

    memset( p_mkv, 0, sizeof(*p_mkv) );

    p_mkv->w = mk_create_writer( psz_filename );
    if( !p_mkv->w )
    {
        free( p_mkv );
        return -1;
    }

    *p_handle = p_mkv;

    return 0;
}

static int set_param( hnd_t handle, x264_param_t *p_param )
{
    mkv_hnd_t   *p_mkv = handle;
    int64_t dw, dh;

    if( p_param->i_fps_num > 0 && !p_param->b_vfr_input )
    {
        p_mkv->frame_duration = (int64_t)p_param->i_fps_den *
                                (int64_t)1000000000 / p_param->i_fps_num;
    }
    else
    {
        p_mkv->frame_duration = 0;
    }

    p_mkv->width = p_mkv->d_width = p_param->i_width;
    p_mkv->height = p_mkv->d_height = p_param->i_height;
    p_mkv->display_size_units = DS_PIXELS;

    if( p_param->vui.i_sar_width && p_param->vui.i_sar_height
        && p_param->vui.i_sar_width != p_param->vui.i_sar_height )
    {
        if( p_param->vui.i_sar_width > p_param->vui.i_sar_height )
        {
            dw = (int64_t)p_param->i_width * p_param->vui.i_sar_width / p_param->vui.i_sar_height;
            dh = p_param->i_height;
        }
        else
        {
            dw = p_param->i_width;
            dh = (int64_t)p_param->i_height * p_param->vui.i_sar_height / p_param->vui.i_sar_width;
        }

        p_mkv->d_width = (int)dw;
        p_mkv->d_height = (int)dh;
    }

    p_mkv->i_timebase_num = p_param->i_timebase_num;
    p_mkv->i_timebase_den = p_param->i_timebase_den;

    return 0;
}

static int write_headers( hnd_t handle, x264_nal_t *p_nal )
{
    mkv_hnd_t *p_mkv = handle;

    int sps_size = p_nal[0].i_payload - 4;
    int pps_size = p_nal[1].i_payload - 4;
    int sei_size = p_nal[2].i_payload;

    uint8_t *sps = p_nal[0].p_payload + 4;
    uint8_t *pps = p_nal[1].p_payload + 4;
    uint8_t *sei = p_nal[2].p_payload;

    int ret;
    uint8_t *avcC;
    int avcC_len;

    if( !p_mkv->width || !p_mkv->height ||
        !p_mkv->d_width || !p_mkv->d_height )
        return -1;

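    /* avcC layout: 5 fixed bytes (version, profile, compatibility, level,
     * NAL length size), a 1-byte SPS count, 2-byte SPS length plus the SPS,
     * then a 1-byte PPS count, 2-byte PPS length plus the PPS. */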
    avcC_len = 5 + 1 + 2 + sps_size + 1 + 2 + pps_size;
    avcC = malloc( avcC_len );
    if( !avcC )
        return -1;

    avcC[0] = 1;
    avcC[1] = sps[1];
    avcC[2] = sps[2];
    avcC[3] = sps[3];
    avcC[4] = 0xff; // nalu size length is four bytes
    avcC[5] = 0xe1; // one sps

    avcC[6] = sps_size >> 8;
    avcC[7] = sps_size;

    memcpy( avcC+8, sps, sps_size );

    avcC[8+sps_size] = 1; // one pps
    avcC[9+sps_size] = pps_size >> 8;
    avcC[10+sps_size] = pps_size;

    memcpy( avcC+11+sps_size, pps, pps_size );

    ret = mk_write_header( p_mkv->w, "x264" X264_VERSION, "V_MPEG4/ISO/AVC",
                           avcC, avcC_len, p_mkv->frame_duration, 50000,
                           p_mkv->width, p_mkv->height,
                           p_mkv->d_width, p_mkv->d_height, p_mkv->display_size_units );
    free( avcC ); /* free the config record before any early return to avoid a leak */
    if( ret < 0 )
        return ret;

    // SEI

    if( !p_mkv->b_writing_frame )
    {
        if( mk_start_frame( p_mkv->w ) < 0 )
            return -1;
        p_mkv->b_writing_frame = 1;
    }
    if( mk_add_frame_data( p_mkv->w, sei, sei_size ) < 0 )
        return -1;

    return sei_size + sps_size + pps_size;
}

static int write_frame( hnd_t handle, uint8_t *p_nalu, int i_size, x264_picture_t *p_picture )
{
    mkv_hnd_t *p_mkv = handle;

    if( !p_mkv->b_writing_frame )
    {
        if( mk_start_frame( p_mkv->w ) < 0 )
            return -1;
        p_mkv->b_writing_frame = 1;
    }

    if( mk_add_frame_data( p_mkv->w, p_nalu, i_size ) < 0 )
        return -1;

    int64_t i_stamp = (int64_t)((p_picture->i_pts * 1e9 * p_mkv->i_timebase_num / p_mkv->i_timebase_den) + 0.5);

    p_mkv->b_writing_frame = 0;

    if( mk_set_frame_flags( p_mkv->w, i_stamp, p_picture->b_keyframe, p_picture->i_type == X264_TYPE_B ) < 0 )
        return -1;

    return i_size;
}

static int close_file( hnd_t handle, int64_t largest_pts, int64_t second_largest_pts )
{
    mkv_hnd_t *p_mkv = handle;
    int ret;
    int64_t i_last_delta;

    i_last_delta = p_mkv->i_timebase_den ? (int64_t)(((largest_pts - second_largest_pts) * p_mkv->i_timebase_num / p_mkv->i_timebase_den) + 0.5) : 0;

    ret = mk_close( p_mkv->w, i_last_delta );

    free( p_mkv );

    return ret;
}

const cli_output_t mkv_output = { open_file, set_param, write_headers, write_frame, close_file };
x264-snapshot-20120103-2245-stable/output/flv_bytestream.h
/*****************************************************************************
 * flv_bytestream.h: flv muxer utilities
 *****************************************************************************
 * Copyright (C) 2009-2011 x264 project
 *
 * Authors: Kieran Kunhya <kieran@kunhya.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#ifndef X264_FLV_BYTESTREAM_H
#define X264_FLV_BYTESTREAM_H

/* offsets for packed values */
#define FLV_AUDIO_SAMPLESSIZE_OFFSET 1
#define FLV_AUDIO_SAMPLERATE_OFFSET  2
#define FLV_AUDIO_CODECID_OFFSET     4

#define FLV_VIDEO_FRAMETYPE_OFFSET   4

/* bitmasks to isolate specific values */
#define FLV_AUDIO_CHANNEL_MASK    0x01
#define FLV_AUDIO_SAMPLESIZE_MASK 0x02
#define FLV_AUDIO_SAMPLERATE_MASK 0x0c
#define FLV_AUDIO_CODECID_MASK    0xf0

#define FLV_VIDEO_CODECID_MASK    0x0f
#define FLV_VIDEO_FRAMETYPE_MASK  0xf0

#define AMF_END_OF_OBJECT         0x09

enum
{
    FLV_HEADER_FLAG_HASVIDEO = 1,
    FLV_HEADER_FLAG_HASAUDIO = 4,
};

enum
{
    FLV_TAG_TYPE_AUDIO = 0x08,
    FLV_TAG_TYPE_VIDEO = 0x09,
    FLV_TAG_TYPE_META  = 0x12,
};

enum
{
    FLV_MONO   = 0,
    FLV_STEREO = 1,
};

enum
{
    FLV_SAMPLESSIZE_8BIT  = 0,
    FLV_SAMPLESSIZE_16BIT = 1 << FLV_AUDIO_SAMPLESSIZE_OFFSET,
};

enum
{
    FLV_SAMPLERATE_SPECIAL = 0, /**< signifies 5512Hz and 8000Hz in the case of NELLYMOSER */
    FLV_SAMPLERATE_11025HZ = 1 << FLV_AUDIO_SAMPLERATE_OFFSET,
    FLV_SAMPLERATE_22050HZ = 2 << FLV_AUDIO_SAMPLERATE_OFFSET,
    FLV_SAMPLERATE_44100HZ = 3 << FLV_AUDIO_SAMPLERATE_OFFSET,
};

enum
{
    FLV_CODECID_MP3 = 2 << FLV_AUDIO_CODECID_OFFSET,
    FLV_CODECID_AAC = 10<< FLV_AUDIO_CODECID_OFFSET,
};

enum
{
    FLV_CODECID_H264 = 7,
};

enum
{
    FLV_FRAME_KEY   = 1 << FLV_VIDEO_FRAMETYPE_OFFSET | 7,
    FLV_FRAME_INTER = 2 << FLV_VIDEO_FRAMETYPE_OFFSET | 7,
};

typedef enum
{
    AMF_DATA_TYPE_NUMBER      = 0x00,
    AMF_DATA_TYPE_BOOL        = 0x01,
    AMF_DATA_TYPE_STRING      = 0x02,
    AMF_DATA_TYPE_OBJECT      = 0x03,
    AMF_DATA_TYPE_NULL        = 0x05,
    AMF_DATA_TYPE_UNDEFINED   = 0x06,
    AMF_DATA_TYPE_REFERENCE   = 0x07,
    AMF_DATA_TYPE_MIXEDARRAY  = 0x08,
    AMF_DATA_TYPE_OBJECT_END  = 0x09,
    AMF_DATA_TYPE_ARRAY       = 0x0a,
    AMF_DATA_TYPE_DATE        = 0x0b,
    AMF_DATA_TYPE_LONG_STRING = 0x0c,
    AMF_DATA_TYPE_UNSUPPORTED = 0x0d,
} AMFDataType;

typedef struct flv_buffer
{
    uint8_t *data;
    unsigned d_cur;
    unsigned d_max;
    FILE *fp;
    uint64_t d_total;
} flv_buffer;

flv_buffer *flv_create_writer( const char *filename );
int flv_append_data( flv_buffer *c, uint8_t *data, unsigned size );
int flv_write_byte( flv_buffer *c, uint8_t *byte );
int flv_flush_data( flv_buffer *c );
void flv_rewrite_amf_be24( flv_buffer *c, unsigned length, unsigned start );

uint64_t flv_dbl2int( double value );
void flv_put_byte( flv_buffer *c, uint8_t b );
void flv_put_be32( flv_buffer *c, uint32_t val );
void flv_put_be64( flv_buffer *c, uint64_t val );
void flv_put_be16( flv_buffer *c, uint16_t val );
void flv_put_be24( flv_buffer *c, uint32_t val );
void flv_put_tag( flv_buffer *c, const char *tag );
void flv_put_amf_string( flv_buffer *c, const char *str );
void flv_put_amf_double( flv_buffer *c, double d );

#endif
x264-snapshot-20120103-2245-stable/output/flv_bytestream.c
/*****************************************************************************
 * flv_bytestream.c: flv muxer utilities
 *****************************************************************************
 * Copyright (C) 2009-2011 x264 project
 *
 * Authors: Kieran Kunhya <kieran@kunhya.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "output.h"
#include "flv_bytestream.h"

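/* AMF stores numbers as big-endian IEEE-754 doubles; this type-puns the
 * double into its raw 64-bit pattern so callers can serialize it with
 * flv_put_be64 or endian_fix64. */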
uint64_t flv_dbl2int( double value )
{
    return (union {double f; uint64_t i;}){value}.i;
}

/* Put functions  */

void flv_put_byte( flv_buffer *c, uint8_t b )
{
    flv_append_data( c, &b, 1 );
}

void flv_put_be32( flv_buffer *c, uint32_t val )
{
    flv_put_byte( c, val >> 24 );
    flv_put_byte( c, val >> 16 );
    flv_put_byte( c, val >> 8 );
    flv_put_byte( c, val );
}

void flv_put_be64( flv_buffer *c, uint64_t val )
{
    flv_put_be32( c, val >> 32 );
    flv_put_be32( c, val );
}

void flv_put_be16( flv_buffer *c, uint16_t val )
{
    flv_put_byte( c, val >> 8 );
    flv_put_byte( c, val );
}

void flv_put_be24( flv_buffer *c, uint32_t val )
{
    flv_put_be16( c, val >> 8 );
    flv_put_byte( c, val );
}

void flv_put_tag( flv_buffer *c, const char *tag )
{
    while( *tag )
        flv_put_byte( c, *tag++ );
}

void flv_put_amf_string( flv_buffer *c, const char *str )
{
    uint16_t len = strlen( str );
    flv_put_be16( c, len );
    flv_append_data( c, (uint8_t*)str, len );
}

void flv_put_amf_double( flv_buffer *c, double d )
{
    flv_put_byte( c, AMF_DATA_TYPE_NUMBER );
    flv_put_be64( c, flv_dbl2int( d ) );
}

/* flv writing functions */

flv_buffer *flv_create_writer( const char *filename )
{
    flv_buffer *c = malloc( sizeof(*c) );

    if( !c )
        return NULL;
    memset( c, 0, sizeof(*c) );

    if( !strcmp( filename, "-" ) )
        c->fp = stdout;
    else
        c->fp = fopen( filename, "wb" );
    if( !c->fp )
    {
        free( c );
        return NULL;
    }

    return c;
}

int flv_append_data( flv_buffer *c, uint8_t *data, unsigned size )
{
    unsigned ns = c->d_cur + size;

    if( ns > c->d_max )
    {
        void *dp;
        unsigned dn = 16;
        while( ns > dn )
            dn <<= 1;

        dp = realloc( c->data, dn );
        if( !dp )
            return -1;

        c->data = dp;
        c->d_max = dn;
    }

    memcpy( c->data + c->d_cur, data, size );

    c->d_cur = ns;

    return 0;
}

void flv_rewrite_amf_be24( flv_buffer *c, unsigned length, unsigned start )
{
     *(c->data + start + 0) = length >> 16;
     *(c->data + start + 1) = length >> 8;
     *(c->data + start + 2) = length >> 0;
}

int flv_flush_data( flv_buffer *c )
{
    if( !c->d_cur )
        return 0;

    if( fwrite( c->data, c->d_cur, 1, c->fp ) != 1 )
        return -1;

    c->d_total += c->d_cur;

    c->d_cur = 0;

    return 0;
}
x264-snapshot-20120103-2245-stable/output/flv.c
/*****************************************************************************
 * flv.c: flv muxer
 *****************************************************************************
 * Copyright (C) 2009-2011 x264 project
 *
 * Authors: Kieran Kunhya <kieran@kunhya.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "output.h"
#include "flv_bytestream.h"

#define CHECK(x)\
do {\
    if( (x) < 0 )\
        return -1;\
} while( 0 )

typedef struct
{
    flv_buffer *c;

    uint8_t *sei;
    int sei_len;

    int64_t i_fps_num;
    int64_t i_fps_den;
    int64_t i_framenum;

    uint64_t i_framerate_pos;
    uint64_t i_duration_pos;
    uint64_t i_filesize_pos;
    uint64_t i_bitrate_pos;

    uint8_t b_write_length;
    int64_t i_prev_dts;
    int64_t i_prev_cts;
    int64_t i_delay_time;
    int64_t i_init_delta;
    int i_delay_frames;

    double d_timebase;
    int b_vfr_input;
    int b_dts_compress;

    unsigned start;
} flv_hnd_t;

static int write_header( flv_buffer *c )
{
    flv_put_tag( c, "FLV" ); // Signature
    flv_put_byte( c, 1 );    // Version
    flv_put_byte( c, 1 );    // Video Only
    flv_put_be32( c, 9 );    // DataOffset
    flv_put_be32( c, 0 );    // PreviousTagSize0

    return flv_flush_data( c );
}

static int open_file( char *psz_filename, hnd_t *p_handle, cli_output_opt_t *opt )
{
    flv_hnd_t *p_flv = malloc( sizeof(*p_flv) );
    *p_handle = NULL;
    if( !p_flv )
        return -1;
    memset( p_flv, 0, sizeof(*p_flv) );

    p_flv->b_dts_compress = opt->use_dts_compress;

    p_flv->c = flv_create_writer( psz_filename );
    if( !p_flv->c )
        return -1;

    CHECK( write_header( p_flv->c ) );
    *p_handle = p_flv;

    return 0;
}

static int set_param( hnd_t handle, x264_param_t *p_param )
{
    flv_hnd_t *p_flv = handle;
    flv_buffer *c = p_flv->c;

    flv_put_byte( c, FLV_TAG_TYPE_META ); // Tag Type "script data"

    int start = c->d_cur;
    flv_put_be24( c, 0 ); // data length
    flv_put_be24( c, 0 ); // timestamp
    flv_put_be32( c, 0 ); // reserved

    flv_put_byte( c, AMF_DATA_TYPE_STRING );
    flv_put_amf_string( c, "onMetaData" );

    flv_put_byte( c, AMF_DATA_TYPE_MIXEDARRAY );
    flv_put_be32( c, 7 );

    flv_put_amf_string( c, "width" );
    flv_put_amf_double( c, p_param->i_width );

    flv_put_amf_string( c, "height" );
    flv_put_amf_double( c, p_param->i_height );

    flv_put_amf_string( c, "framerate" );

    if( !p_param->b_vfr_input )
        flv_put_amf_double( c, (double)p_param->i_fps_num / p_param->i_fps_den );
    else
    {
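        /* File offset of the 8-byte double payload: bytes already flushed to
         * disk (d_total) plus the current buffer fill (d_cur), plus 1 to skip
         * the AMF type byte.  Overwritten with the real value in close_file. */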
        p_flv->i_framerate_pos = c->d_cur + c->d_total + 1;
        flv_put_amf_double( c, 0 ); // written at end of encoding
    }

    flv_put_amf_string( c, "videocodecid" );
    flv_put_amf_double( c, FLV_CODECID_H264 );

    flv_put_amf_string( c, "duration" );
    p_flv->i_duration_pos = c->d_cur + c->d_total + 1;
    flv_put_amf_double( c, 0 ); // written at end of encoding

    flv_put_amf_string( c, "filesize" );
    p_flv->i_filesize_pos = c->d_cur + c->d_total + 1;
    flv_put_amf_double( c, 0 ); // written at end of encoding

    flv_put_amf_string( c, "videodatarate" );
    p_flv->i_bitrate_pos = c->d_cur + c->d_total + 1;
    flv_put_amf_double( c, 0 ); // written at end of encoding

    flv_put_amf_string( c, "" );
    flv_put_byte( c, AMF_END_OF_OBJECT );

    unsigned length = c->d_cur - start;
    flv_rewrite_amf_be24( c, length - 10, start );

    flv_put_be32( c, length + 1 ); // tag length

    p_flv->i_fps_num = p_param->i_fps_num;
    p_flv->i_fps_den = p_param->i_fps_den;
    p_flv->d_timebase = (double)p_param->i_timebase_num / p_param->i_timebase_den;
    p_flv->b_vfr_input = p_param->b_vfr_input;
    p_flv->i_delay_frames = p_param->i_bframe ? (p_param->i_bframe_pyramid ? 2 : 1) : 0;

    return 0;
}

static int write_headers( hnd_t handle, x264_nal_t *p_nal )
{
    flv_hnd_t *p_flv = handle;
    flv_buffer *c = p_flv->c;

    int sps_size = p_nal[0].i_payload;
    int pps_size = p_nal[1].i_payload;
    int sei_size = p_nal[2].i_payload;

    // SEI
    /* The spec allows writing the SEI here as-is, but for mplayer/ffmpeg
     * playback compatibility it is deferred until just before the first frame. */

    p_flv->sei = malloc( sei_size );
    if( !p_flv->sei )
        return -1;
    p_flv->sei_len = sei_size;

    memcpy( p_flv->sei, p_nal[2].p_payload, sei_size );

    // SPS
    uint8_t *sps = p_nal[0].p_payload + 4;

    flv_put_byte( c, FLV_TAG_TYPE_VIDEO );
    flv_put_be24( c, 0 ); // rewrite later
    flv_put_be24( c, 0 ); // timestamp
    flv_put_byte( c, 0 ); // timestamp extended
    flv_put_be24( c, 0 ); // StreamID - Always 0
    p_flv->start = c->d_cur; // needed for overwriting length

    flv_put_byte( c, 7 | FLV_FRAME_KEY ); // Frametype and CodecID
    flv_put_byte( c, 0 ); // AVC sequence header
    flv_put_be24( c, 0 ); // composition time

    flv_put_byte( c, 1 );      // version
    flv_put_byte( c, sps[1] ); // profile
    flv_put_byte( c, sps[2] ); // profile
    flv_put_byte( c, sps[3] ); // level
    flv_put_byte( c, 0xff );   // 6 bits reserved (111111) + 2 bits nal size length - 1 (11)
    flv_put_byte( c, 0xe1 );   // 3 bits reserved (111) + 5 bits number of sps (00001)

    flv_put_be16( c, sps_size - 4 );
    flv_append_data( c, sps, sps_size - 4 );

    // PPS
    flv_put_byte( c, 1 ); // number of pps
    flv_put_be16( c, pps_size - 4 );
    flv_append_data( c, p_nal[1].p_payload + 4, pps_size - 4 );

    // rewrite data length info
    unsigned length = c->d_cur - p_flv->start;
    flv_rewrite_amf_be24( c, length, p_flv->start - 10 );
    flv_put_be32( c, length + 11 ); // Last tag size
    CHECK( flv_flush_data( c ) );

    return sei_size + sps_size + pps_size;
}

static int write_frame( hnd_t handle, uint8_t *p_nalu, int i_size, x264_picture_t *p_picture )
{
    flv_hnd_t *p_flv = handle;
    flv_buffer *c = p_flv->c;

#define convert_timebase_ms( timestamp, timebase ) (int64_t)((timestamp) * (timebase) * 1000 + 0.5)

    if( !p_flv->i_framenum )
    {
        p_flv->i_delay_time = p_picture->i_dts * -1;
        if( !p_flv->b_dts_compress && p_flv->i_delay_time )
            x264_cli_log( "flv", X264_LOG_INFO, "initial delay %"PRId64" ms\n",
                          convert_timebase_ms( p_picture->i_pts + p_flv->i_delay_time, p_flv->d_timebase ) );
    }

    int64_t dts;
    int64_t cts;
    int64_t offset;

    if( p_flv->b_dts_compress )
    {
        if( p_flv->i_framenum == 1 )
            p_flv->i_init_delta = convert_timebase_ms( p_picture->i_dts + p_flv->i_delay_time, p_flv->d_timebase );
        dts = p_flv->i_framenum > p_flv->i_delay_frames
            ? convert_timebase_ms( p_picture->i_dts, p_flv->d_timebase )
            : p_flv->i_framenum * p_flv->i_init_delta / (p_flv->i_delay_frames + 1);
        cts = convert_timebase_ms( p_picture->i_pts, p_flv->d_timebase );
    }
    else
    {
        dts = convert_timebase_ms( p_picture->i_dts + p_flv->i_delay_time, p_flv->d_timebase );
        cts = convert_timebase_ms( p_picture->i_pts + p_flv->i_delay_time, p_flv->d_timebase );
    }
    offset = cts - dts;

    if( p_flv->i_framenum )
    {
        if( p_flv->i_prev_dts == dts )
            x264_cli_log( "flv", X264_LOG_WARNING, "duplicate DTS %"PRId64" generated by rounding\n"
                          "               decoding framerate cannot exceed 1000fps\n", dts );
        if( p_flv->i_prev_cts == cts )
            x264_cli_log( "flv", X264_LOG_WARNING, "duplicate CTS %"PRId64" generated by rounding\n"
                          "               composition framerate cannot exceed 1000fps\n", cts );
    }
    p_flv->i_prev_dts = dts;
    p_flv->i_prev_cts = cts;

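    /* FLV tag header: 1-byte tag type, 3-byte data size (patched once the
     * body is complete), 3-byte timestamp plus 1 extended byte carrying bits
     * 31-24 of the DTS, and a 3-byte StreamID that is always 0. */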
    // A new frame - write packet header
    flv_put_byte( c, FLV_TAG_TYPE_VIDEO );
    flv_put_be24( c, 0 ); // calculated later
    flv_put_be24( c, dts );
    flv_put_byte( c, dts >> 24 );
    flv_put_be24( c, 0 );

    p_flv->start = c->d_cur;
    flv_put_byte( c, p_picture->b_keyframe ? FLV_FRAME_KEY : FLV_FRAME_INTER );
    flv_put_byte( c, 1 ); // AVC NALU
    flv_put_be24( c, offset );

    if( p_flv->sei )
    {
        flv_append_data( c, p_flv->sei, p_flv->sei_len );
        free( p_flv->sei );
        p_flv->sei = NULL;
    }
    flv_append_data( c, p_nalu, i_size );

    unsigned length = c->d_cur - p_flv->start;
    flv_rewrite_amf_be24( c, length, p_flv->start - 10 );
    flv_put_be32( c, 11 + length ); // Last tag size
    CHECK( flv_flush_data( c ) );

    p_flv->i_framenum++;

    return i_size;
}

static void rewrite_amf_double( FILE *fp, uint64_t position, double value )
{
    uint64_t x = endian_fix64( flv_dbl2int( value ) );
    fseek( fp, position, SEEK_SET );
    fwrite( &x, 8, 1, fp );
}

static int close_file( hnd_t handle, int64_t largest_pts, int64_t second_largest_pts )
{
    flv_hnd_t *p_flv = handle;
    flv_buffer *c = p_flv->c;

    CHECK( flv_flush_data( c ) );

    double total_duration = (2 * largest_pts - second_largest_pts) * p_flv->d_timebase;

    if( x264_is_regular_file( c->fp ) && total_duration > 0 )
    {
        double framerate;
        uint64_t filesize = ftell( c->fp );

        if( p_flv->i_framerate_pos )
        {
            framerate = (double)p_flv->i_framenum / total_duration;
            rewrite_amf_double( c->fp, p_flv->i_framerate_pos, framerate );
        }

        rewrite_amf_double( c->fp, p_flv->i_duration_pos, total_duration );
        rewrite_amf_double( c->fp, p_flv->i_filesize_pos, filesize );
        rewrite_amf_double( c->fp, p_flv->i_bitrate_pos, filesize * 8 / ( total_duration * 1000 ) );
    }

    fclose( c->fp );
    free( p_flv );
    free( c );

    return 0;
}

const cli_output_t flv_output = { open_file, set_param, write_headers, write_frame, close_file };
x264-snapshot-20120103-2245-stable/input/
x264-snapshot-20120103-2245-stable/input/lavf.c
/*****************************************************************************
 * lavf.c: libavformat input
 *****************************************************************************
 * Copyright (C) 2009-2011 x264 project
 *
 * Authors: Mike Gurlitz <mike.gurlitz@gmail.com>
 *          Steven Walters <kemuri9@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "input.h"
#define FAIL_IF_ERROR( cond, ... ) FAIL_IF_ERR( cond, "lavf", __VA_ARGS__ )
#undef DECLARE_ALIGNED
#include <libavformat/avformat.h>
#include <libavutil/pixdesc.h>
#include <libavutil/dict.h>

typedef struct
{
    AVFormatContext *lavf;
    int stream_id;
    int next_frame;
    int vfr_input;
    cli_pic_t *first_pic;
} lavf_hnd_t;

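/* av_free_packet() releases the packet's payload; re-initializing the struct right
 * away keeps it safe to pass back into av_read_frame() on the next read. */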
#define x264_free_packet( pkt )\
{\
    av_free_packet( pkt );\
    av_init_packet( pkt );\
}

static int read_frame_internal( cli_pic_t *p_pic, lavf_hnd_t *h, int i_frame, video_info_t *info )
{
    if( h->first_pic && !info )
    {
        /* see if the frame we are requesting is the frame we have already read and stored.
         * if so, retrieve the pts and image data before freeing it. */
        if( !i_frame )
        {
            XCHG( cli_image_t, p_pic->img, h->first_pic->img );
            p_pic->pts = h->first_pic->pts;
            XCHG( void*, p_pic->opaque, h->first_pic->opaque );
        }
        lavf_input.release_frame( h->first_pic, NULL );
        lavf_input.picture_clean( h->first_pic );
        free( h->first_pic );
        h->first_pic = NULL;
        if( !i_frame )
            return 0;
    }

    AVCodecContext *c = h->lavf->streams[h->stream_id]->codec;
    AVPacket *pkt = p_pic->opaque;
    AVFrame frame;
    avcodec_get_frame_defaults( &frame );

    while( i_frame >= h->next_frame )
    {
        int finished = 0;
        int ret = 0;
        do
        {
            ret = av_read_frame( h->lavf, pkt );

            if( pkt->stream_index == h->stream_id )
            {
                if( ret < 0 )
                    pkt->size = 0;

                c->reordered_opaque = pkt->pts;
                if( avcodec_decode_video2( c, &frame, &finished, pkt ) < 0 )
                    x264_cli_log( "lavf", X264_LOG_WARNING, "video decoding failed on frame %d\n", h->next_frame );
            }
            /* if the packet successfully decoded but the data from it is not desired, free it */
            else if( ret >= 0 )
                x264_free_packet( pkt );
        } while( !finished && ret >= 0 );

        if( !finished )
            return -1;

        h->next_frame++;
    }

    memcpy( p_pic->img.stride, frame.linesize, sizeof(p_pic->img.stride) );
    memcpy( p_pic->img.plane, frame.data, sizeof(p_pic->img.plane) );
    p_pic->img.height  = c->height;
    p_pic->img.csp     = c->pix_fmt | X264_CSP_OTHER;
    p_pic->img.width   = c->width;

    if( info )
    {
        info->interlaced = frame.interlaced_frame;
        info->tff = frame.top_field_first;
    }

    if( h->vfr_input )
    {
        p_pic->pts = p_pic->duration = 0;
        if( c->has_b_frames && frame.reordered_opaque != AV_NOPTS_VALUE )
            p_pic->pts = frame.reordered_opaque;
        else if( pkt->dts != AV_NOPTS_VALUE )
            p_pic->pts = pkt->dts; // for AVI files
        else if( info )
        {
            h->vfr_input = info->vfr = 0;
            return 0;
        }
    }

    return 0;
}

static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, cli_input_opt_t *opt )
{
    lavf_hnd_t *h = calloc( 1, sizeof(lavf_hnd_t) );
    if( !h )
        return -1;
    av_register_all();
    if( !strcmp( psz_filename, "-" ) )
        psz_filename = "pipe:";

    /* if resolution was passed in, place it and colorspace into options. this allows raw video support */
    AVDictionary *options = NULL;
    if( opt->resolution )
    {
        av_dict_set( &options, "video_size", opt->resolution, 0 );
        const char *csp = opt->colorspace ? opt->colorspace : av_get_pix_fmt_name( PIX_FMT_YUV420P );
        av_dict_set( &options, "pixel_format", csp, 0 );
    }

    /* specify the input format. this is helpful when lavf fails to guess */
    AVInputFormat *format = NULL;
    if( opt->format )
        FAIL_IF_ERROR( !(format = av_find_input_format( opt->format )), "unknown file format: %s\n", opt->format );

    FAIL_IF_ERROR( avformat_open_input( &h->lavf, psz_filename, format, &options ), "could not open input file\n" )
    if( options )
        av_dict_free( &options );
    FAIL_IF_ERROR( avformat_find_stream_info( h->lavf, NULL ) < 0, "could not find input stream info\n" )

    int i = 0;
    while( i < h->lavf->nb_streams && h->lavf->streams[i]->codec->codec_type != AVMEDIA_TYPE_VIDEO )
        i++;
    FAIL_IF_ERROR( i == h->lavf->nb_streams, "could not find video stream\n" )
    h->stream_id       = i;
    h->next_frame      = 0;
    AVCodecContext *c  = h->lavf->streams[i]->codec;
    info->fps_num      = h->lavf->streams[i]->r_frame_rate.num;
    info->fps_den      = h->lavf->streams[i]->r_frame_rate.den;
    info->timebase_num = h->lavf->streams[i]->time_base.num;
    info->timebase_den = h->lavf->streams[i]->time_base.den;
    /* lavf is thread unsafe as calling av_read_frame invalidates previously read AVPackets */
    info->thread_safe  = 0;
    h->vfr_input       = info->vfr;
    FAIL_IF_ERROR( avcodec_open2( c, avcodec_find_decoder( c->codec_id ), NULL ),
                   "could not find decoder for video stream\n" )

    /* prefetch the first frame and set/confirm flags */
    h->first_pic = malloc( sizeof(cli_pic_t) );
    FAIL_IF_ERROR( !h->first_pic || lavf_input.picture_alloc( h->first_pic, X264_CSP_OTHER, info->width, info->height ),
                   "malloc failed\n" )
    else if( read_frame_internal( h->first_pic, h, 0, info ) )
        return -1;

    info->width      = c->width;
    info->height     = c->height;
    info->csp        = h->first_pic->img.csp;
    info->num_frames = h->lavf->streams[i]->nb_frames;
    info->sar_height = c->sample_aspect_ratio.den;
    info->sar_width  = c->sample_aspect_ratio.num;

    /* avisynth stores rgb data vertically flipped. */
    if( !strcasecmp( get_filename_extension( psz_filename ), "avs" ) &&
        (c->pix_fmt == PIX_FMT_BGRA || c->pix_fmt == PIX_FMT_BGR24) )
        info->csp |= X264_CSP_VFLIP;

    *p_handle = h;

    return 0;
}

static int picture_alloc( cli_pic_t *pic, int csp, int width, int height )
{
    if( x264_cli_pic_alloc( pic, csp, width, height ) )
        return -1;
    pic->img.planes = 4;
    pic->opaque = malloc( sizeof(AVPacket) );
    if( !pic->opaque )
        return -1;
    av_init_packet( pic->opaque );
    return 0;
}

static int read_frame( cli_pic_t *pic, hnd_t handle, int i_frame )
{
    return read_frame_internal( pic, handle, i_frame, NULL );
}

static int release_frame( cli_pic_t *pic, hnd_t handle )
{
    x264_free_packet( pic->opaque );
    return 0;
}

static void picture_clean( cli_pic_t *pic )
{
    free( pic->opaque );
    memset( pic, 0, sizeof(cli_pic_t) );
}

static int close_file( hnd_t handle )
{
    lavf_hnd_t *h = handle;
    avcodec_close( h->lavf->streams[h->stream_id]->codec );
    av_close_input_file( h->lavf );
    free( h );
    return 0;
}

const cli_input_t lavf_input = { open_file, picture_alloc, read_frame, release_frame, picture_clean, close_file };
x264-snapshot-20120103-2245-stable/input/input.h
/*****************************************************************************
 * input.h: file input
 *****************************************************************************
 * Copyright (C) 2003-2011 x264 project
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
 *          Steven Walters <kemuri9@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#ifndef X264_INPUT_H
#define X264_INPUT_H

#include "x264cli.h"

/* options that are used by only some demuxers */
typedef struct
{
    char *index_file;
    char *format;
    char *resolution;
    char *colorspace;
    int bit_depth;
    char *timebase;
    int seek;
    int progress;
    int output_csp; /* convert to this csp, if applicable */
} cli_input_opt_t;

/* properties of the source given by the demuxer */
typedef struct
{
    int csp;         /* colorspace of the input */
    uint32_t fps_num;
    uint32_t fps_den;
    int height;
    int interlaced;
    int num_frames;
    uint32_t sar_width;
    uint32_t sar_height;
    int tff;
    int thread_safe; /* demuxer is thread_input safe */
    uint32_t timebase_num;
    uint32_t timebase_den;
    int vfr;
    int width;
} video_info_t;

/* image data type used by x264cli */
typedef struct
{
    int     csp;       /* colorspace */
    int     width;     /* width of the picture */
    int     height;    /* height of the picture */
    int     planes;    /* number of planes */
    uint8_t *plane[4]; /* pointers for each plane */
    int     stride[4]; /* strides for each plane */
} cli_image_t;

typedef struct
{
    cli_image_t img;
    int64_t pts;       /* input pts */
    int64_t duration;  /* frame duration - used for vfr */
    void    *opaque;   /* opaque handle */
} cli_pic_t;

typedef struct
{
    int (*open_file)( char *psz_filename, hnd_t *p_handle, video_info_t *info, cli_input_opt_t *opt );
    int (*picture_alloc)( cli_pic_t *pic, int csp, int width, int height );
    int (*read_frame)( cli_pic_t *pic, hnd_t handle, int i_frame );
    int (*release_frame)( cli_pic_t *pic, hnd_t handle );
    void (*picture_clean)( cli_pic_t *pic );
    int (*close_file)( hnd_t handle );
} cli_input_t;
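
/* Minimal driver sketch (illustrative; mirrors how x264cli uses this interface):
 *
 *     hnd_t handle;
 *     video_info_t info = {0};
 *     cli_input_opt_t opt = {0};
 *     if( !cli_input.open_file( filename, &handle, &info, &opt ) )
 *     {
 *         cli_pic_t pic;
 *         cli_input.picture_alloc( &pic, info.csp, info.width, info.height );
 *         for( int i = 0; i < info.num_frames; i++ )
 *             if( !cli_input.read_frame( &pic, handle, i ) )
 *             {
 *                 // ... consume pic ...
 *                 if( cli_input.release_frame )
 *                     cli_input.release_frame( &pic, handle );
 *             }
 *         cli_input.picture_clean( &pic );
 *         cli_input.close_file( handle );
 *     }
 */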

extern const cli_input_t raw_input;
extern const cli_input_t y4m_input;
extern const cli_input_t avs_input;
extern cli_input_t thread_input;
extern const cli_input_t lavf_input;
extern const cli_input_t ffms_input;
extern cli_input_t timecode_input;

extern cli_input_t cli_input;

/* extended colorspace list that isn't supported by libx264 but by the cli */
#define X264_CSP_CLI_MAX        X264_CSP_MAX     /* end of list         */
#define X264_CSP_OTHER          0x4000           /* non x264 colorspace */

typedef struct
{
    const char *name;
    int planes;
    float width[4];
    float height[4];
    int mod_width;
    int mod_height;
} x264_cli_csp_t;

extern const x264_cli_csp_t x264_cli_csps[];

int      x264_cli_csp_is_invalid( int csp );
int      x264_cli_csp_depth_factor( int csp );
int      x264_cli_pic_alloc( cli_pic_t *pic, int csp, int width, int height );
void     x264_cli_pic_clean( cli_pic_t *pic );
uint64_t x264_cli_pic_plane_size( int csp, int width, int height, int plane );
uint64_t x264_cli_pic_size( int csp, int width, int height );
const x264_cli_csp_t *x264_cli_get_csp( int csp );

#endif
x264-snapshot-20120103-2245-stable/input/ffms.c
/*****************************************************************************
 * ffms.c: ffmpegsource input
 *****************************************************************************
 * Copyright (C) 2009-2011 x264 project
 *
 * Authors: Mike Gurlitz <mike.gurlitz@gmail.com>
 *          Steven Walters <kemuri9@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "input.h"
#include <ffms.h>
#define FAIL_IF_ERROR( cond, ... ) FAIL_IF_ERR( cond, "ffms", __VA_ARGS__ )

#undef DECLARE_ALIGNED
#include <libavcodec/avcodec.h>
#include <libswscale/swscale.h>

#ifdef _WIN32
#include <windows.h>
#else
#define SetConsoleTitle(t)
#endif

typedef struct
{
    FFMS_VideoSource *video_source;
    FFMS_Track *track;
    int reduce_pts;
    int vfr_input;
    int num_frames;
    int64_t time;
} ffms_hnd_t;

static int FFMS_CC update_progress( int64_t current, int64_t total, void *private )
{
    int64_t *update_time = private;
    int64_t oldtime = *update_time;
    int64_t newtime = x264_mdate();
    if( oldtime && newtime - oldtime < UPDATE_INTERVAL )
        return 0;
    *update_time = newtime;

    char buf[200];
    sprintf( buf, "ffms [info]: indexing input file [%.1f%%]", 100.0 * current / total );
    fprintf( stderr, "%s  \r", buf+5 );
    SetConsoleTitle( buf );
    fflush( stderr );
    return 0;
}

static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, cli_input_opt_t *opt )
{
    ffms_hnd_t *h = calloc( 1, sizeof(ffms_hnd_t) );
    if( !h )
        return -1;
    FFMS_Init( 0, 0 );
    FFMS_ErrorInfo e;
    e.BufferSize = 0;
    int seekmode = opt->seek ? FFMS_SEEK_NORMAL : FFMS_SEEK_LINEAR_NO_RW;

    FFMS_Index *idx = NULL;
    if( opt->index_file )
    {
        struct stat index_s, input_s;
        if( !stat( opt->index_file, &index_s ) && !stat( psz_filename, &input_s ) &&
            input_s.st_mtime < index_s.st_mtime )
            idx = FFMS_ReadIndex( opt->index_file, &e );
    }
    if( !idx )
    {
        if( opt->progress )
        {
            idx = FFMS_MakeIndex( psz_filename, 0, 0, NULL, NULL, 0, update_progress, &h->time, &e );
            fprintf( stderr, "                                            \r" );
        }
        else
            idx = FFMS_MakeIndex( psz_filename, 0, 0, NULL, NULL, 0, NULL, NULL, &e );
        FAIL_IF_ERROR( !idx, "could not create index\n" )
        if( opt->index_file && FFMS_WriteIndex( opt->index_file, idx, &e ) )
            x264_cli_log( "ffms", X264_LOG_WARNING, "could not write index file\n" );
    }

    int trackno = FFMS_GetFirstTrackOfType( idx, FFMS_TYPE_VIDEO, &e );
    FAIL_IF_ERROR( trackno < 0, "could not find video track\n" )

    h->video_source = FFMS_CreateVideoSource( psz_filename, trackno, idx, 1, seekmode, &e );
    FAIL_IF_ERROR( !h->video_source, "could not create video source\n" )

    h->track = FFMS_GetTrackFromVideo( h->video_source );

    FFMS_DestroyIndex( idx );
    const FFMS_VideoProperties *videop = FFMS_GetVideoProperties( h->video_source );
    info->num_frames   = h->num_frames = videop->NumFrames;
    info->sar_height   = videop->SARDen;
    info->sar_width    = videop->SARNum;
    info->fps_den      = videop->FPSDenominator;
    info->fps_num      = videop->FPSNumerator;
    h->vfr_input       = info->vfr;
    /* ffms is thread unsafe as it uses a single frame buffer for all frame requests */
    info->thread_safe  = 0;

    const FFMS_Frame *frame = FFMS_GetFrame( h->video_source, 0, &e );
    FAIL_IF_ERROR( !frame, "could not read frame 0\n" )

    info->width      = frame->EncodedWidth;
    info->height     = frame->EncodedHeight;
    info->csp        = frame->EncodedPixelFormat | X264_CSP_OTHER;
    info->interlaced = frame->InterlacedFrame;
    info->tff        = frame->TopFieldFirst;

    /* ffms timestamps are in milliseconds. ffms also uses int64_ts for timebase,
     * so we need to reduce large timebases to prevent overflow */
    if( h->vfr_input )
    {
        const FFMS_TrackTimeBase *timebase = FFMS_GetTimeBase( h->track );
        int64_t timebase_num = timebase->Num;
        int64_t timebase_den = timebase->Den * 1000;
        h->reduce_pts = 0;

        while( timebase_num > UINT32_MAX || timebase_den > UINT32_MAX )
        {
            timebase_num >>= 1;
            timebase_den >>= 1;
            h->reduce_pts++;
        }
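        /* Illustrative: each halving above costs one low bit of pts precision;
         * read_frame compensates by shifting the returned PTS right by reduce_pts. */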
        info->timebase_num = timebase_num;
        info->timebase_den = timebase_den;
    }

    *p_handle = h;
    return 0;
}

static int picture_alloc( cli_pic_t *pic, int csp, int width, int height )
{
    if( x264_cli_pic_alloc( pic, csp, width, height ) )
        return -1;
    pic->img.planes = 4;
    return 0;
}

static int read_frame( cli_pic_t *pic, hnd_t handle, int i_frame )
{
    ffms_hnd_t *h = handle;
    if( i_frame >= h->num_frames )
        return -1;
    FFMS_ErrorInfo e;
    e.BufferSize = 0;
    const FFMS_Frame *frame = FFMS_GetFrame( h->video_source, i_frame, &e );
    FAIL_IF_ERROR( !frame, "could not read frame %d \n", i_frame )

    memcpy( pic->img.stride, frame->Linesize, sizeof(pic->img.stride) );
    memcpy( pic->img.plane, frame->Data, sizeof(pic->img.plane) );

    if( h->vfr_input )
    {
        const FFMS_FrameInfo *info = FFMS_GetFrameInfo( h->track, i_frame );
        FAIL_IF_ERROR( info->PTS == AV_NOPTS_VALUE, "invalid timestamp. "
                       "Use --force-cfr and specify a framerate with --fps\n" )

        pic->pts = info->PTS >> h->reduce_pts;
        pic->duration = 0;
    }
    return 0;
}

static void picture_clean( cli_pic_t *pic )
{
    memset( pic, 0, sizeof(cli_pic_t) );
}

static int close_file( hnd_t handle )
{
    ffms_hnd_t *h = handle;
    FFMS_DestroyVideoSource( h->video_source );
    free( h );
    return 0;
}

const cli_input_t ffms_input = { open_file, picture_alloc, read_frame, NULL, picture_clean, close_file };
x264-snapshot-20120103-2245-stable/input/avs.c
/*****************************************************************************
 * avs.c: avisynth input
 *****************************************************************************
 * Copyright (C) 2009-2011 x264 project
 *
 * Authors: Steven Walters <kemuri9@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "input.h"
#include <windows.h>
#define FAIL_IF_ERROR( cond, ... ) FAIL_IF_ERR( cond, "avs", __VA_ARGS__ )

#define AVSC_NO_DECLSPEC
#undef EXTERN_C
#include "extras/avisynth_c.h"
#define AVSC_DECLARE_FUNC(name) name##_func name

/* AVS uses a versioned interface to control backwards compatibility */
/* YV12 support is required, which was added in 2.5 */
#define AVS_INTERFACE_25 2

#if HAVE_SWSCALE
#include <libavutil/pixfmt.h>
#endif

/* maximum size of the sequence of filters to try on non script files */
#define AVS_MAX_SEQUENCE 5

#define LOAD_AVS_FUNC(name, continue_on_fail)\
{\
    h->func.name = (void*)GetProcAddress( h->library, #name );\
    if( !continue_on_fail && !h->func.name )\
        goto fail;\
}
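/* e.g. LOAD_AVS_FUNC( avs_invoke, 0 ) resolves the exported "avs_invoke" symbol into
 * h->func.avs_invoke and jumps to fail if the symbol is missing. */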

typedef struct
{
    AVS_Clip *clip;
    AVS_ScriptEnvironment *env;
    HMODULE library;
    int num_frames;
    struct
    {
        AVSC_DECLARE_FUNC( avs_clip_get_error );
        AVSC_DECLARE_FUNC( avs_create_script_environment );
        AVSC_DECLARE_FUNC( avs_delete_script_environment );
        AVSC_DECLARE_FUNC( avs_get_error );
        AVSC_DECLARE_FUNC( avs_get_frame );
        AVSC_DECLARE_FUNC( avs_get_video_info );
        AVSC_DECLARE_FUNC( avs_function_exists );
        AVSC_DECLARE_FUNC( avs_invoke );
        AVSC_DECLARE_FUNC( avs_release_clip );
        AVSC_DECLARE_FUNC( avs_release_value );
        AVSC_DECLARE_FUNC( avs_release_video_frame );
        AVSC_DECLARE_FUNC( avs_take_clip );
    } func;
} avs_hnd_t;

/* load the library and functions we require from it */
static int x264_avs_load_library( avs_hnd_t *h )
{
    h->library = LoadLibrary( "avisynth" );
    if( !h->library )
        return -1;
    LOAD_AVS_FUNC( avs_clip_get_error, 0 );
    LOAD_AVS_FUNC( avs_create_script_environment, 0 );
    LOAD_AVS_FUNC( avs_delete_script_environment, 1 );
    LOAD_AVS_FUNC( avs_get_error, 1 );
    LOAD_AVS_FUNC( avs_get_frame, 0 );
    LOAD_AVS_FUNC( avs_get_video_info, 0 );
    LOAD_AVS_FUNC( avs_function_exists, 0 );
    LOAD_AVS_FUNC( avs_invoke, 0 );
    LOAD_AVS_FUNC( avs_release_clip, 0 );
    LOAD_AVS_FUNC( avs_release_value, 0 );
    LOAD_AVS_FUNC( avs_release_video_frame, 0 );
    LOAD_AVS_FUNC( avs_take_clip, 0 );
    return 0;
fail:
    FreeLibrary( h->library );
    return -1;
}

/* generate a filter sequence to try based on the filename extension */
static void avs_build_filter_sequence( char *filename_ext, const char *filter[AVS_MAX_SEQUENCE+1] )
{
    int i = 0;
    const char *all_purpose[] = { "FFmpegSource2", "DSS2", "DirectShowSource", 0 };
    if( !strcasecmp( filename_ext, "avi" ) )
        filter[i++] = "AVISource";
    if( !strcasecmp( filename_ext, "d2v" ) )
        filter[i++] = "MPEG2Source";
    if( !strcasecmp( filename_ext, "dga" ) )
        filter[i++] = "AVCSource";
    for( int j = 0; all_purpose[j] && i < AVS_MAX_SEQUENCE; j++ )
        filter[i++] = all_purpose[j];
}
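/* e.g. "video.d2v" yields { "MPEG2Source", "FFmpegSource2", "DSS2",
 * "DirectShowSource" }; unknown extensions get only the all-purpose filters. */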

static AVS_Value update_clip( avs_hnd_t *h, const AVS_VideoInfo **vi, AVS_Value res, AVS_Value release )
{
    h->func.avs_release_clip( h->clip );
    h->clip = h->func.avs_take_clip( res, h->env );
    h->func.avs_release_value( release );
    *vi = h->func.avs_get_video_info( h->clip );
    return res;
}

static float get_avs_version( avs_hnd_t *h )
{
    FAIL_IF_ERROR( !h->func.avs_function_exists( h->env, "VersionNumber" ), "VersionNumber does not exist\n" )
    AVS_Value ver = h->func.avs_invoke( h->env, "VersionNumber", avs_new_value_array( NULL, 0 ), NULL );
    FAIL_IF_ERROR( avs_is_error( ver ), "unable to determine avisynth version: %s\n", avs_as_error( ver ) )
    FAIL_IF_ERROR( !avs_is_float( ver ), "VersionNumber did not return a float value\n" );
    float ret = avs_as_float( ver );
    h->func.avs_release_value( ver );
    return ret;
}

static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, cli_input_opt_t *opt )
{
    FILE *fh = fopen( psz_filename, "r" );
    if( !fh )
        return -1;
    FAIL_IF_ERROR( !x264_is_regular_file( fh ), "AVS input is incompatible with non-regular file `%s'\n", psz_filename );
    fclose( fh );

    avs_hnd_t *h = malloc( sizeof(avs_hnd_t) );
    if( !h )
        return -1;
    FAIL_IF_ERROR( x264_avs_load_library( h ), "failed to load avisynth\n" )
    h->env = h->func.avs_create_script_environment( AVS_INTERFACE_25 );
    if( h->func.avs_get_error )
    {
        const char *error = h->func.avs_get_error( h->env );
        FAIL_IF_ERROR( error, "%s\n", error );
    }
    float avs_version = get_avs_version( h );
    if( avs_version <= 0 )
        return -1;
    x264_cli_log( "avs", X264_LOG_DEBUG, "using avisynth version %.2f\n", avs_version );
    AVS_Value arg = avs_new_value_string( psz_filename );
    AVS_Value res;
    char *filename_ext = get_filename_extension( psz_filename );

    if( !strcasecmp( filename_ext, "avs" ) )
    {
        res = h->func.avs_invoke( h->env, "Import", arg, NULL );
        FAIL_IF_ERROR( avs_is_error( res ), "%s\n", avs_as_string( res ) )
        /* check if the user is using a multi-threaded script and apply distributor if necessary.
           adapted from avisynth's vfw interface */
        AVS_Value mt_test = h->func.avs_invoke( h->env, "GetMTMode", avs_new_value_bool( 0 ), NULL );
        int mt_mode = avs_is_int( mt_test ) ? avs_as_int( mt_test ) : 0;
        h->func.avs_release_value( mt_test );
        if( mt_mode > 0 && mt_mode < 5 )
        {
            AVS_Value temp = h->func.avs_invoke( h->env, "Distributor", res, NULL );
            h->func.avs_release_value( res );
            res = temp;
        }
    }
    else /* non script file */
    {
        /* cycle through known source filters to find one that works */
        const char *filter[AVS_MAX_SEQUENCE+1] = { 0 };
        avs_build_filter_sequence( filename_ext, filter );
        int i;
        for( i = 0; filter[i]; i++ )
        {
            x264_cli_log( "avs", X264_LOG_INFO, "trying %s... ", filter[i] );
            if( !h->func.avs_function_exists( h->env, filter[i] ) )
            {
                x264_cli_printf( X264_LOG_INFO, "not found\n" );
                continue;
            }
            if( !strncasecmp( filter[i], "FFmpegSource", 12 ) )
            {
                x264_cli_printf( X264_LOG_INFO, "indexing... " );
                fflush( stderr );
            }
            res = h->func.avs_invoke( h->env, filter[i], arg, NULL );
            if( !avs_is_error( res ) )
            {
                x264_cli_printf( X264_LOG_INFO, "succeeded\n" );
                break;
            }
            x264_cli_printf( X264_LOG_INFO, "failed\n" );
        }
        FAIL_IF_ERROR( !filter[i], "unable to find source filter to open `%s'\n", psz_filename )
    }
    FAIL_IF_ERROR( !avs_is_clip( res ), "`%s' didn't return a video clip\n", psz_filename )
    h->clip = h->func.avs_take_clip( res, h->env );
    const AVS_VideoInfo *vi = h->func.avs_get_video_info( h->clip );
    FAIL_IF_ERROR( !avs_has_video( vi ), "`%s' has no video data\n", psz_filename )
    /* if the clip is made of fields instead of frames, call weave to make them frames */
    if( avs_is_field_based( vi ) )
    {
        x264_cli_log( "avs", X264_LOG_WARNING, "detected fieldbased (separated) input, weaving to frames\n" );
        AVS_Value tmp = h->func.avs_invoke( h->env, "Weave", res, NULL );
        FAIL_IF_ERROR( avs_is_error( tmp ), "couldn't weave fields into frames\n" )
        res = update_clip( h, &vi, tmp, res );
        info->interlaced = 1;
        info->tff = avs_is_tff( vi );
    }
#if !HAVE_SWSCALE
    /* if swscale is not available, convert the CSP if necessary */
    if( (opt->output_csp == X264_CSP_I420 && !avs_is_yv12( vi )) || (opt->output_csp == X264_CSP_I422 && !avs_is_yv16( vi )) ||
        (opt->output_csp == X264_CSP_I444 && !avs_is_yv24( vi )) || (opt->output_csp == X264_CSP_RGB && !avs_is_rgb( vi )) )
    {
        FAIL_IF_ERROR( avs_version < 2.6f && (opt->output_csp == X264_CSP_I422 || opt->output_csp == X264_CSP_I444),
                       "avisynth >= 2.6 is required for i422/i444 output\n" )

        const char *csp = opt->output_csp == X264_CSP_I420 ? "YV12" :
                          opt->output_csp == X264_CSP_I422 ? "YV16" :
                          opt->output_csp == X264_CSP_I444 ? "YV24" : "RGB";
        x264_cli_log( "avs", X264_LOG_WARNING, "converting input clip to %s\n", csp );
        FAIL_IF_ERROR( opt->output_csp < X264_CSP_I444 && (vi->width&1),
                       "input clip width not divisible by 2 (%dx%d)\n", vi->width, vi->height )
        FAIL_IF_ERROR( opt->output_csp == X264_CSP_I420 && info->interlaced && (vi->height&3),
                       "input clip height not divisible by 4 (%dx%d)\n", vi->width, vi->height )
        FAIL_IF_ERROR( (opt->output_csp == X264_CSP_I420 || info->interlaced) && (vi->height&1),
                       "input clip height not divisible by 2 (%dx%d)\n", vi->width, vi->height )
        const char *arg_name[2] = { NULL, "interlaced" };
        AVS_Value arg_arr[2] = { res, avs_new_value_bool( info->interlaced ) };
        char conv_func[14] = { "ConvertTo" };
        strcat( conv_func, csp );
        AVS_Value res2 = h->func.avs_invoke( h->env, conv_func, avs_new_value_array( arg_arr, 2 ), arg_name );
        FAIL_IF_ERROR( avs_is_error( res2 ), "couldn't convert input clip to %s\n", csp )
        res = update_clip( h, &vi, res2, res );
    }
#endif
    h->func.avs_release_value( res );

    info->width   = vi->width;
    info->height  = vi->height;
    info->fps_num = vi->fps_numerator;
    info->fps_den = vi->fps_denominator;
    h->num_frames = info->num_frames = vi->num_frames;
    info->thread_safe = 1;
    if( avs_is_rgb32( vi ) )
        info->csp = X264_CSP_BGRA | X264_CSP_VFLIP;
    else if( avs_is_rgb24( vi ) )
        info->csp = X264_CSP_BGR | X264_CSP_VFLIP;
    else if( avs_is_yv24( vi ) )
        info->csp = X264_CSP_I444;
    else if( avs_is_yv16( vi ) )
        info->csp = X264_CSP_I422;
    else if( avs_is_yv12( vi ) )
        info->csp = X264_CSP_I420;
#if HAVE_SWSCALE
    else if( avs_is_yuy2( vi ) )
        info->csp = PIX_FMT_YUYV422 | X264_CSP_OTHER;
    else if( avs_is_yv411( vi ) )
        info->csp = PIX_FMT_YUV411P | X264_CSP_OTHER;
    else if( avs_is_y8( vi ) )
        info->csp = PIX_FMT_GRAY8 | X264_CSP_OTHER;
#endif
    else
        info->csp = X264_CSP_NONE;
    info->vfr = 0;

    *p_handle = h;
    return 0;
}

static int picture_alloc( cli_pic_t *pic, int csp, int width, int height )
{
    if( x264_cli_pic_alloc( pic, X264_CSP_NONE, width, height ) )
        return -1;
    pic->img.csp = csp;
    const x264_cli_csp_t *cli_csp = x264_cli_get_csp( csp );
    if( cli_csp )
        pic->img.planes = cli_csp->planes;
#if HAVE_SWSCALE
    else if( csp == (PIX_FMT_YUV411P | X264_CSP_OTHER) )
        pic->img.planes = 3;
    else
        pic->img.planes = 1; //y8 and yuy2 are one plane
#endif
    return 0;
}

static int read_frame( cli_pic_t *pic, hnd_t handle, int i_frame )
{
    static const int plane[3] = { AVS_PLANAR_Y, AVS_PLANAR_U, AVS_PLANAR_V };
    avs_hnd_t *h = handle;
    if( i_frame >= h->num_frames )
        return -1;
    AVS_VideoFrame *frm = pic->opaque = h->func.avs_get_frame( h->clip, i_frame );
    const char *err = h->func.avs_clip_get_error( h->clip );
    FAIL_IF_ERROR( err, "%s occurred while reading frame %d\n", err, i_frame )
    for( int i = 0; i < pic->img.planes; i++ )
    {
        /* explicitly cast away the const attribute to avoid a warning */
        pic->img.plane[i] = (uint8_t*)avs_get_read_ptr_p( frm, plane[i] );
        pic->img.stride[i] = avs_get_pitch_p( frm, plane[i] );
    }
    return 0;
}

static int release_frame( cli_pic_t *pic, hnd_t handle )
{
    avs_hnd_t *h = handle;
    h->func.avs_release_video_frame( pic->opaque );
    return 0;
}

static void picture_clean( cli_pic_t *pic )
{
    memset( pic, 0, sizeof(cli_pic_t) );
}

static int close_file( hnd_t handle )
{
    avs_hnd_t *h = handle;
    h->func.avs_release_clip( h->clip );
    if( h->func.avs_delete_script_environment )
        h->func.avs_delete_script_environment( h->env );
    FreeLibrary( h->library );
    free( h );
    return 0;
}

const cli_input_t avs_input = { open_file, picture_alloc, read_frame, release_frame, picture_clean, close_file };
x264-snapshot-20120103-2245-stable/input/y4m.c
/*****************************************************************************
 * y4m.c: y4m input
 *****************************************************************************
 * Copyright (C) 2003-2011 x264 project
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "input.h"
#define FAIL_IF_ERROR( cond, ... ) FAIL_IF_ERR( cond, "y4m", __VA_ARGS__ )

typedef struct
{
    FILE *fh;
    int next_frame;
    int seq_header_len;
    int frame_header_len;
    uint64_t frame_size;
    uint64_t plane_size[3];
} y4m_hnd_t;

#define Y4M_MAGIC "YUV4MPEG2"
#define MAX_YUV4_HEADER 80
#define Y4M_FRAME_MAGIC "FRAME"
#define MAX_FRAME_HEADER 80
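
/* Illustrative input this parser accepts: a sequence header such as
 *   YUV4MPEG2 W720 H480 F30000:1001 Ip A1:1 C420
 * followed by frames, each a "FRAME" line plus width*height*3/2 bytes for 4:2:0. */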

static int csp_string_to_int( char *csp_name )
{
    int csp = X264_CSP_MAX;
    if( !strncmp( "420", csp_name, 3 ) )
        csp = X264_CSP_I420;
    else if( !strncmp( "422", csp_name, 3 ) )
        csp = X264_CSP_I422;
    else if( !strncmp( "444", csp_name, 3 ) && strncmp( "444alpha", csp_name, 8 ) ) // only accept alphaless 4:4:4
        csp = X264_CSP_I444;
    return csp;
}

static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, cli_input_opt_t *opt )
{
    y4m_hnd_t *h = malloc( sizeof(y4m_hnd_t) );
    int i;
    uint32_t n, d;
    char header[MAX_YUV4_HEADER+10];
    char *tokend, *header_end;
    int colorspace = X264_CSP_NONE;
    int alt_colorspace = X264_CSP_NONE;
    if( !h )
        return -1;

    h->next_frame = 0;
    info->vfr = 0;

    if( !strcmp( psz_filename, "-" ) )
        h->fh = stdin;
    else
        h->fh = fopen(psz_filename, "rb");
    if( h->fh == NULL )
        return -1;

    h->frame_header_len = strlen( Y4M_FRAME_MAGIC )+1;

    /* Read header */
    for( i = 0; i < MAX_YUV4_HEADER; i++ )
    {
        header[i] = fgetc( h->fh );
        if( header[i] == '\n' )
        {
            /* Add a space after last option. Makes parsing "444" vs
               "444alpha" easier. */
            header[i+1] = 0x20;
            header[i+2] = 0;
            break;
        }
    }
    if( i == MAX_YUV4_HEADER || strncmp( header, Y4M_MAGIC, strlen( Y4M_MAGIC ) ) )
        return -1;

    /* Scan properties */
    header_end = &header[i+1]; /* Include space */
    h->seq_header_len = i+1;
    for( char *tokstart = &header[strlen( Y4M_MAGIC )+1]; tokstart < header_end; tokstart++ )
    {
        if( *tokstart == 0x20 )
            continue;
        switch( *tokstart++ )
        {
            case 'W': /* Width. Required. */
                info->width = strtol( tokstart, &tokend, 10 );
                tokstart=tokend;
                break;
            case 'H': /* Height. Required. */
                info->height = strtol( tokstart, &tokend, 10 );
                tokstart=tokend;
                break;
            case 'C': /* Color space */
                colorspace = csp_string_to_int( tokstart );
                tokstart = strchr( tokstart, 0x20 );
                break;
            case 'I': /* Interlace type */
                switch( *tokstart++ )
                {
                    case 't':
                        info->interlaced = 1;
                        info->tff = 1;
                        break;
                    case 'b':
                        info->interlaced = 1;
                        info->tff = 0;
                        break;
                    case 'm':
                        info->interlaced = 1;
                        break;
                    //case '?':
                    //case 'p':
                    default:
                        break;
                }
                break;
            case 'F': /* Frame rate - 0:0 if unknown */
                if( sscanf( tokstart, "%u:%u", &n, &d ) == 2 && n && d )
                {
                    x264_reduce_fraction( &n, &d );
                    info->fps_num = n;
                    info->fps_den = d;
                }
                tokstart = strchr( tokstart, 0x20 );
                break;
            case 'A': /* Pixel aspect - 0:0 if unknown */
                /* Don't override the aspect ratio if sar has been explicitly set on the commandline. */
                if( sscanf( tokstart, "%u:%u", &n, &d ) == 2 && n && d )
                {
                    x264_reduce_fraction( &n, &d );
                    info->sar_width  = n;
                    info->sar_height = d;
                }
                tokstart = strchr( tokstart, 0x20 );
                break;
            case 'X': /* Vendor extensions */
                if( !strncmp( "YSCSS=", tokstart, 6 ) )
                {
                    /* Older nonstandard pixel format representation */
                    tokstart += 6;
                    alt_colorspace = csp_string_to_int( tokstart );
                }
                tokstart = strchr( tokstart, 0x20 );
                break;
        }
    }

    if( colorspace == X264_CSP_NONE )
        colorspace = alt_colorspace;

    // default to 4:2:0 if nothing is specified
    if( colorspace == X264_CSP_NONE )
        colorspace = X264_CSP_I420;

    FAIL_IF_ERROR( colorspace <= X264_CSP_NONE || colorspace >= X264_CSP_MAX, "colorspace unhandled\n" )

    info->thread_safe = 1;
    info->num_frames  = 0;
    info->csp         = colorspace;
    h->frame_size     = h->frame_header_len;
    for( i = 0; i < x264_cli_csps[info->csp].planes; i++ )
    {
        h->plane_size[i] = x264_cli_pic_plane_size( info->csp, info->width, info->height, i );
        h->frame_size += h->plane_size[i];
    }

    /* Estimate frame count for seekable input, assuming the most common case:
     * every frame header is the bare "FRAME" */
    if( x264_is_regular_file( h->fh ) )
    {
        uint64_t init_pos = ftell( h->fh );
        fseek( h->fh, 0, SEEK_END );
        uint64_t i_size = ftell( h->fh );
        fseek( h->fh, init_pos, SEEK_SET );
        info->num_frames = (i_size - h->seq_header_len) / h->frame_size;
    }

    *p_handle = h;
    return 0;
}

static int read_frame_internal( cli_pic_t *pic, y4m_hnd_t *h )
{
    size_t slen = strlen( Y4M_FRAME_MAGIC );
    int i = 0;
    char header[16];

    /* Read frame header - without terminating '\n' */
    if( fread( header, 1, slen, h->fh ) != slen )
        return -1;

    header[slen] = 0;
    FAIL_IF_ERROR( strncmp( header, Y4M_FRAME_MAGIC, slen ), "bad header magic (%"PRIx32" <=> %s)\n",
                   M32(header), header )

    /* Skip most of it */
    while( i < MAX_FRAME_HEADER && fgetc( h->fh ) != '\n' )
        i++;
    FAIL_IF_ERROR( i == MAX_FRAME_HEADER, "bad frame header!\n" )
    h->frame_size = h->frame_size - h->frame_header_len + i+slen+1;
    h->frame_header_len = i+slen+1;

    int error = 0;
    for( i = 0; i < pic->img.planes && !error; i++ )
        error |= fread( pic->img.plane[i], h->plane_size[i], 1, h->fh ) <= 0;
    return error;
}

static int read_frame( cli_pic_t *pic, hnd_t handle, int i_frame )
{
    y4m_hnd_t *h = handle;

    if( i_frame > h->next_frame )
    {
        if( x264_is_regular_file( h->fh ) )
            fseek( h->fh, h->frame_size * i_frame + h->seq_header_len, SEEK_SET );
        else
            while( i_frame > h->next_frame )
            {
                if( read_frame_internal( pic, h ) )
                    return -1;
                h->next_frame++;
            }
    }

    if( read_frame_internal( pic, h ) )
        return -1;

    h->next_frame = i_frame+1;
    return 0;
}

static int close_file( hnd_t handle )
{
    y4m_hnd_t *h = handle;
    if( !h || !h->fh )
        return 0;
    fclose( h->fh );
    free( h );
    return 0;
}

const cli_input_t y4m_input = { open_file, x264_cli_pic_alloc, read_frame, NULL, x264_cli_pic_clean, close_file };
x264-snapshot-20120103-2245-stable/input/timecode.c
/*****************************************************************************
 * timecode.c: timecode file input
 *****************************************************************************
 * Copyright (C) 2010-2011 x264 project
 *
 * Authors: Yusuke Nakamura <muken.the.vfrmaniac@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "input.h"
#define FAIL_IF_ERROR( cond, ... ) FAIL_IF_ERR( cond, "timecode", __VA_ARGS__ )

typedef struct
{
    cli_input_t input;
    hnd_t p_handle;
    int auto_timebase_num;
    int auto_timebase_den;
    uint64_t timebase_num;
    uint64_t timebase_den;
    int stored_pts_num;
    int64_t *pts;
    double assume_fps;
    double last_timecode;
} timecode_hnd_t;

static inline double sigexp10( double value, double *exponent )
{
    /* Split a double into its significand and power-of-ten exponent. */
    *exponent = pow( 10, floor( log10( value ) ) );
    return value / *exponent;
}
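/* e.g. sigexp10( 29.97, &e ) returns 2.997 with e = 10. */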

#define DOUBLE_EPSILON 5e-6
#define MKV_TIMEBASE_DEN 1000000000

static double correct_fps( double fps, timecode_hnd_t *h )
{
    int i = 1;
    uint64_t fps_num, fps_den;
    double exponent;
    double fps_sig = sigexp10( fps, &exponent );
    while( 1 )
    {
        fps_den = i * h->timebase_num;
        fps_num = round( fps_den * fps_sig ) * exponent;
        FAIL_IF_ERROR( fps_num > UINT32_MAX, "tcfile fps correction failed.\n"
                       "                  Specify an appropriate timebase manually or remake tcfile.\n" )
        if( fabs( ((double)fps_num / fps_den) / exponent - fps_sig ) < DOUBLE_EPSILON )
            break;
        ++i;
    }
    if( h->auto_timebase_den )
    {
        h->timebase_den = h->timebase_den ? lcm( h->timebase_den, fps_num ) : fps_num;
        if( h->timebase_den > UINT32_MAX )
            h->auto_timebase_den = 0;
    }
    return (double)fps_num / fps_den;
}
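/* Illustrative: with timebase_num = 1001, correct_fps( 29.97, h ) settles on
 * fps_num/fps_den = 30000/1001 on its first iteration and returns ~29.970030. */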

static int try_mkv_timebase_den( double *fpss, timecode_hnd_t *h, int loop_num )
{
    h->timebase_num = 0;
    h->timebase_den = MKV_TIMEBASE_DEN;
    for( int num = 0; num < loop_num; num++ )
    {
        uint64_t fps_den;
        double exponent;
        double fps_sig = sigexp10( fpss[num], &exponent );
        fps_den = round( MKV_TIMEBASE_DEN / fps_sig ) / exponent;
        h->timebase_num = fps_den && h->timebase_num ? gcd( h->timebase_num, fps_den ) : fps_den;
        FAIL_IF_ERROR( h->timebase_num > UINT32_MAX || !h->timebase_num, "automatic timebase generation failed.\n"
                       "                  Specify timebase manually.\n" )
    }
    return 0;
}
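
/* Illustrative tcfile contents. v1 declares a default fps plus per-range overrides:
 *     # timecode format v1
 *     Assume 29.970000
 *     0,959,23.976000
 * v2 lists one timestamp in milliseconds per frame:
 *     # timecode format v2
 *     0
 *     33.3667
 *     66.7334
 */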

static int parse_tcfile( FILE *tcfile_in, timecode_hnd_t *h, video_info_t *info )
{
    char buff[256];
    int ret, tcfv, num, seq_num, timecodes_num;
    double *timecodes = NULL;
    double *fpss = NULL;

    ret = fscanf( tcfile_in, "# timecode format v%d", &tcfv );
    FAIL_IF_ERROR( ret != 1 || (tcfv != 1 && tcfv != 2), "unsupported timecode format\n" )
#define NO_TIMECODE_LINE (buff[0] == '#' || buff[0] == '\n' || buff[0] == '\r')
    if( tcfv == 1 )
    {
        uint64_t file_pos;
        double assume_fps, seq_fps;
        int start, end = -1;
        int prev_start = -1, prev_end = -1;

        h->assume_fps = 0;
        for( num = 2; fgets( buff, sizeof(buff), tcfile_in ) != NULL; num++ )
        {
            if( NO_TIMECODE_LINE )
                continue;
            FAIL_IF_ERROR( sscanf( buff, "assume %lf", &h->assume_fps ) != 1 && sscanf( buff, "Assume %lf", &h->assume_fps ) != 1,
                           "tcfile parsing error: assumed fps not found\n" )
            break;
        }
        FAIL_IF_ERROR( h->assume_fps <= 0, "invalid assumed fps %.6f\n", h->assume_fps )

        file_pos = ftell( tcfile_in );
        h->stored_pts_num = 0;
        for( seq_num = 0; fgets( buff, sizeof(buff), tcfile_in ) != NULL; num++ )
        {
            if( NO_TIMECODE_LINE )
            {
                if( sscanf( buff, "# TDecimate Mode 3:  Last Frame = %d", &end ) == 1 )
                    h->stored_pts_num = end + 1;
                continue;
            }
            ret = sscanf( buff, "%d,%d,%lf", &start, &end, &seq_fps );
            FAIL_IF_ERROR( ret != 3 && ret != EOF, "invalid input tcfile\n" )
            FAIL_IF_ERROR( start > end || start <= prev_start || end <= prev_end || seq_fps <= 0,
                           "invalid input tcfile at line %d: %s\n", num, buff )
            prev_start = start;
            prev_end = end;
            if( h->auto_timebase_den || h->auto_timebase_num )
                ++seq_num;
        }
        if( !h->stored_pts_num )
            h->stored_pts_num = end + 2;
        timecodes_num = h->stored_pts_num;
        fseek( tcfile_in, file_pos, SEEK_SET );

        timecodes = malloc( timecodes_num * sizeof(double) );
        if( !timecodes )
            return -1;
        if( h->auto_timebase_den || h->auto_timebase_num )
        {
            fpss = malloc( (seq_num + 1) * sizeof(double) );
            if( !fpss )
                goto fail;
        }

        assume_fps = correct_fps( h->assume_fps, h );
        if( assume_fps < 0 )
            goto fail;
        timecodes[0] = 0;
        for( num = seq_num = 0; num < timecodes_num - 1 && fgets( buff, sizeof(buff), tcfile_in ) != NULL; )
        {
            if( NO_TIMECODE_LINE )
                continue;
            ret = sscanf( buff, "%d,%d,%lf", &start, &end, &seq_fps );
            if( ret != 3 )
                start = end = timecodes_num - 1;
            for( ; num < start && num < timecodes_num - 1; num++ )
                timecodes[num + 1] = timecodes[num] + 1 / assume_fps;
            if( num < timecodes_num - 1 )
            {
                if( h->auto_timebase_den || h->auto_timebase_num )
                    fpss[seq_num++] = seq_fps;
                seq_fps = correct_fps( seq_fps, h );
                if( seq_fps < 0 )
                    goto fail;
                for( num = start; num <= end && num < timecodes_num - 1; num++ )
                    timecodes[num + 1] = timecodes[num] + 1 / seq_fps;
            }
        }
        for( ; num < timecodes_num - 1; num++ )
            timecodes[num + 1] = timecodes[num] + 1 / assume_fps;
        if( h->auto_timebase_den || h->auto_timebase_num )
            fpss[seq_num] = h->assume_fps;

        if( h->auto_timebase_num && !h->auto_timebase_den )
        {
            double exponent;
            double assume_fps_sig, seq_fps_sig;
            if( try_mkv_timebase_den( fpss, h, seq_num + 1 ) < 0 )
                goto fail;
            fseek( tcfile_in, file_pos, SEEK_SET );
            assume_fps_sig = sigexp10( h->assume_fps, &exponent );
            assume_fps = MKV_TIMEBASE_DEN / ( round( MKV_TIMEBASE_DEN / assume_fps_sig ) / exponent );
            for( num = 0; num < timecodes_num - 1 && fgets( buff, sizeof(buff), tcfile_in ) != NULL; )
            {
                if( NO_TIMECODE_LINE )
                    continue;
                ret = sscanf( buff, "%d,%d,%lf", &start, &end, &seq_fps );
                if( ret != 3 )
                    start = end = timecodes_num - 1;
                seq_fps_sig = sigexp10( seq_fps, &exponent );
                seq_fps = MKV_TIMEBASE_DEN / ( round( MKV_TIMEBASE_DEN / seq_fps_sig ) / exponent );
                for( ; num < start && num < timecodes_num - 1; num++ )
                    timecodes[num + 1] = timecodes[num] + 1 / assume_fps;
                for( num = start; num <= end && num < timecodes_num - 1; num++ )
                    timecodes[num + 1] = timecodes[num] + 1 / seq_fps;
            }
            for( ; num < timecodes_num - 1; num++ )
                timecodes[num + 1] = timecodes[num] + 1 / assume_fps;
        }
        if( fpss )
        {
            free( fpss );
            fpss = NULL;
        }

        h->assume_fps = assume_fps;
        h->last_timecode = timecodes[timecodes_num - 1];
    }
    else    /* tcfv == 2 */
    {
        uint64_t file_pos = ftell( tcfile_in );

        h->stored_pts_num = 0;
        while( fgets( buff, sizeof(buff), tcfile_in ) != NULL )
        {
            if( NO_TIMECODE_LINE )
            {
                if( !h->stored_pts_num )
                    file_pos = ftell( tcfile_in );
                continue;
            }
            h->stored_pts_num++;
        }
        timecodes_num = h->stored_pts_num;
        FAIL_IF_ERROR( !timecodes_num, "input tcfile doesn't have any timecodes!\n" )
        fseek( tcfile_in, file_pos, SEEK_SET );

        timecodes = malloc( timecodes_num * sizeof(double) );
        if( !timecodes )
            return -1;

        num = 0;
        if( fgets( buff, sizeof(buff), tcfile_in ) != NULL )
        {
            ret = sscanf( buff, "%lf", &timecodes[0] );
            timecodes[0] *= 1e-3;         /* Timecode format v2 is expressed in milliseconds. */
            FAIL_IF_ERROR( ret != 1, "invalid input tcfile for frame 0\n" )
            for( num = 1; num < timecodes_num && fgets( buff, sizeof(buff), tcfile_in ) != NULL; )
            {
                if( NO_TIMECODE_LINE )
                    continue;
                ret = sscanf( buff, "%lf", &timecodes[num] );
                timecodes[num] *= 1e-3;         /* Timecode format v2 is expressed in milliseconds. */
                FAIL_IF_ERROR( ret != 1 || timecodes[num] <= timecodes[num - 1],
                               "invalid input tcfile for frame %d\n", num )
                ++num;
            }
        }
        FAIL_IF_ERROR( num < timecodes_num, "failed to read input tcfile for frame %d\n", num )

        if( timecodes_num == 1 )
            h->timebase_den = info->fps_num;
        else if( h->auto_timebase_den )
        {
            fpss = malloc( (timecodes_num - 1) * sizeof(double) );
            if( !fpss )
                goto fail;
            for( num = 0; num < timecodes_num - 1; num++ )
            {
                fpss[num] = 1 / (timecodes[num + 1] - timecodes[num]);
                if( h->auto_timebase_den )
                {
                    int i = 1;
                    uint64_t fps_num, fps_den;
                    double exponent;
                    double fps_sig = sigexp10( fpss[num], &exponent );
                    while( 1 )
                    {
                        fps_den = i * h->timebase_num;
                        fps_num = round( fps_den * fps_sig ) * exponent;
                        if( fps_num > UINT32_MAX || fabs( ((double)fps_num / fps_den) / exponent - fps_sig ) < DOUBLE_EPSILON )
                            break;
                        ++i;
                    }
                    h->timebase_den = fps_num && h->timebase_den ? lcm( h->timebase_den, fps_num ) : fps_num;
                    if( h->timebase_den > UINT32_MAX )
                    {
                        h->auto_timebase_den = 0;
                        continue;
                    }
                }
            }
            if( h->auto_timebase_num && !h->auto_timebase_den )
                if( try_mkv_timebase_den( fpss, h, timecodes_num - 1 ) < 0 )
                    goto fail;
            free( fpss );
            fpss = NULL;
        }

        if( timecodes_num > 1 )
            h->assume_fps = 1 / (timecodes[timecodes_num - 1] - timecodes[timecodes_num - 2]);
        else
            h->assume_fps = (double)info->fps_num / info->fps_den;
        h->last_timecode = timecodes[timecodes_num - 1];
    }
#undef NO_TIMECODE_LINE
    if( h->auto_timebase_den || h->auto_timebase_num )
    {
        uint64_t i = gcd( h->timebase_num, h->timebase_den );
        h->timebase_num /= i;
        h->timebase_den /= i;
        x264_cli_log( "timecode", X264_LOG_INFO, "automatic timebase generation %"PRIu64"/%"PRIu64"\n", h->timebase_num, h->timebase_den );
    }
    else FAIL_IF_ERROR( h->timebase_den > UINT32_MAX || !h->timebase_den, "automatic timebase generation failed.\n"
                        "                  Specify an appropriate timebase manually.\n" )

    h->pts = malloc( h->stored_pts_num * sizeof(int64_t) );
    if( !h->pts )
        goto fail;
    for( num = 0; num < h->stored_pts_num; num++ )
    {
        h->pts[num] = timecodes[num] * ((double)h->timebase_den / h->timebase_num) + 0.5;
        FAIL_IF_ERROR( num > 0 && h->pts[num] <= h->pts[num - 1], "invalid timebase or timecode for frame %d\n", num )
    }

    free( timecodes );
    return 0;

fail:
    if( timecodes )
        free( timecodes );
    if( fpss )
        free( fpss );
    return -1;
}

#undef DOUBLE_EPSILON
#undef MKV_TIMEBASE_DEN

static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, cli_input_opt_t *opt )
{
    int ret = 0;
    FILE *tcfile_in;
    timecode_hnd_t *h = malloc( sizeof(timecode_hnd_t) );
    FAIL_IF_ERROR( !h, "malloc failed\n" )
    h->input = cli_input;
    h->p_handle = *p_handle;
    h->pts = NULL;
    if( opt->timebase )
    {
        ret = sscanf( opt->timebase, "%"SCNu64"/%"SCNu64, &h->timebase_num, &h->timebase_den );
        if( ret == 1 )
        {
            h->timebase_num = strtoul( opt->timebase, NULL, 10 );
            h->timebase_den = 0; /* set later by auto timebase generation */
        }
        FAIL_IF_ERROR( h->timebase_num > UINT32_MAX || h->timebase_den > UINT32_MAX,
                       "timebase you specified exceeds H.264 maximum\n" )
    }
    h->auto_timebase_num = !ret;
    h->auto_timebase_den = ret < 2;
    if( h->auto_timebase_num )
        h->timebase_num = info->fps_den; /* can be changed later by auto timebase generation */
    if( h->auto_timebase_den )
        h->timebase_den = 0;             /* set later by auto timebase generation */
    timecode_input.picture_alloc = h->input.picture_alloc;
    timecode_input.picture_clean = h->input.picture_clean;

    tcfile_in = fopen( psz_filename, "rb" );
    FAIL_IF_ERROR( !tcfile_in, "can't open `%s'\n", psz_filename )
    else if( !x264_is_regular_file( tcfile_in ) )
    {
        x264_cli_log( "timecode", X264_LOG_ERROR, "tcfile input incompatible with non-regular file `%s'\n", psz_filename );
        fclose( tcfile_in );
        return -1;
    }

    if( parse_tcfile( tcfile_in, h, info ) < 0 )
    {
        if( h->pts )
            free( h->pts );
        fclose( tcfile_in );
        return -1;
    }
    fclose( tcfile_in );

    info->timebase_num = h->timebase_num;
    info->timebase_den = h->timebase_den;
    info->vfr = 1;

    *p_handle = h;
    return 0;
}

static int64_t get_frame_pts( timecode_hnd_t *h, int frame, int real_frame )
{
    if( frame < h->stored_pts_num )
        return h->pts[frame];
    else
    {
        if( h->pts && real_frame )
        {
            x264_cli_log( "timecode", X264_LOG_INFO, "input timecode file missing data for frame %d and later\n"
                          "                 assuming constant fps %.6f\n", frame, h->assume_fps );
            free( h->pts );
            h->pts = NULL;
        }
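        /* past the end of the timecode file: extrapolate from the last known
         * timecode at the assumed constant fps */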
        double timecode = h->last_timecode + 1 / h->assume_fps;
        if( real_frame )
            h->last_timecode = timecode;
        return timecode * ((double)h->timebase_den / h->timebase_num) + 0.5;
    }
}

static int read_frame( cli_pic_t *pic, hnd_t handle, int frame )
{
    timecode_hnd_t *h = handle;
    if( h->input.read_frame( pic, h->p_handle, frame ) )
        return -1;

    pic->pts = get_frame_pts( h, frame, 1 );
    pic->duration = get_frame_pts( h, frame + 1, 0 ) - pic->pts;

    return 0;
}

static int release_frame( cli_pic_t *pic, hnd_t handle )
{
    timecode_hnd_t *h = handle;
    if( h->input.release_frame )
        return h->input.release_frame( pic, h->p_handle );
    return 0;
}

static int close_file( hnd_t handle )
{
    timecode_hnd_t *h = handle;
    if( h->pts )
        free( h->pts );
    h->input.close_file( h->p_handle );
    free( h );
    return 0;
}

cli_input_t timecode_input = { open_file, NULL, read_frame, release_frame, NULL, close_file };
x264-snapshot-20120103-2245-stable/input/thread.c
/*****************************************************************************
 * thread.c: threaded input
 *****************************************************************************
 * Copyright (C) 2003-2011 x264 project
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "input.h"

typedef struct
{
    cli_input_t input;
    hnd_t p_handle;
    cli_pic_t pic;
    x264_threadpool_t *pool;
    int next_frame;
    int frame_total;
    struct thread_input_arg_t *next_args;
} thread_hnd_t;

typedef struct thread_input_arg_t
{
    thread_hnd_t *h;
    cli_pic_t *pic;
    int i_frame;
    int status;
} thread_input_arg_t;

static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, cli_input_opt_t *opt )
{
    thread_hnd_t *h = malloc( sizeof(thread_hnd_t) );
    FAIL_IF_ERR( !h || cli_input.picture_alloc( &h->pic, info->csp, info->width, info->height ),
                 "x264", "malloc failed\n" )
    h->input = cli_input;
    h->p_handle = *p_handle;
    h->next_frame = -1;
    h->next_args = malloc( sizeof(thread_input_arg_t) );
    if( !h->next_args )
        return -1;
    h->next_args->h = h;
    h->next_args->status = 0;
    h->frame_total = info->num_frames;
    thread_input.picture_alloc = h->input.picture_alloc;
    thread_input.picture_clean = h->input.picture_clean;

    if( x264_threadpool_init( &h->pool, 1, NULL, NULL ) )
        return -1;

    *p_handle = h;
    return 0;
}

static void read_frame_thread_int( thread_input_arg_t *i )
{
    i->status = i->h->input.read_frame( i->pic, i->h->p_handle, i->i_frame );
}

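/* One frame is always read ahead on the worker thread: wait for any pending
 * read of h->next_frame, hand that picture over if it is the one requested,
 * then queue the read of the following frame so input I/O overlaps with
 * encoding. */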
static int read_frame( cli_pic_t *p_pic, hnd_t handle, int i_frame )
{
    thread_hnd_t *h = handle;
    int ret = 0;

    if( h->next_frame >= 0 )
    {
        x264_threadpool_wait( h->pool, h->next_args );
        ret |= h->next_args->status;
    }

    if( h->next_frame == i_frame )
        XCHG( cli_pic_t, *p_pic, h->pic );
    else
        ret |= h->input.read_frame( p_pic, h->p_handle, i_frame );

    if( !h->frame_total || i_frame+1 < h->frame_total )
    {
        h->next_frame =
        h->next_args->i_frame = i_frame+1;
        h->next_args->pic = &h->pic;
        x264_threadpool_run( h->pool, (void*)read_frame_thread_int, h->next_args );
    }
    else
        h->next_frame = -1;

    return ret;
}

static int release_frame( cli_pic_t *pic, hnd_t handle )
{
    thread_hnd_t *h = handle;
    if( h->input.release_frame )
        return h->input.release_frame( pic, h->p_handle );
    return 0;
}

static int close_file( hnd_t handle )
{
    thread_hnd_t *h = handle;
    x264_threadpool_delete( h->pool );
    h->input.close_file( h->p_handle );
    h->input.picture_clean( &h->pic );
    free( h->next_args );
    free( h );
    return 0;
}

cli_input_t thread_input = { open_file, NULL, read_frame, release_frame, NULL, close_file };
x264-snapshot-20120103-2245-stable/input/raw.c
/*****************************************************************************
 * raw.c: raw input
 *****************************************************************************
 * Copyright (C) 2003-2011 x264 project
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
 *          Steven Walters <kemuri9@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "input.h"
#define FAIL_IF_ERROR( cond, ... ) FAIL_IF_ERR( cond, "raw", __VA_ARGS__ )

typedef struct
{
    FILE *fh;
    int next_frame;
    uint64_t plane_size[4];
    uint64_t frame_size;
    int bit_depth;
} raw_hnd_t;

static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, cli_input_opt_t *opt )
{
    raw_hnd_t *h = calloc( 1, sizeof(raw_hnd_t) );
    if( !h )
        return -1;

    if( !opt->resolution )
    {
        /* try to parse the resolution from the file name, e.g. "foo_640x480.yuv" */
        for( char *p = psz_filename; *p; p++ )
            if( *p >= '0' && *p <= '9' && sscanf( p, "%dx%d", &info->width, &info->height ) == 2 )
                break;
    }
    else
        sscanf( opt->resolution, "%dx%d", &info->width, &info->height );
    FAIL_IF_ERROR( !info->width || !info->height, "raw input requires a resolution.\n" )
    if( opt->colorspace )
    {
        for( info->csp = X264_CSP_CLI_MAX-1; x264_cli_csps[info->csp].name && strcasecmp( x264_cli_csps[info->csp].name, opt->colorspace ); )
            info->csp--;
        FAIL_IF_ERROR( info->csp == X264_CSP_NONE, "unsupported colorspace `%s'\n", opt->colorspace );
    }
    else /* default */
        info->csp = X264_CSP_I420;

    h->bit_depth = opt->bit_depth;
    FAIL_IF_ERROR( h->bit_depth < 8 || h->bit_depth > 16, "unsupported bit depth `%d'\n", h->bit_depth );
    if( h->bit_depth > 8 )
        info->csp |= X264_CSP_HIGH_DEPTH;

    if( !strcmp( psz_filename, "-" ) )
        h->fh = stdin;
    else
        h->fh = fopen( psz_filename, "rb" );
    if( h->fh == NULL )
        return -1;

    info->thread_safe = 1;
    info->num_frames  = 0;
    info->vfr         = 0;

    const x264_cli_csp_t *csp = x264_cli_get_csp( info->csp );
    for( int i = 0; i < csp->planes; i++ )
    {
        h->plane_size[i] = x264_cli_pic_plane_size( info->csp, info->width, info->height, i );
        h->frame_size += h->plane_size[i];
        /* x264_cli_pic_plane_size returns the size in bytes; we need the value in pixels from here on */
        h->plane_size[i] /= x264_cli_csp_depth_factor( info->csp );
    }

    if( x264_is_regular_file( h->fh ) )
    {
        fseek( h->fh, 0, SEEK_END );
        uint64_t size = ftell( h->fh );
        fseek( h->fh, 0, SEEK_SET );
        info->num_frames = size / h->frame_size;
    }

    *p_handle = h;
    return 0;
}

static int read_frame_internal( cli_pic_t *pic, raw_hnd_t *h )
{
    int error = 0;
    int pixel_depth = x264_cli_csp_depth_factor( pic->img.csp );
    for( int i = 0; i < pic->img.planes && !error; i++ )
    {
        error |= fread( pic->img.plane[i], pixel_depth, h->plane_size[i], h->fh ) != h->plane_size[i];
        if( h->bit_depth & 7 )
        {
            /* upconvert non-16-bit high-depth planes to 16-bit using the same
             * algorithm as the depth filter. */
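            /* e.g. for 10-bit input: lshift = 6, rshift = 4, so the maximum
             * sample 0x3FF maps to (0x3FF<<6) + (0x3FF>>4) = 0xFFFF */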
            uint16_t *plane = (uint16_t*)pic->img.plane[i];
            uint64_t pixel_count = h->plane_size[i];
            int lshift = 16 - h->bit_depth;
            int rshift = 2*h->bit_depth - 16;
            for( uint64_t j = 0; j < pixel_count; j++ )
                plane[j] = (plane[j] << lshift) + (plane[j] >> rshift);
        }
    }
    return error;
}

static int read_frame( cli_pic_t *pic, hnd_t handle, int i_frame )
{
    raw_hnd_t *h = handle;

    if( i_frame > h->next_frame )
    {
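        /* regular files can seek straight to the target frame; pipes cannot,
         * so read and discard frames until we catch up */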
        if( x264_is_regular_file( h->fh ) )
            fseek( h->fh, i_frame * h->frame_size, SEEK_SET );
        else
            while( i_frame > h->next_frame )
            {
                if( read_frame_internal( pic, h ) )
                    return -1;
                h->next_frame++;
            }
    }

    if( read_frame_internal( pic, h ) )
        return -1;

    h->next_frame = i_frame+1;
    return 0;
}

static int close_file( hnd_t handle )
{
    raw_hnd_t *h = handle;
    if( !h || !h->fh )
        return 0;
    fclose( h->fh );
    free( h );
    return 0;
}

const cli_input_t raw_input = { open_file, x264_cli_pic_alloc, read_frame, NULL, x264_cli_pic_clean, close_file };
x264-snapshot-20120103-2245-stable/input/input.c
/*****************************************************************************
 * input.c: common input functions
 *****************************************************************************
 * Copyright (C) 2010-2011 x264 project
 *
 * Authors: Steven Walters <kemuri9@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "input.h"

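/* fields: name, plane count, per-plane width scale factors, per-plane height
 * scale factors, mod_width, mod_height (resolution granularity requirements) */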
const x264_cli_csp_t x264_cli_csps[] = {
    [X264_CSP_I420] = { "i420", 3, { 1, .5, .5 }, { 1, .5, .5 }, 2, 2 },
    [X264_CSP_I422] = { "i422", 3, { 1, .5, .5 }, { 1,  1,  1 }, 2, 1 },
    [X264_CSP_I444] = { "i444", 3, { 1,  1,  1 }, { 1,  1,  1 }, 1, 1 },
    [X264_CSP_YV12] = { "yv12", 3, { 1, .5, .5 }, { 1, .5, .5 }, 2, 2 },
    [X264_CSP_YV16] = { "yv16", 3, { 1, .5, .5 }, { 1,  1,  1 }, 2, 1 },
    [X264_CSP_YV24] = { "yv24", 3, { 1,  1,  1 }, { 1,  1,  1 }, 1, 1 },
    [X264_CSP_NV12] = { "nv12", 2, { 1,  1 },     { 1, .5 },     2, 2 },
    [X264_CSP_NV16] = { "nv16", 2, { 1,  1 },     { 1,  1 },     2, 1 },
    [X264_CSP_BGR]  = { "bgr",  1, { 3 },         { 1 },         1, 1 },
    [X264_CSP_BGRA] = { "bgra", 1, { 4 },         { 1 },         1, 1 },
    [X264_CSP_RGB]  = { "rgb",  1, { 3 },         { 1 },         1, 1 },
};

int x264_cli_csp_is_invalid( int csp )
{
    int csp_mask = csp & X264_CSP_MASK;
    return csp_mask <= X264_CSP_NONE || csp_mask >= X264_CSP_CLI_MAX || csp & X264_CSP_OTHER;
}

int x264_cli_csp_depth_factor( int csp )
{
    if( x264_cli_csp_is_invalid( csp ) )
        return 0;
    return (csp & X264_CSP_HIGH_DEPTH) ? 2 : 1;
}

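/* e.g. 8-bit I420 at 640x480: plane 0 is 640*480 = 307200 bytes, while
 * planes 1 and 2 are 640*480*.5*.5 = 76800 bytes each */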
uint64_t x264_cli_pic_plane_size( int csp, int width, int height, int plane )
{
    int csp_mask = csp & X264_CSP_MASK;
    if( x264_cli_csp_is_invalid( csp ) || plane < 0 || plane >= x264_cli_csps[csp_mask].planes )
        return 0;
    uint64_t size = (uint64_t)width * height;
    size *= x264_cli_csps[csp_mask].width[plane] * x264_cli_csps[csp_mask].height[plane];
    size *= x264_cli_csp_depth_factor( csp );
    return size;
}

uint64_t x264_cli_pic_size( int csp, int width, int height )
{
    if( x264_cli_csp_is_invalid( csp ) )
        return 0;
    uint64_t size = 0;
    int csp_mask = csp & X264_CSP_MASK;
    for( int i = 0; i < x264_cli_csps[csp_mask].planes; i++ )
        size += x264_cli_pic_plane_size( csp, width, height, i );
    return size;
}

int x264_cli_pic_alloc( cli_pic_t *pic, int csp, int width, int height )
{
    memset( pic, 0, sizeof(cli_pic_t) );
    int csp_mask = csp & X264_CSP_MASK;
    if( x264_cli_csp_is_invalid( csp ) )
        pic->img.planes = 0;
    else
        pic->img.planes = x264_cli_csps[csp_mask].planes;
    pic->img.csp    = csp;
    pic->img.width  = width;
    pic->img.height = height;
    for( int i = 0; i < pic->img.planes; i++ )
    {
         pic->img.plane[i] = x264_malloc( x264_cli_pic_plane_size( csp, width, height, i ) );
         if( !pic->img.plane[i] )
             return -1;
         pic->img.stride[i] = width * x264_cli_csps[csp_mask].width[i] * x264_cli_csp_depth_factor( csp );
    }

    return 0;
}

void x264_cli_pic_clean( cli_pic_t *pic )
{
    for( int i = 0; i < pic->img.planes; i++ )
        x264_free( pic->img.plane[i] );
    memset( pic, 0, sizeof(cli_pic_t) );
}

const x264_cli_csp_t *x264_cli_get_csp( int csp )
{
    if( x264_cli_csp_is_invalid( csp ) )
        return NULL;
    return x264_cli_csps + (csp&X264_CSP_MASK);
}
x264-snapshot-20120103-2245-stable/filters/
x264-snapshot-20120103-2245-stable/filters/video/
x264-snapshot-20120103-2245-stable/filters/video/resize.c
/*****************************************************************************
 * resize.c: resize video filter
 *****************************************************************************
 * Copyright (C) 2010-2011 x264 project
 *
 * Authors: Steven Walters <kemuri9@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "video.h"
#define NAME "resize"
#define FAIL_IF_ERROR( cond, ... ) FAIL_IF_ERR( cond, NAME, __VA_ARGS__ )

cli_vid_filter_t resize_filter;

static int full_check( video_info_t *info, x264_param_t *param )
{
    int required = 0;
    required |= info->csp    != param->i_csp;
    required |= info->width  != param->i_width;
    required |= info->height != param->i_height;
    return required;
}

#if HAVE_SWSCALE
#undef DECLARE_ALIGNED
#include <libswscale/swscale.h>
#include <libavutil/opt.h>
#include <libavutil/pixdesc.h>

typedef struct
{
    int width;
    int height;
    int pix_fmt;
} frame_prop_t;

typedef struct
{
    hnd_t prev_hnd;
    cli_vid_filter_t prev_filter;

    cli_pic_t buffer;
    int buffer_allocated;
    int dst_csp;
    struct SwsContext *ctx;
    uint32_t ctx_flags;
    /* state of swapping chroma planes pre and post resize */
    int pre_swap_chroma;
    int post_swap_chroma;
    int variable_input; /* input is capable of changing properties */
    int working;        /* we have already started working with frames */
    frame_prop_t dst;   /* desired output properties */
    frame_prop_t scale; /* properties of the SwsContext input */
} resizer_hnd_t;

static void help( int longhelp )
{
    printf( "      "NAME":[width,height][,sar][,fittobox][,csp][,method]\n" );
    if( !longhelp )
        return;
    printf( "            resizes frames based on the given criteria:\n"
            "            - resolution only: resizes and adapts sar to avoid stretching\n"
            "            - sar only: sets the sar and resizes to avoid stretching\n"
            "            - resolution and sar: resizes to given resolution and sets the sar\n"
            "            - fittobox: resizes the video based on the desired constraints\n"
            "               - width, height, both\n"
            "            - fittobox and sar: same as above except with specified sar\n"
            "            - csp: convert to the given csp. syntax: [name][:depth]\n"
            "               - valid csp names [keep current]: " );

    for( int i = X264_CSP_NONE+1; i < X264_CSP_CLI_MAX; i++ )
    {
        printf( "%s", x264_cli_csps[i].name );
        if( i+1 < X264_CSP_CLI_MAX )
            printf( ", " );
    }
    printf( "\n"
            "               - depth: 8 or 16 bits per pixel [keep current]\n"
            "            note: not all depths are supported by all csps.\n"
            "            - method: use resizer method [\"bicubic\"]\n"
            "               - fastbilinear, bilinear, bicubic, experimental, point,\n"
            "               - area, bicublin, gauss, sinc, lanczos, spline\n" );
}

static uint32_t convert_method_to_flag( const char *name )
{
    uint32_t flag = 0;
    if( !strcasecmp( name, "fastbilinear" ) )
        flag = SWS_FAST_BILINEAR;
    else if( !strcasecmp( name, "bilinear" ) )
        flag = SWS_BILINEAR;
    else if( !strcasecmp( name, "bicubic" ) )
        flag = SWS_BICUBIC;
    else if( !strcasecmp( name, "experimental" ) )
        flag = SWS_X;
    else if( !strcasecmp( name, "point" ) )
        flag = SWS_POINT;
    else if( !strcasecmp( name, "area" ) )
        flag = SWS_AREA;
    else if( !strcasecmp( name, "bicublin" ) )
        flag = SWS_BICUBLIN;
    else if( !strcasecmp( name, "guass" ) )
        flag = SWS_GAUSS;
    else if( !strcasecmp( name, "sinc" ) )
        flag = SWS_SINC;
    else if( !strcasecmp( name, "lanczos" ) )
        flag = SWS_LANCZOS;
    else if( !strcasecmp( name, "spline" ) )
        flag = SWS_SPLINE;
    else // default
        flag = SWS_BICUBIC;
    return flag;
}

static int convert_csp_to_pix_fmt( int csp )
{
    if( csp&X264_CSP_OTHER )
        return csp&X264_CSP_MASK;
    switch( csp&X264_CSP_MASK )
    {
        case X264_CSP_YV12: /* specially handled via swapping chroma */
        case X264_CSP_I420: return csp&X264_CSP_HIGH_DEPTH ? PIX_FMT_YUV420P16 : PIX_FMT_YUV420P;
        case X264_CSP_YV16: /* specially handled via swapping chroma */
        case X264_CSP_I422: return csp&X264_CSP_HIGH_DEPTH ? PIX_FMT_YUV422P16 : PIX_FMT_YUV422P;
        case X264_CSP_YV24: /* specially handled via swapping chroma */
        case X264_CSP_I444: return csp&X264_CSP_HIGH_DEPTH ? PIX_FMT_YUV444P16 : PIX_FMT_YUV444P;
        case X264_CSP_RGB:  return csp&X264_CSP_HIGH_DEPTH ? PIX_FMT_RGB48     : PIX_FMT_RGB24;
        /* the next 3 csps have no equivalent 16bit depth in swscale */
        case X264_CSP_NV12: return csp&X264_CSP_HIGH_DEPTH ? PIX_FMT_NONE      : PIX_FMT_NV12;
        case X264_CSP_BGR:  return csp&X264_CSP_HIGH_DEPTH ? PIX_FMT_NONE      : PIX_FMT_BGR24;
        case X264_CSP_BGRA: return csp&X264_CSP_HIGH_DEPTH ? PIX_FMT_NONE      : PIX_FMT_BGRA;
        default:            return PIX_FMT_NONE;
    }
}

static int pick_closest_supported_csp( int csp )
{
    int pix_fmt = convert_csp_to_pix_fmt( csp );
    switch( pix_fmt )
    {
        case PIX_FMT_YUV420P16LE:
        case PIX_FMT_YUV420P16BE:
            return X264_CSP_I420 | X264_CSP_HIGH_DEPTH;
        case PIX_FMT_YUV422P:
        case PIX_FMT_YUYV422:
        case PIX_FMT_UYVY422:
        case PIX_FMT_YUVJ422P:
            return X264_CSP_I422;
        case PIX_FMT_YUV422P16LE:
        case PIX_FMT_YUV422P16BE:
            return X264_CSP_I422 | X264_CSP_HIGH_DEPTH;
        case PIX_FMT_YUV444P:
        case PIX_FMT_YUVJ444P:
            return X264_CSP_I444;
        case PIX_FMT_YUV444P16LE:
        case PIX_FMT_YUV444P16BE:
            return X264_CSP_I444 | X264_CSP_HIGH_DEPTH;
        case PIX_FMT_RGB24:
        case PIX_FMT_RGB565BE:
        case PIX_FMT_RGB565LE:
        case PIX_FMT_RGB555BE:
        case PIX_FMT_RGB555LE:
            return X264_CSP_RGB;
        case PIX_FMT_RGB48BE:
        case PIX_FMT_RGB48LE:
            return X264_CSP_RGB | X264_CSP_HIGH_DEPTH;
        case PIX_FMT_BGR24:
        case PIX_FMT_BGR565BE:
        case PIX_FMT_BGR565LE:
        case PIX_FMT_BGR555BE:
        case PIX_FMT_BGR555LE:
            return X264_CSP_BGR;
        case PIX_FMT_ARGB:
        case PIX_FMT_RGBA:
        case PIX_FMT_ABGR:
        case PIX_FMT_BGRA:
            return X264_CSP_BGRA;
        case PIX_FMT_NV12:
        case PIX_FMT_NV21:
             return X264_CSP_NV12;
        default:
            return X264_CSP_I420;
    }
}

static int handle_opts( const char **optlist, char **opts, video_info_t *info, resizer_hnd_t *h )
{
    uint32_t out_sar_w, out_sar_h;

    char *str_width  = x264_get_option( optlist[0], opts );
    char *str_height = x264_get_option( optlist[1], opts );
    char *str_sar    = x264_get_option( optlist[2], opts );
    char *fittobox   = x264_get_option( optlist[3], opts );
    char *str_csp    = x264_get_option( optlist[4], opts );
    int width        = x264_otoi( str_width, -1 );
    int height       = x264_otoi( str_height, -1 );

    int csp_only = 0;
    uint32_t in_sar_w = info->sar_width;
    uint32_t in_sar_h = info->sar_height;

    if( str_csp )
    {
        /* output csp was specified, first check if optional depth was provided */
        char *str_depth = strchr( str_csp, ':' );
        int depth = x264_cli_csp_depth_factor( info->csp ) * 8;
        if( str_depth )
        {
            /* csp bit depth was specified */
            *str_depth++ = '\0';
            depth = x264_otoi( str_depth, -1 );
            FAIL_IF_ERROR( depth != 8 && depth != 16, "unsupported bit depth %d\n", depth );
        }
        /* now lookup against the list of valid csps */
        int csp;
        if( strlen( str_csp ) == 0 )
            csp = info->csp & X264_CSP_MASK;
        else
            for( csp = X264_CSP_CLI_MAX-1; x264_cli_csps[csp].name && strcasecmp( x264_cli_csps[csp].name, str_csp ); )
                csp--;
        FAIL_IF_ERROR( csp == X264_CSP_NONE, "unsupported colorspace `%s'\n", str_csp );
        h->dst_csp = csp;
        if( depth == 16 )
            h->dst_csp |= X264_CSP_HIGH_DEPTH;
    }

    /* if the input sar is currently invalid, set it to 1:1 so it can be used in math */
    if( !in_sar_w || !in_sar_h )
        in_sar_w = in_sar_h = 1;
    if( str_sar )
    {
        FAIL_IF_ERROR( 2 != sscanf( str_sar, "%u:%u", &out_sar_w, &out_sar_h ) &&
                       2 != sscanf( str_sar, "%u/%u", &out_sar_w, &out_sar_h ),
                       "invalid sar `%s'\n", str_sar )
    }
    else
        out_sar_w = out_sar_h = 1;
    if( fittobox )
    {
        /* resize the video to fit the box as much as possible */
        if( !strcasecmp( fittobox, "both" ) )
        {
            FAIL_IF_ERROR( width <= 0 || height <= 0, "invalid box resolution %sx%s\n",
                           x264_otos( str_width, "<unset>" ), x264_otos( str_height, "<unset>" ) )
        }
        else if( !strcasecmp( fittobox, "width" ) )
        {
            FAIL_IF_ERROR( width <= 0, "invalid box width `%s'\n", x264_otos( str_width, "<unset>" ) )
            height = INT_MAX;
        }
        else if( !strcasecmp( fittobox, "height" ) )
        {
            FAIL_IF_ERROR( height <= 0, "invalid box height `%s'\n", x264_otos( str_height, "<unset>" ) )
            width = INT_MAX;
        }
        else FAIL_IF_ERROR( 1, "invalid fittobox mode `%s'\n", fittobox )

        /* maximally fit the new coded resolution to the box */
        const x264_cli_csp_t *csp = x264_cli_get_csp( h->dst_csp );
        double width_units = (double)info->height * in_sar_h * out_sar_w;
        double height_units = (double)info->width * in_sar_w * out_sar_h;
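        /* preserving display aspect requires width * width_units == height * height_units
         * once both sides are compensated for the input and output SARs; below we shrink
         * whichever box dimension overshoots, kept on the csp's mod_width/mod_height grid */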
        width = width / csp->mod_width * csp->mod_width;
        height = height / csp->mod_height * csp->mod_height;
        if( width * width_units > height * height_units )
        {
            int new_width = round( height * height_units / (width_units * csp->mod_width) );
            new_width *= csp->mod_width;
            width = X264_MIN( new_width, width );
        }
        else
        {
            int new_height = round( width * width_units / (height_units * csp->mod_height) );
            new_height *= csp->mod_height;
            height = X264_MIN( new_height, height );
        }
    }
    else
    {
        if( str_width || str_height )
        {
            FAIL_IF_ERROR( width <= 0 || height <= 0, "invalid resolution %sx%s\n",
                           x264_otos( str_width, "<unset>" ), x264_otos( str_height, "<unset>" ) )
            if( !str_sar ) /* res only -> adjust sar */
            {
                /* new_sar = (new_h * old_w * old_sar_w) / (old_h * new_w * old_sar_h) */
                uint64_t num = (uint64_t)info->width  * height;
                uint64_t den = (uint64_t)info->height * width;
                x264_reduce_fraction64( &num, &den );
                out_sar_w = num * in_sar_w;
                out_sar_h = den * in_sar_h;
                x264_reduce_fraction( &out_sar_w, &out_sar_h );
            }
        }
        else if( str_sar ) /* sar only -> adjust res */
        {
             const x264_cli_csp_t *csp = x264_cli_get_csp( h->dst_csp );
             double width_units = (double)in_sar_h * out_sar_w;
             double height_units = (double)in_sar_w * out_sar_h;
             width  = info->width;
             height = info->height;
             if( width_units > height_units ) // SAR got wider, decrease width
             {
                 width = round( info->width * height_units / (width_units * csp->mod_width) );
                 width *= csp->mod_width;
             }
             else // SAR got thinner, decrease height
             {
                 height = round( info->height * width_units / (height_units * csp->mod_height) );
                 height *= csp->mod_height;
             }
        }
        else /* csp only */
        {
            h->dst.width  = info->width;
            h->dst.height = info->height;
            csp_only = 1;
        }
    }
    if( !csp_only )
    {
        info->sar_width  = out_sar_w;
        info->sar_height = out_sar_h;
        h->dst.width  = width;
        h->dst.height = height;
    }
    return 0;
}

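/* map the JPEG (full-range) pixel formats to their limited-range equivalents,
 * returning whether the format was full range */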
static int handle_jpeg( int *format )
{
    switch( *format )
    {
        case PIX_FMT_YUVJ420P:
            *format = PIX_FMT_YUV420P;
            return 1;
        case PIX_FMT_YUVJ422P:
            *format = PIX_FMT_YUV422P;
            return 1;
        case PIX_FMT_YUVJ444P:
            *format = PIX_FMT_YUV444P;
            return 1;
        case PIX_FMT_YUVJ440P:
            *format = PIX_FMT_YUV440P;
            return 1;
        default:
            return 0;
    }
}

static int x264_init_sws_context( resizer_hnd_t *h )
{
    if( !h->ctx )
    {
        h->ctx = sws_alloc_context();
        if( !h->ctx )
            return -1;

        /* set flags that will not change */
        int dst_format = h->dst.pix_fmt;
        int dst_range  = handle_jpeg( &dst_format );
        av_set_int( h->ctx, "sws_flags",  h->ctx_flags );
        av_set_int( h->ctx, "dstw",       h->dst.width );
        av_set_int( h->ctx, "dsth",       h->dst.height );
        av_set_int( h->ctx, "dst_format", dst_format );
        av_set_int( h->ctx, "dst_range",  dst_range ); /* FIXME: use the correct full range value */
    }

    int src_format = h->scale.pix_fmt;
    int src_range  = handle_jpeg( &src_format );
    av_set_int( h->ctx, "srcw",       h->scale.width );
    av_set_int( h->ctx, "srch",       h->scale.height );
    av_set_int( h->ctx, "src_format", src_format );
    av_set_int( h->ctx, "src_range",  src_range ); /* FIXME: use the correct full range value */

    /* FIXME: use the correct full range values
     * FIXME: use the correct matrix coefficients (only YUV -> RGB conversions are supported) */
    sws_setColorspaceDetails( h->ctx,
                              sws_getCoefficients( SWS_CS_DEFAULT ), src_range,
                              sws_getCoefficients( SWS_CS_DEFAULT ), av_get_int( h->ctx, "dst_range", NULL ),
                              0, 1<<16, 1<<16 );

    return sws_init_context( h->ctx, NULL, NULL ) < 0;
}

static int check_resizer( resizer_hnd_t *h, cli_pic_t *in )
{
    frame_prop_t input_prop = { in->img.width, in->img.height, convert_csp_to_pix_fmt( in->img.csp ) };
    if( !memcmp( &input_prop, &h->scale, sizeof(frame_prop_t) ) )
        return 0;
    /* also warn if the resizer was initialized after the first frame */
    if( h->ctx || h->working )
        x264_cli_log( NAME, X264_LOG_WARNING, "stream properties changed at pts %"PRId64"\n", in->pts );
    h->scale = input_prop;
    if( !h->buffer_allocated )
    {
        if( x264_cli_pic_alloc( &h->buffer, h->dst_csp, h->dst.width, h->dst.height ) )
            return -1;
        h->buffer_allocated = 1;
    }
    FAIL_IF_ERROR( x264_init_sws_context( h ), "swscale init failed\n" )
    return 0;
}

static int init( hnd_t *handle, cli_vid_filter_t *filter, video_info_t *info, x264_param_t *param, char *opt_string )
{
    /* if called to normalize the csp to known formats and the format is already known, exit */
    if( opt_string && !strcmp( opt_string, "normcsp" ) && !(info->csp&X264_CSP_OTHER) )
        return 0;
    /* if called by x264cli and nothing needs to be done, exit */
    if( !opt_string && !full_check( info, param ) )
        return 0;

    static const char *optlist[] = { "width", "height", "sar", "fittobox", "csp", "method", NULL };
    char **opts = x264_split_options( opt_string, optlist );
    if( !opts && opt_string )
        return -1;

    resizer_hnd_t *h = calloc( 1, sizeof(resizer_hnd_t) );
    if( !h )
        return -1;
    if( opts )
    {
        h->dst_csp    = info->csp;
        h->dst.width  = info->width;
        h->dst.height = info->height;
        if( !strcmp( opt_string, "normcsp" ) )
        {
            /* only in normalization scenarios is the input capable of changing properties */
            h->variable_input = 1;
            h->dst_csp = pick_closest_supported_csp( info->csp );
            /* now fix the catch-all i420 choice if it does not allow for the current input resolution dimensions. */
            if( h->dst_csp == X264_CSP_I420 && info->width&1 )
                h->dst_csp = X264_CSP_I444;
            if( h->dst_csp == X264_CSP_I420 && info->height&1 )
                h->dst_csp = X264_CSP_I422;
        }
        else if( handle_opts( optlist, opts, info, h ) )
            return -1;
    }
    else
    {
        h->dst_csp    = param->i_csp;
        h->dst.width  = param->i_width;
        h->dst.height = param->i_height;
    }
    h->ctx_flags = convert_method_to_flag( x264_otos( x264_get_option( optlist[5], opts ), "" ) );
    x264_free_string_array( opts );

    if( h->ctx_flags != SWS_FAST_BILINEAR )
        h->ctx_flags |= SWS_FULL_CHR_H_INT | SWS_FULL_CHR_H_INP | SWS_ACCURATE_RND;
    h->dst.pix_fmt = convert_csp_to_pix_fmt( h->dst_csp );
    h->scale = h->dst;

    /* swap chroma planes if YV12/YV16/YV24 is involved, as libswscale works with I420/I422/I444 */
    int src_csp = info->csp & (X264_CSP_MASK | X264_CSP_OTHER);
    int dst_csp = h->dst_csp & (X264_CSP_MASK | X264_CSP_OTHER);
    h->pre_swap_chroma  = src_csp == X264_CSP_YV12 || src_csp == X264_CSP_YV16 || src_csp == X264_CSP_YV24;
    h->post_swap_chroma = dst_csp == X264_CSP_YV12 || dst_csp == X264_CSP_YV16 || dst_csp == X264_CSP_YV24;

    int src_pix_fmt = convert_csp_to_pix_fmt( info->csp );

    int src_pix_fmt_inv = convert_csp_to_pix_fmt( info->csp ^ X264_CSP_HIGH_DEPTH );
    int dst_pix_fmt_inv = convert_csp_to_pix_fmt( h->dst_csp ^ X264_CSP_HIGH_DEPTH );

    /* confirm swscale can support this conversion */
    FAIL_IF_ERROR( src_pix_fmt == PIX_FMT_NONE && src_pix_fmt_inv != PIX_FMT_NONE,
                   "input colorspace %s with bit depth %d is not supported\n", av_get_pix_fmt_name( src_pix_fmt_inv ),
                   info->csp & X264_CSP_HIGH_DEPTH ? 16 : 8 );
    FAIL_IF_ERROR( !sws_isSupportedInput( src_pix_fmt ), "input colorspace %s is not supported\n", av_get_pix_fmt_name( src_pix_fmt ) )
    FAIL_IF_ERROR( h->dst.pix_fmt == PIX_FMT_NONE && dst_pix_fmt_inv != PIX_FMT_NONE,
                   "output colorspace %s with bit depth %d is not supported\n", av_get_pix_fmt_name( dst_pix_fmt_inv ),
                   h->dst_csp & X264_CSP_HIGH_DEPTH ? 16 : 8 );
    FAIL_IF_ERROR( !sws_isSupportedOutput( h->dst.pix_fmt ), "output colorspace %s is not supported\n", av_get_pix_fmt_name( h->dst.pix_fmt ) )
    FAIL_IF_ERROR( h->dst.height != info->height && info->interlaced,
                   "swscale is not compatible with interlaced vertical resizing\n" )
    /* confirm that the desired resolution meets the colorspace requirements */
    const x264_cli_csp_t *csp = x264_cli_get_csp( h->dst_csp );
    FAIL_IF_ERROR( h->dst.width % csp->mod_width || h->dst.height % csp->mod_height,
                   "resolution %dx%d is not compliant with colorspace %s\n", h->dst.width, h->dst.height, csp->name )

    if( h->dst.width != info->width || h->dst.height != info->height )
        x264_cli_log( NAME, X264_LOG_INFO, "resizing to %dx%d\n", h->dst.width, h->dst.height );
    if( h->dst.pix_fmt != src_pix_fmt )
        x264_cli_log( NAME, X264_LOG_WARNING, "converting from %s to %s\n",
                      av_get_pix_fmt_name( src_pix_fmt ), av_get_pix_fmt_name( h->dst.pix_fmt ) );
    h->dst_csp |= info->csp & X264_CSP_VFLIP; // preserve vflip

    /* if the input is not variable, initialize the context */
    if( !h->variable_input )
    {
        cli_pic_t input_pic = {{info->csp, info->width, info->height, 0}, 0};
        if( check_resizer( h, &input_pic ) )
            return -1;
    }

    /* finished initing, overwrite values */
    info->csp    = h->dst_csp;
    info->width  = h->dst.width;
    info->height = h->dst.height;

    h->prev_filter = *filter;
    h->prev_hnd = *handle;
    *handle = h;
    *filter = resize_filter;

    return 0;
}

static int get_frame( hnd_t handle, cli_pic_t *output, int frame )
{
    resizer_hnd_t *h = handle;
    if( h->prev_filter.get_frame( h->prev_hnd, output, frame ) )
        return -1;
    if( h->variable_input && check_resizer( h, output ) )
        return -1;
    h->working = 1;
    if( h->pre_swap_chroma )
        XCHG( uint8_t*, output->img.plane[1], output->img.plane[2] );
    if( h->ctx )
    {
        sws_scale( h->ctx, (const uint8_t* const*)output->img.plane, output->img.stride,
                   0, output->img.height, h->buffer.img.plane, h->buffer.img.stride );
        output->img = h->buffer.img; /* copy img data */
    }
    else
        output->img.csp = h->dst_csp;
    if( h->post_swap_chroma )
        XCHG( uint8_t*, output->img.plane[1], output->img.plane[2] );

    return 0;
}

static int release_frame( hnd_t handle, cli_pic_t *pic, int frame )
{
    resizer_hnd_t *h = handle;
    return h->prev_filter.release_frame( h->prev_hnd, pic, frame );
}

static void free_filter( hnd_t handle )
{
    resizer_hnd_t *h = handle;
    h->prev_filter.free( h->prev_hnd );
    if( h->ctx )
        sws_freeContext( h->ctx );
    if( h->buffer_allocated )
        x264_cli_pic_clean( &h->buffer );
    free( h );
}

#else /* no swscale */
static int init( hnd_t *handle, cli_vid_filter_t *filter, video_info_t *info, x264_param_t *param, char *opt_string )
{
    int ret = 0;

    if( !opt_string )
        ret = full_check( info, param );
    else
    {
        if( !strcmp( opt_string, "normcsp" ) )
            ret = info->csp & X264_CSP_OTHER;
        else
            ret = -1;
    }

    /* pass if nothing needs to be done, otherwise fail */
    FAIL_IF_ERROR( ret, "not compiled with swscale support\n" )
    return 0;
}

#define help NULL
#define get_frame NULL
#define release_frame NULL
#define free_filter NULL
#define convert_csp_to_pix_fmt(x) (x & X264_CSP_MASK)

#endif

cli_vid_filter_t resize_filter = { NAME, help, init, get_frame, release_frame, free_filter, NULL };
x264-snapshot-20120103-2245-stable/filters/video/video.h
/*****************************************************************************
 * video.h: video filters
 *****************************************************************************
 * Copyright (C) 2010-2011 x264 project
 *
 * Authors: Steven Walters <kemuri9@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#ifndef X264_FILTER_VIDEO_H
#define X264_FILTER_VIDEO_H

#include "input/input.h"
#include "filters/filters.h"

typedef struct cli_vid_filter_t cli_vid_filter_t;

struct cli_vid_filter_t
{
    /* name of the filter */
    const char *name;
    /* help: a short message on what the filter does and how to use it.
     * this should only be implemented by filters directly accessible by the user */
    void (*help)( int longhelp );
    /* init: initializes the filter given the input clip properties and encoder parameters,
     * adjusting them as necessary according to the options provided by the user.
     * returns 0 on success, nonzero on error. */
    int (*init)( hnd_t *handle, cli_vid_filter_t *filter, video_info_t *info, x264_param_t *param, char *opt_string );
    /* get_frame: given the storage for the output frame and desired frame number, generate the frame accordingly.
     * the image data returned by get_frame should be treated as const and not be altered.
     * returns 0 on success, nonzero on error. */
    int (*get_frame)( hnd_t handle, cli_pic_t *output, int frame );
    /* release_frame: frame is done being used and is signaled for cleanup.
     * returns 0 on success, nonzero on error. */
    int (*release_frame)( hnd_t handle, cli_pic_t *pic, int frame );
    /* free: run filter cleanup procedures. */
    void (*free)( hnd_t handle );
    /* next registered filter, unused by filters themselves */
    cli_vid_filter_t *next;
};
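
/* Editor's illustrative sketch (not part of the original source): a minimal
 * pass-through filter showing the splice pattern the filters in this tree
 * use -- remember the previous filter and handle, install our own callbacks,
 * and forward each call downstream.  All passthrough_* identifiers are
 * hypothetical; the block is disabled so it cannot affect compilation. */
#if 0
typedef struct
{
    hnd_t prev_hnd;
    cli_vid_filter_t prev_filter;
} passthrough_hnd_t;

static cli_vid_filter_t passthrough_filter;

static int passthrough_init( hnd_t *handle, cli_vid_filter_t *filter, video_info_t *info,
                             x264_param_t *param, char *opt_string )
{
    passthrough_hnd_t *h = calloc( 1, sizeof(passthrough_hnd_t) );
    if( !h )
        return -1;
    /* remember the downstream filter, then splice ourselves in front of it */
    h->prev_hnd = *handle;
    h->prev_filter = *filter;
    *handle = h;
    *filter = passthrough_filter;
    return 0;
}

static int passthrough_get_frame( hnd_t handle, cli_pic_t *output, int frame )
{
    passthrough_hnd_t *h = handle;
    return h->prev_filter.get_frame( h->prev_hnd, output, frame );
}

static int passthrough_release_frame( hnd_t handle, cli_pic_t *pic, int frame )
{
    passthrough_hnd_t *h = handle;
    return h->prev_filter.release_frame( h->prev_hnd, pic, frame );
}

static void passthrough_free( hnd_t handle )
{
    passthrough_hnd_t *h = handle;
    h->prev_filter.free( h->prev_hnd );
    free( h );
}

static cli_vid_filter_t passthrough_filter =
    { "passthrough", NULL, passthrough_init, passthrough_get_frame,
      passthrough_release_frame, passthrough_free, NULL };
#endif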

void x264_register_vid_filters( void );
void x264_vid_filter_help( int longhelp );
int  x264_init_vid_filter( const char *name, hnd_t *handle, cli_vid_filter_t *filter,
                           video_info_t *info, x264_param_t *param, char *opt_string );

#endif
x264-snapshot-20120103-2245-stable/filters/video/video.c
/*****************************************************************************
 * video.c: video filters
 *****************************************************************************
 * Copyright (C) 2010-2011 x264 project
 *
 * Authors: Steven Walters <kemuri9@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "video.h"

static cli_vid_filter_t *first_filter = NULL;

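/* append to the singly linked filter list; x264_register_vid_filters() installs
 * source_filter as the list head before any other filter is registered */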
static void register_vid_filter( cli_vid_filter_t *new_filter )
{
    cli_vid_filter_t *filter_i = first_filter;
    while( filter_i->next )
        filter_i = filter_i->next;
    filter_i->next = new_filter;
    new_filter->next = NULL;
}

#define REGISTER_VFILTER(name)\
{\
    extern cli_vid_filter_t name##_filter;\
    register_vid_filter( &name##_filter );\
}

void x264_register_vid_filters( void )
{
    extern cli_vid_filter_t source_filter;
    first_filter = &source_filter;
    REGISTER_VFILTER( cache );
    REGISTER_VFILTER( crop );
    REGISTER_VFILTER( fix_vfr_pts );
    REGISTER_VFILTER( resize );
    REGISTER_VFILTER( select_every );
    REGISTER_VFILTER( depth );
#if HAVE_GPL
#endif
}

int x264_init_vid_filter( const char *name, hnd_t *handle, cli_vid_filter_t *filter,
                          video_info_t *info, x264_param_t *param, char *opt_string )
{
    cli_vid_filter_t *filter_i = first_filter;
    while( filter_i && strcasecmp( name, filter_i->name ) )
        filter_i = filter_i->next;
    FAIL_IF_ERR( !filter_i, "x264", "invalid filter `%s'\n", name );
    if( filter_i->init( handle, filter, info, param, opt_string ) )
        return -1;

    return 0;
}

void x264_vid_filter_help( int longhelp )
{
    for( cli_vid_filter_t *filter_i = first_filter; filter_i; filter_i = filter_i->next )
        if( filter_i->help )
            filter_i->help( longhelp );
}
x264-snapshot-20120103-2245-stable/filters/video/source.c
/*****************************************************************************
 * source.c: source video filter
 *****************************************************************************
 * Copyright (C) 2010-2011 x264 project
 *
 * Authors: Steven Walters <kemuri9@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "video.h"

/* This filter converts the demuxer API into the filtering API for video frames.
 * Backward seeking is prohibited here, as not all demuxers support it. */

typedef struct
{
    cli_pic_t pic;
    hnd_t hin;
    int cur_frame;
} source_hnd_t;

cli_vid_filter_t source_filter;

static int init( hnd_t *handle, cli_vid_filter_t *filter, video_info_t *info, x264_param_t *param, char *opt_string )
{
    source_hnd_t *h = calloc( 1, sizeof(source_hnd_t) );
    if( !h )
        return -1;
    h->cur_frame = -1;

    if( cli_input.picture_alloc( &h->pic, info->csp, info->width, info->height ) )
        return -1;

    h->hin = *handle;
    *handle = h;
    *filter = source_filter;

    return 0;
}

static int get_frame( hnd_t handle, cli_pic_t *output, int frame )
{
    source_hnd_t *h = handle;
    /* do not allow requesting frames from before the current position */
    if( frame <= h->cur_frame || cli_input.read_frame( &h->pic, h->hin, frame ) )
        return -1;
    h->cur_frame = frame;
    *output = h->pic;
    return 0;
}

static int release_frame( hnd_t handle, cli_pic_t *pic, int frame )
{
    source_hnd_t *h = handle;
    if( cli_input.release_frame && cli_input.release_frame( &h->pic, h->hin ) )
        return -1;
    return 0;
}

static void free_filter( hnd_t handle )
{
    source_hnd_t *h = handle;
    cli_input.picture_clean( &h->pic );
    cli_input.close_file( h->hin );
    free( h );
}

cli_vid_filter_t source_filter = { "source", NULL, init, get_frame, release_frame, free_filter, NULL };
x264-snapshot-20120103-2245-stable/filters/video/select_every.c
/*****************************************************************************
 * select_every.c: select-every video filter
 *****************************************************************************
 * Copyright (C) 2010-2011 x264 project
 *
 * Authors: Steven Walters <kemuri9@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "video.h"
#define NAME "select_every"
#define FAIL_IF_ERROR( cond, ... ) FAIL_IF_ERR( cond, NAME, __VA_ARGS__ )

#define MAX_PATTERN_SIZE 100 /* arbitrary */

typedef struct
{
    hnd_t prev_hnd;
    cli_vid_filter_t prev_filter;

    int *pattern;
    int pattern_len;
    int step_size;
    int vfr;
    int64_t pts;
} selvry_hnd_t;

cli_vid_filter_t select_every_filter;

static void help( int longhelp )
{
    printf( "      "NAME":step,offset1[,...]\n" );
    if( !longhelp )
        return;
    printf( "            apply a selection pattern to input frames\n"
            "            step: the number of frames in the pattern\n"
            "            offsets: the offset into the step to select a frame\n"
            "            see: http://avisynth.org/mediawiki/Select#SelectEvery\n" );
}

static int init( hnd_t *handle, cli_vid_filter_t *filter, video_info_t *info, x264_param_t *param, char *opt_string )
{
    selvry_hnd_t *h = malloc( sizeof(selvry_hnd_t) );
    if( !h )
        return -1;
    h->pattern_len = 0;
    h->step_size = 0;
    int offsets[MAX_PATTERN_SIZE];
    for( char *tok, *p = opt_string; (tok = strtok( p, "," )); p = NULL )
    {
        int val = x264_otoi( tok, -1 );
        if( p )
        {
            FAIL_IF_ERROR( val <= 0, "invalid step `%s'\n", tok )
            h->step_size = val;
            continue;
        }
        FAIL_IF_ERROR( val < 0 || val >= h->step_size, "invalid offset `%s'\n", tok )
        FAIL_IF_ERROR( h->pattern_len >= MAX_PATTERN_SIZE, "max pattern size %d reached\n", MAX_PATTERN_SIZE )
        offsets[h->pattern_len++] = val;
    }
    FAIL_IF_ERROR( !h->step_size, "no step size provided\n" )
    FAIL_IF_ERROR( !h->pattern_len, "no offsets supplied\n" )

    h->pattern = malloc( h->pattern_len * sizeof(int) );
    if( !h->pattern )
        return -1;
    memcpy( h->pattern, offsets, h->pattern_len * sizeof(int) );

    /* determine required cache size to maintain pattern. */
    intptr_t max_rewind = 0;
    int min = h->step_size;
    for( int i = h->pattern_len-1; i >= 0; i-- )
    {
         min = X264_MIN( min, offsets[i] );
         if( i )
             max_rewind = X264_MAX( max_rewind, offsets[i-1] - min + 1 );
         /* reached maximum rewind size */
         if( max_rewind == h->step_size )
             break;
    }
    if( x264_init_vid_filter( "cache", handle, filter, info, param, (void*)max_rewind ) )
        return -1;

    /* done initing, overwrite properties */
    if( h->step_size != h->pattern_len )
    {
        info->num_frames = (uint64_t)info->num_frames * h->pattern_len / h->step_size;
        info->fps_den *= h->step_size;
        info->fps_num *= h->pattern_len;
        x264_reduce_fraction( &info->fps_num, &info->fps_den );
        if( info->vfr )
        {
            info->timebase_den *= h->pattern_len;
            info->timebase_num *= h->step_size;
            x264_reduce_fraction( &info->timebase_num, &info->timebase_den );
        }
    }

    h->pts = 0;
    h->vfr = info->vfr;
    h->prev_filter = *filter;
    h->prev_hnd = *handle;
    *filter = select_every_filter;
    *handle = h;

    return 0;
}

static int get_frame( hnd_t handle, cli_pic_t *output, int frame )
{
    selvry_hnd_t *h = handle;
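    /* e.g. step 5 with offsets {0,2}: output frames 0,1,2,3,... map to
     * source frames 0,2,5,7,... */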
    int pat_frame = h->pattern[frame % h->pattern_len] + frame / h->pattern_len * h->step_size;
    if( h->prev_filter.get_frame( h->prev_hnd, output, pat_frame ) )
        return -1;
    if( h->vfr )
    {
        output->pts = h->pts;
        h->pts += output->duration;
    }
    return 0;
}

static int release_frame( hnd_t handle, cli_pic_t *pic, int frame )
{
    selvry_hnd_t *h = handle;
    int pat_frame = h->pattern[frame % h->pattern_len] + frame / h->pattern_len * h->step_size;
    return h->prev_filter.release_frame( h->prev_hnd, pic, pat_frame );
}

static void free_filter( hnd_t handle )
{
    selvry_hnd_t *h = handle;
    h->prev_filter.free( h->prev_hnd );
    free( h->pattern );
    free( h );
}

cli_vid_filter_t select_every_filter = { NAME, help, init, get_frame, release_frame, free_filter, NULL };
x264-snapshot-20120103-2245-stable/filters/video/internal.h
/*****************************************************************************
 * internal.h: video filter utilities
 *****************************************************************************
 * Copyright (C) 2010-2011 x264 project
 *
 * Authors: Steven Walters <kemuri9@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#ifndef X264_FILTER_VIDEO_INTERNAL_H
#define X264_FILTER_VIDEO_INTERNAL_H
#include "video.h"

void x264_cli_plane_copy( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int w, int h );
int  x264_cli_pic_copy( cli_pic_t *out, cli_pic_t *in );

#endif
x264-snapshot-20120103-2245-stable/filters/video/internal.c
/*****************************************************************************
 * internal.c: video filter utilities
 *****************************************************************************
 * Copyright (C) 2010-2011 x264 project
 *
 * Authors: Steven Walters <kemuri9@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "internal.h"
#define FAIL_IF_ERROR( cond, ... ) FAIL_IF_ERR( cond, "x264", __VA_ARGS__ )

void x264_cli_plane_copy( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int w, int h )
{
    while( h-- )
    {
        memcpy( dst, src, w );
        dst += i_dst;
        src += i_src;
    }
}

int x264_cli_pic_copy( cli_pic_t *out, cli_pic_t *in )
{
    int csp = in->img.csp & X264_CSP_MASK;
    FAIL_IF_ERROR( x264_cli_csp_is_invalid( in->img.csp ), "invalid colorspace arg %d\n", in->img.csp )
    FAIL_IF_ERROR( in->img.csp != out->img.csp || in->img.height != out->img.height
                || in->img.width != out->img.width, "incompatible frame properties\n" );
    /* copy data */
    out->duration = in->duration;
    out->pts = in->pts;
    out->opaque = in->opaque;

    for( int i = 0; i < out->img.planes; i++ )
    {
        int height = in->img.height * x264_cli_csps[csp].height[i];
        int width =  in->img.width  * x264_cli_csps[csp].width[i];
        width *= x264_cli_csp_depth_factor( in->img.csp );
        x264_cli_plane_copy( out->img.plane[i], out->img.stride[i], in->img.plane[i],
                             in->img.stride[i], width, height );
    }
    return 0;
}
x264-snapshot-20120103-2245-stable/filters/video/fix_vfr_pts.c
/*****************************************************************************
 * fix_vfr_pts.c: vfr pts fixing video filter
 *****************************************************************************
 * Copyright (C) 2010-2011 x264 project
 *
 * Authors: Steven Walters <kemuri9@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "video.h"
#include "internal.h"

/* This filter calculates and stores the frame's duration in the frame data
 * (if it has not already been set by the time the frame arrives at this
 * point) so that it can be used by filters that need to reconstruct pts due
 * to out-of-order frame requests */
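
/* Illustrative walk-through (added, not part of the original source): if the
 * source delivers frames with pts 0, 2, 5 and no durations, requesting frame
 * 0 buffers it, fetches frame 1 (pts 2) and sets the buffered frame's
 * duration to 2-0 = 2; requesting frame 1 likewise yields duration 5-2 = 3.
 * Output pts are then rebuilt as a running sum of durations: 0, 2, 5, ... */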

typedef struct
{
    hnd_t prev_hnd;
    cli_vid_filter_t prev_filter;

    /* we need 1 buffer picture and 1 placeholder */
    cli_pic_t buffer;
    cli_pic_t holder;
    int buffer_allocated;
    int holder_frame;
    int holder_ret;
    int64_t pts;
    int64_t last_duration;
} fix_vfr_pts_hnd_t;

cli_vid_filter_t fix_vfr_pts_filter;

static int init( hnd_t *handle, cli_vid_filter_t *filter, video_info_t *info, x264_param_t *param, char *opt_string )
{
    /* if the input is not vfr, we don't do anything */
    if( !info->vfr )
        return 0;
    fix_vfr_pts_hnd_t *h = calloc( 1, sizeof(fix_vfr_pts_hnd_t) );
    if( !h )
        return -1;

    h->holder_frame = -1;
    h->prev_hnd = *handle;
    h->prev_filter = *filter;
    *handle = h;
    *filter = fix_vfr_pts_filter;

    return 0;
}

static int get_frame( hnd_t handle, cli_pic_t *output, int frame )
{
    fix_vfr_pts_hnd_t *h = handle;
    /* if we want the holder picture and it errored, return the error. */
    if( frame == h->holder_frame )
    {
        if( h->holder_ret )
            return h->holder_ret;
    }
    else
    {
        /* if we have a holder frame and we don't want it, release the frame */
        if( h->holder_frame > 0 && h->holder_frame < frame && h->prev_filter.release_frame( h->prev_hnd, &h->holder, h->holder_frame ) )
            return -1;
        h->holder_frame = -1;
        if( h->prev_filter.get_frame( h->prev_hnd, &h->holder, frame ) )
            return -1;
    }

    /* if the frame's duration is not set already, read the next frame to set it. */
    if( !h->holder.duration )
    {
        /* allocate a buffer picture if we didn't already */
        if( !h->buffer_allocated )
        {
            if( x264_cli_pic_alloc( &h->buffer, h->holder.img.csp, h->holder.img.width, h->holder.img.height ) )
                return -1;
            h->buffer_allocated = 1;
        }
        h->holder_frame = frame+1;
        /* copy the current frame to the buffer, release it, and then read in the next frame to the placeholder */
        if( x264_cli_pic_copy( &h->buffer, &h->holder ) || h->prev_filter.release_frame( h->prev_hnd, &h->holder, frame ) )
            return -1;
        h->holder_ret = h->prev_filter.get_frame( h->prev_hnd, &h->holder, h->holder_frame );
        /* suppress non-monotonic pts warnings by setting the duration to be at least 1 */
        if( !h->holder_ret )
            h->last_duration = X264_MAX( h->holder.pts - h->buffer.pts, 1 );
        h->buffer.duration = h->last_duration;
        *output = h->buffer;
    }
    else
        *output = h->holder;

    output->pts = h->pts;
    h->pts += output->duration;

    return 0;
}

static int release_frame( hnd_t handle, cli_pic_t *pic, int frame )
{
    fix_vfr_pts_hnd_t *h = handle;
    /* if the frame is the buffered one, it's already been released */
    if( frame == (h->holder_frame - 1) )
        return 0;
    return h->prev_filter.release_frame( h->prev_hnd, pic, frame );
}

static void free_filter( hnd_t handle )
{
    fix_vfr_pts_hnd_t *h = handle;
    h->prev_filter.free( h->prev_hnd );
    if( h->buffer_allocated )
        x264_cli_pic_clean( &h->buffer );
    free( h );
}

cli_vid_filter_t fix_vfr_pts_filter = { "fix_vfr_pts", NULL, init, get_frame, release_frame, free_filter, NULL };
x264-snapshot-20120103-2245-stable/filters/video/depth.c
/*****************************************************************************
 * depth.c: bit-depth conversion video filter
 *****************************************************************************
 * Copyright (C) 2010-2011 x264 project
 *
 * Authors: Oskar Arvidsson <oskar@irock.se>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "video.h"
#define NAME "depth"
#define FAIL_IF_ERROR( cond, ... ) FAIL_IF_ERR( cond, NAME, __VA_ARGS__ )

cli_vid_filter_t depth_filter;

typedef struct
{
    hnd_t prev_hnd;
    cli_vid_filter_t prev_filter;

    int bit_depth;
    int dst_csp;
    cli_pic_t buffer;
    int16_t *error_buf;
} depth_hnd_t;

static int depth_filter_csp_is_supported( int csp )
{
    int csp_mask = csp & X264_CSP_MASK;
    return csp_mask == X264_CSP_I420 ||
           csp_mask == X264_CSP_I422 ||
           csp_mask == X264_CSP_I444 ||
           csp_mask == X264_CSP_YV12 ||
           csp_mask == X264_CSP_YV16 ||
           csp_mask == X264_CSP_YV24 ||
           csp_mask == X264_CSP_NV12 ||
           csp_mask == X264_CSP_NV16;
}

static int csp_num_interleaved( int csp, int plane )
{
    int csp_mask = csp & X264_CSP_MASK;
    return ( (csp_mask == X264_CSP_NV12 || csp_mask == X264_CSP_NV16) && plane == 1 ) ? 2 : 1;
}

/* The dithering algorithm is based on Sierra-2-4A error diffusion. It has
 * been written in such a way that if the source has been upconverted using
 * the same algorithm as used in scale_image, dithering back down to the
 * source bit depth again is lossless. */
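
/* Worked example (added, assuming BIT_DEPTH == 8): lshift = 8, rshift = 0,
 * half = 256, pixel_max = 255. An 8-bit value 0x12 upconverted by scale_image
 * becomes 0x1212; with zero accumulated error,
 * ((0x1212+256)<<2)*255 >> 18 == 0x12 and the new error term is
 * 0x1212 - (0x12<<8) - 0x12 == 0, so the round trip is lossless. */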
#define DITHER_PLANE( pitch ) \
static void dither_plane_##pitch( pixel *dst, int dst_stride, uint16_t *src, int src_stride, \
                                        int width, int height, int16_t *errors ) \
{ \
    const int lshift = 16-BIT_DEPTH; \
    const int rshift = 2*BIT_DEPTH-16; \
    const int pixel_max = (1 << BIT_DEPTH)-1; \
    const int half = 1 << (16-BIT_DEPTH); \
    memset( errors, 0, (width+1) * sizeof(int16_t) ); \
    for( int y = 0; y < height; y++, src += src_stride, dst += dst_stride ) \
    { \
        int err = 0; \
        for( int x = 0; x < width; x++ ) \
        { \
            err = err*2 + errors[x] + errors[x+1]; \
            dst[x*pitch] = x264_clip3( (((src[x*pitch]+half)<<2)+err)*pixel_max >> 18, 0, pixel_max ); \
            errors[x] = err = src[x*pitch] - (dst[x*pitch] << lshift) - (dst[x*pitch] >> rshift); \
        } \
    } \
}

DITHER_PLANE( 1 )
DITHER_PLANE( 2 )

static void dither_image( cli_image_t *out, cli_image_t *img, int16_t *error_buf )
{
    int csp_mask = img->csp & X264_CSP_MASK;
    for( int i = 0; i < img->planes; i++ )
    {
        int num_interleaved = csp_num_interleaved( img->csp, i );
        int height = x264_cli_csps[csp_mask].height[i] * img->height;
        int width = x264_cli_csps[csp_mask].width[i] * img->width / num_interleaved;

#define CALL_DITHER_PLANE( pitch, off ) \
        dither_plane_##pitch( ((pixel*)out->plane[i])+off, out->stride[i]/sizeof(pixel), \
                ((uint16_t*)img->plane[i])+off, img->stride[i]/2, width, height, error_buf )

        if( num_interleaved == 1 )
        {
            CALL_DITHER_PLANE( 1, 0 );
        }
        else
        {
            CALL_DITHER_PLANE( 2, 0 );
            CALL_DITHER_PLANE( 2, 1 );
        }
    }
}

static void scale_image( cli_image_t *output, cli_image_t *img )
{
    /* this function mimics how swscale does upconversion. 8-bit is converted
     * to 16-bit by left-shifting the original value by 8 and then adding the
     * original value to that. This effectively keeps the full color range
     * while also being fast. For n-bit we do basically the same thing, but
     * discard the lower 16-n bits. */
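    /* e.g. for BIT_DEPTH == 10 (illustrative): shift == 6, so 8-bit 255
     * becomes ((255<<8)+255)>>6 == 65535>>6 == 1023, the 10-bit maximum. */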
    int csp_mask = img->csp & X264_CSP_MASK;
    const int shift = 16-BIT_DEPTH;
    for( int i = 0; i < img->planes; i++ )
    {
        uint8_t *src = img->plane[i];
        uint16_t *dst = (uint16_t*)output->plane[i];
        int height = x264_cli_csps[csp_mask].height[i] * img->height;
        int width = x264_cli_csps[csp_mask].width[i] * img->width;

        for( int j = 0; j < height; j++ )
        {
            for( int k = 0; k < width; k++ )
                dst[k] = ((src[k] << 8) + src[k]) >> shift;

            src += img->stride[i];
            dst += output->stride[i]/2;
        }
    }
}

static int get_frame( hnd_t handle, cli_pic_t *output, int frame )
{
    depth_hnd_t *h = handle;

    if( h->prev_filter.get_frame( h->prev_hnd, output, frame ) )
        return -1;

    if( h->bit_depth < 16 && output->img.csp & X264_CSP_HIGH_DEPTH )
    {
        dither_image( &h->buffer.img, &output->img, h->error_buf );
        output->img = h->buffer.img;
    }
    else if( h->bit_depth > 8 && !(output->img.csp & X264_CSP_HIGH_DEPTH) )
    {
        scale_image( &h->buffer.img, &output->img );
        output->img = h->buffer.img;
    }
    return 0;
}

static int release_frame( hnd_t handle, cli_pic_t *pic, int frame )
{
    depth_hnd_t *h = handle;
    return h->prev_filter.release_frame( h->prev_hnd, pic, frame );
}

static void free_filter( hnd_t handle )
{
    depth_hnd_t *h = handle;
    h->prev_filter.free( h->prev_hnd );
    x264_cli_pic_clean( &h->buffer );
    x264_free( h );
}

static int init( hnd_t *handle, cli_vid_filter_t *filter, video_info_t *info,
                 x264_param_t *param, char *opt_string )
{
    int ret = 0;
    int change_fmt = (info->csp ^ param->i_csp) & X264_CSP_HIGH_DEPTH;
    int csp = ~(~info->csp ^ change_fmt);
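    /* note (added): bitwise, ~(~a ^ b) == a ^ b, so this just toggles the
     * X264_CSP_HIGH_DEPTH flag of info->csp whenever it differs from the
     * requested param->i_csp */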
    int bit_depth = 8*x264_cli_csp_depth_factor( csp );

    if( opt_string )
    {
        static const char *optlist[] = { "bit_depth", NULL };
        char **opts = x264_split_options( opt_string, optlist );

        if( opts )
        {
            char *str_bit_depth = x264_get_option( "bit_depth", opts );
            bit_depth = x264_otoi( str_bit_depth, -1 );

            ret = bit_depth < 8 || bit_depth > 16;
            csp = bit_depth > 8 ? csp | X264_CSP_HIGH_DEPTH : csp & ~X264_CSP_HIGH_DEPTH;
            change_fmt = (info->csp ^ csp) & X264_CSP_HIGH_DEPTH;
            x264_free_string_array( opts );
        }
        else
            ret = 1;
    }

    FAIL_IF_ERROR( bit_depth != BIT_DEPTH, "this build supports only bit depth %d\n", BIT_DEPTH )
    FAIL_IF_ERROR( ret, "unsupported bit depth conversion.\n" )

    /* only add the filter to the chain if it's needed */
    if( change_fmt || bit_depth != 8 * x264_cli_csp_depth_factor( csp ) )
    {
        FAIL_IF_ERROR( !depth_filter_csp_is_supported(csp), "unsupported colorspace.\n" )
        depth_hnd_t *h = x264_malloc( sizeof(depth_hnd_t) + (info->width+1)*sizeof(int16_t) );

        if( !h )
            return -1;

        h->error_buf = (int16_t*)(h + 1);
        h->dst_csp = csp;
        h->bit_depth = bit_depth;
        h->prev_hnd = *handle;
        h->prev_filter = *filter;

        if( x264_cli_pic_alloc( &h->buffer, h->dst_csp, info->width, info->height ) )
        {
            x264_free( h );
            return -1;
        }

        *handle = h;
        *filter = depth_filter;
        info->csp = h->dst_csp;
    }

    return 0;
}

cli_vid_filter_t depth_filter = { NAME, NULL, init, get_frame, release_frame, free_filter, NULL };
x264-snapshot-20120103-2245-stable/filters/video/crop.c
/*****************************************************************************
 * crop.c: crop video filter
 *****************************************************************************
 * Copyright (C) 2010-2011 x264 project
 *
 * Authors: Steven Walters <kemuri9@gmail.com>
 *          James Darnley <james.darnley@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "video.h"
#define NAME "crop"
#define FAIL_IF_ERROR( cond, ... ) FAIL_IF_ERR( cond, NAME, __VA_ARGS__ )

cli_vid_filter_t crop_filter;

typedef struct
{
    hnd_t prev_hnd;
    cli_vid_filter_t prev_filter;

    int dims[4]; /* left, top, width, height */
    const x264_cli_csp_t *csp;
} crop_hnd_t;

static void help( int longhelp )
{
    printf( "      "NAME":left,top,right,bottom\n" );
    if( !longhelp )
        return;
    printf( "            removes pixels from the edges of the frame\n" );
}
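
/* Usage sketch (illustrative, the values are examples): "crop:0,16,0,16"
 * removes 16 rows from both the top and the bottom of every frame; each
 * value must be a multiple of the csp's mod_width/mod_height, as checked in
 * init() below. */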

static int init( hnd_t *handle, cli_vid_filter_t *filter, video_info_t *info, x264_param_t *param, char *opt_string )
{
    FAIL_IF_ERROR( x264_cli_csp_is_invalid( info->csp ), "invalid csp %d\n", info->csp )
    crop_hnd_t *h = calloc( 1, sizeof(crop_hnd_t) );
    if( !h )
        return -1;

    h->csp = x264_cli_get_csp( info->csp );
    static const char *optlist[] = { "left", "top", "right", "bottom", NULL };
    char **opts = x264_split_options( opt_string, optlist );
    if( !opts )
        return -1;

    for( int i = 0; i < 4; i++ )
    {
        char *opt = x264_get_option( optlist[i], opts );
        FAIL_IF_ERROR( !opt, "%s crop value not specified\n", optlist[i] )
        h->dims[i] = x264_otoi( opt, -1 );
        FAIL_IF_ERROR( h->dims[i] < 0, "%s crop value `%s' is less than 0\n", optlist[i], opt )
        int dim_mod = i&1 ? (h->csp->mod_height << info->interlaced) : h->csp->mod_width;
        FAIL_IF_ERROR( h->dims[i] % dim_mod, "%s crop value `%s' is not a multiple of %d\n", optlist[i], opt, dim_mod )
    }
    x264_free_string_array( opts );
    h->dims[2] = info->width  - h->dims[0] - h->dims[2];
    h->dims[3] = info->height - h->dims[1] - h->dims[3];
    FAIL_IF_ERROR( h->dims[2] <= 0 || h->dims[3] <= 0, "invalid output resolution %dx%d\n", h->dims[2], h->dims[3] )

    if( info->width != h->dims[2] || info->height != h->dims[3] )
        x264_cli_log( NAME, X264_LOG_INFO, "cropping to %dx%d\n", h->dims[2], h->dims[3] );
    else
    {
        /* do nothing as the user supplied 0s for all the values */
        free( h );
        return 0;
    }
    /* done initializing, overwrite values */
    info->width  = h->dims[2];
    info->height = h->dims[3];

    h->prev_filter = *filter;
    h->prev_hnd = *handle;
    *handle = h;
    *filter = crop_filter;

    return 0;
}

static int get_frame( hnd_t handle, cli_pic_t *output, int frame )
{
    crop_hnd_t *h = handle;
    if( h->prev_filter.get_frame( h->prev_hnd, output, frame ) )
        return -1;
    output->img.width  = h->dims[2];
    output->img.height = h->dims[3];
    /* shift the plane pointers down 'top' rows and right 'left' columns. */
    for( int i = 0; i < output->img.planes; i++ )
    {
        intptr_t offset = output->img.stride[i] * h->dims[1] * h->csp->height[i];
        offset += h->dims[0] * h->csp->width[i];
        offset *= x264_cli_csp_depth_factor( output->img.csp );
        output->img.plane[i] += offset;
    }
    return 0;
}

static int release_frame( hnd_t handle, cli_pic_t *pic, int frame )
{
    crop_hnd_t *h = handle;
    /* NO filter should ever have a dependent release based on the plane pointers,
     * so avoid unnecessary unshifting */
    return h->prev_filter.release_frame( h->prev_hnd, pic, frame );
}

static void free_filter( hnd_t handle )
{
    crop_hnd_t *h = handle;
    h->prev_filter.free( h->prev_hnd );
    free( h );
}

cli_vid_filter_t crop_filter = { NAME, help, init, get_frame, release_frame, free_filter, NULL };
x264-snapshot-20120103-2245-stable/filters/video/cache.c
/*****************************************************************************
 * cache.c: cache video filter
 *****************************************************************************
 * Copyright (C) 2010-2011 x264 project
 *
 * Authors: Steven Walters <kemuri9@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "video.h"
#include "internal.h"
#define NAME "cache"
#define LAST_FRAME (h->first_frame + h->cur_size - 1)

typedef struct
{
    hnd_t prev_hnd;
    cli_vid_filter_t prev_filter;

    int max_size;
    int first_frame; /* first cached frame */
    cli_pic_t **cache;
    int cur_size;
    int eof;         /* frame beyond end of the file */
} cache_hnd_t;

cli_vid_filter_t cache_filter;

static int init( hnd_t *handle, cli_vid_filter_t *filter, video_info_t *info, x264_param_t *param, char *opt_string )
{
    intptr_t size = (intptr_t)opt_string;
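    /* descriptive note (added): the desired cache size arrives cast into the
     * opt_string pointer rather than as a textual option string */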
    /* upon a <= 0 cache request, do nothing */
    if( size <= 0 )
        return 0;
    cache_hnd_t *h = calloc( 1, sizeof(cache_hnd_t) );
    if( !h )
        return -1;

    h->max_size = size;
    h->cache = malloc( (h->max_size+1) * sizeof(cli_pic_t*) );
    if( !h->cache )
        return -1;

    for( int i = 0; i < h->max_size; i++ )
    {
        h->cache[i] = malloc( sizeof(cli_pic_t) );
        if( !h->cache[i] || x264_cli_pic_alloc( h->cache[i], info->csp, info->width, info->height ) )
            return -1;
    }
    h->cache[h->max_size] = NULL; /* require null terminator for list methods */

    h->prev_filter = *filter;
    h->prev_hnd = *handle;
    *handle = h;
    *filter = cache_filter;

    return 0;
}

static void fill_cache( cache_hnd_t *h, int frame )
{
    /* shift frames out of the cache when the requested frame is beyond the filled cache */
    int shift = frame - LAST_FRAME;
    /* no frames to shift or no frames left to read */
    if( shift <= 0 || h->eof )
        return;
    /* the next frames to read are either
     * A) starting at the end of the current cache, or
     * B) starting at a new frame that has the end of the cache at the desired frame
     * and proceeding to fill the entire cache */
    int cur_frame = X264_MAX( h->first_frame + h->cur_size, frame - h->max_size + 1 );
    /* the new starting point is either
     * A) the current one shifted the number of frames entering/leaving the cache, or
     * B) at a new frame that has the end of the cache at the desired frame. */
    h->first_frame = X264_MIN( h->first_frame + shift, cur_frame );
    h->cur_size = X264_MAX( h->cur_size - shift, 0 );
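    /* worked example (added): with max_size 4, first_frame 10 and cur_size 4
     * (cache holds frames 10..13), a request for frame 15 gives shift 2,
     * cur_frame = MAX(14, 12) = 14, first_frame = MIN(12, 14) = 12 and
     * cur_size = 2; the loop below then reads frames 14 and 15, leaving the
     * cache holding frames 12..15. */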
    while( h->cur_size < h->max_size )
    {
        cli_pic_t temp;
        /* the old front frame is going to shift off, overwrite it with the new frame */
        cli_pic_t *cache = h->cache[0];
        if( h->prev_filter.get_frame( h->prev_hnd, &temp, cur_frame ) ||
            x264_cli_pic_copy( cache, &temp ) ||
            h->prev_filter.release_frame( h->prev_hnd, &temp, cur_frame ) )
        {
            h->eof = cur_frame;
            return;
        }
        /* the read was successful, shift the frame off the front to the end */
        x264_frame_push( (void*)h->cache, x264_frame_shift( (void*)h->cache ) );
        cur_frame++;
        h->cur_size++;
    }
}

static int get_frame( hnd_t handle, cli_pic_t *output, int frame )
{
    cache_hnd_t *h = handle;
    FAIL_IF_ERR( frame < h->first_frame, NAME, "frame %d is before first cached frame %d\n", frame, h->first_frame );
    fill_cache( h, frame );
    if( frame > LAST_FRAME ) /* eof */
        return -1;
    int idx = frame - (h->eof ? h->eof - h->max_size : h->first_frame);
    *output = *h->cache[idx];
    return 0;
}

static int release_frame( hnd_t handle, cli_pic_t *pic, int frame )
{
    /* the parent filter's frame has already been released so do nothing here */
    return 0;
}

static void free_filter( hnd_t handle )
{
    cache_hnd_t *h = handle;
    h->prev_filter.free( h->prev_hnd );
    for( int i = 0; i < h->max_size; i++ )
    {
        x264_cli_pic_clean( h->cache[i] );
        free( h->cache[i] );
    }
    free( h->cache );
    free( h );
}

cli_vid_filter_t cache_filter = { NAME, NULL, init, get_frame, release_frame, free_filter, NULL };
x264-snapshot-20120103-2245-stable/filters/filters.h
/*****************************************************************************
 * filters.h: common filter functions
 *****************************************************************************
 * Copyright (C) 2010-2011 x264 project
 *
 * Authors: Diogo Franco <diogomfranco@gmail.com>
 *          Steven Walters <kemuri9@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#ifndef X264_FILTERS_H
#define X264_FILTERS_H

#include "x264cli.h"
#include "filters/video/video.h"

char **x264_split_string( char *string, char *sep, int limit );
void   x264_free_string_array( char **array );

char **x264_split_options( const char *opt_str, const char *options[] );
char  *x264_get_option( const char *name, char **split_options );
int    x264_otob( char *str, int def );    // option to bool
double x264_otof( char *str, double def ); // option to float/double
int    x264_otoi( char *str, int def );    // option to int
char  *x264_otos( char *str, char *def );  // option to string

#endif
x264-snapshot-20120103-2245-stable/filters/filters.c
/*****************************************************************************
 * filters.c: common filter functions
 *****************************************************************************
 * Copyright (C) 2010-2011 x264 project
 *
 * Authors: Diogo Franco <diogomfranco@gmail.com>
 *          Steven Walters <kemuri9@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "filters.h"
#define RETURN_IF_ERROR( cond, ... ) RETURN_IF_ERR( cond, "options", NULL, __VA_ARGS__ )

char **x264_split_string( char *string, char *sep, int limit )
{
    if( !string )
        return NULL;
    int sep_count = 0;
    char *tmp = string;
    while( ( tmp = ( tmp = strstr( tmp, sep ) ) ? tmp + strlen( sep ) : 0 ) )
        ++sep_count;
    if( sep_count == 0 )
    {
        if( string[0] == '\0' )
            return calloc( 1, sizeof( char** ) );
        char **ret = calloc( 2, sizeof( char** ) );
        ret[0] = strdup( string );
        return ret;
    }

    char **split = calloc( ( limit > 0 ? limit : sep_count ) + 2, sizeof(char**) );
    int i = 0;
    char *str = strdup( string );
    assert( str );
    char *esc = NULL;
    char *tok = str, *nexttok = str;
    do
    {
        nexttok = strstr( nexttok, sep );
        if( nexttok )
            *nexttok++ = '\0';
        if( ( limit > 0 && i >= limit ) ||
            ( i > 0 && ( ( esc = strrchr( split[i-1], '\\' ) ) ? esc[1] == '\0' : 0 ) ) ) // Allow escaping
        {
            int j = i-1;
            if( esc )
                esc[0] = '\0';
            split[j] = realloc( split[j], strlen( split[j] ) + strlen( sep ) + strlen( tok ) + 1 );
            assert( split[j] );
            strcat( split[j], sep );
            strcat( split[j], tok );
            esc = NULL;
        }
        else
        {
            split[i++] = strdup( tok );
            assert( split[i-1] );
        }
        tok = nexttok;
    } while ( tok );
    free( str );
    assert( !split[i] );

    return split;
}
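
/* Illustrative note (added): a separator can be escaped with a trailing
 * backslash, so splitting "a,b\,c" on "," yields { "a", "b,c" }; the "b\"
 * token is merged with the following token by the escape handling above. */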

void x264_free_string_array( char **array )
{
    if( !array )
        return;
    for( int i = 0; array[i] != NULL; i++ )
        free( array[i] );
    free( array );
}

char **x264_split_options( const char *opt_str, const char *options[] )
{
    if( !opt_str )
        return NULL;
    char *opt_str_dup = strdup( opt_str );
    char **split = x264_split_string( opt_str_dup, ",", 0 );
    free( opt_str_dup );
    int split_count = 0;
    while( split[split_count] != NULL )
        ++split_count;

    int options_count = 0;
    while( options[options_count] != NULL )
        ++options_count;

    char **opts = calloc( split_count * 2 + 2, sizeof( char ** ) );
    char **arg = NULL;
    int opt = 0, found_named = 0, invalid = 0;
    for( int i = 0; split[i] != NULL; i++, invalid = 0 )
    {
        arg = x264_split_string( split[i], "=", 2 );
        if( arg == NULL )
        {
            if( found_named )
                invalid = 1;
            else RETURN_IF_ERROR( i > options_count || options[i] == NULL, "Too many options given\n" )
            else
            {
                opts[opt++] = strdup( options[i] );
                opts[opt++] = strdup( "" );
            }
        }
        else if( arg[0] == NULL || arg[1] == NULL )
        {
            if( found_named )
                invalid = 1;
            else RETURN_IF_ERROR( i > options_count || options[i] == NULL, "Too many options given\n" )
            else
            {
                opts[opt++] = strdup( options[i] );
                if( arg[0] )
                    opts[opt++] = strdup( arg[0] );
                else
                    opts[opt++] = strdup( "" );
            }
        }
        else
        {
            found_named = 1;
            int j = 0;
            while( options[j] != NULL && strcmp( arg[0], options[j] ) )
                ++j;
            RETURN_IF_ERROR( options[j] == NULL, "Invalid option '%s'\n", arg[0] )
            else
            {
                opts[opt++] = strdup( arg[0] );
                opts[opt++] = strdup( arg[1] );
            }
        }
        RETURN_IF_ERROR( invalid, "Ordered option given after named\n" )
        x264_free_string_array( arg );
    }
    x264_free_string_array( split );
    return opts;
}

char *x264_get_option( const char *name, char **split_options )
{
    if( !split_options )
        return NULL;
    int last_i = -1;
    for( int i = 0; split_options[i] != NULL; i += 2 )
        if( !strcmp( split_options[i], name ) )
            last_i = i;
    if( last_i >= 0 )
        return split_options[last_i+1][0] ? split_options[last_i+1] : NULL;
    return NULL;
}

int x264_otob( char *str, int def )
{
    int ret = def;
    if( str )
        ret = !strcasecmp( str, "true" ) ||
              !strcmp( str, "1" ) ||
              !strcasecmp( str, "yes" );
    return ret;
}

double x264_otof( char *str, double def )
{
    double ret = def;
    if( str )
    {
        char *end;
        ret = strtod( str, &end );
        if( end == str || *end != '\0' )
            ret = def;
    }
    return ret;
}

int x264_otoi( char *str, int def )
{
    int ret = def;
    if( str )
    {
        char *end;
        ret = strtol( str, &end, 0 );
        if( end == str || *end != '\0' )
            ret = def;
    }
    return ret;
}

char *x264_otos( char *str, char *def )
{
    return str ? str : def;
}
x264-snapshot-20120103-2245-stable/extras/
x264-snapshot-20120103-2245-stable/extras/stdint.h
/* ISO C9x  7.18  Integer types <stdint.h>
 * Based on ISO/IEC SC22/WG14 9899 Committee draft (SC22 N2794)
 *
 *  THIS SOFTWARE IS NOT COPYRIGHTED
 *
 *  Contributor: Danny Smith <danny_r_smith_2001@yahoo.co.nz>
 *
 *  This source code is offered for use in the public domain. You may
 *  use, modify or distribute it freely.
 *
 *  This code is distributed in the hope that it will be useful but
 *  WITHOUT ANY WARRANTY. ALL WARRANTIES, EXPRESS OR IMPLIED ARE HEREBY
 *  DISCLAIMED. This includes but is not limited to warranties of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 *  Date: 2000-12-02
 */


#ifndef _STDINT_H
#define _STDINT_H
#define __need_wint_t
#define __need_wchar_t
#include <stddef.h>

/* 7.18.1.1  Exact-width integer types */
typedef signed char int8_t;
typedef unsigned char   uint8_t;
typedef short  int16_t;
typedef unsigned short  uint16_t;
typedef int  int32_t;
typedef unsigned   uint32_t;
typedef __int64  int64_t;
typedef unsigned __int64 uint64_t;

/* 7.18.1.2  Minimum-width integer types */
typedef signed char int_least8_t;
typedef unsigned char   uint_least8_t;
typedef short  int_least16_t;
typedef unsigned short  uint_least16_t;
typedef int  int_least32_t;
typedef unsigned   uint_least32_t;
typedef __int64  int_least64_t;
typedef unsigned __int64   uint_least64_t;

/*  7.18.1.3  Fastest minimum-width integer types
 *  Not actually guaranteed to be fastest for all purposes
 *  Here we use the exact-width types for 8 and 16-bit ints.
 */
typedef char int_fast8_t;
typedef unsigned char uint_fast8_t;
typedef short  int_fast16_t;
typedef unsigned short  uint_fast16_t;
typedef int  int_fast32_t;
typedef unsigned  int  uint_fast32_t;
typedef __int64  int_fast64_t;
typedef unsigned __int64   uint_fast64_t;

/* 7.18.1.4  Integer types capable of holding object pointers */
/*typedef int intptr_t;
typedef unsigned uintptr_t;*/

/* 7.18.1.5  Greatest-width integer types */
typedef __int64  intmax_t;
typedef unsigned __int64   uintmax_t;

/* 7.18.2  Limits of specified-width integer types */
#if !defined ( __cplusplus) || defined (__STDC_LIMIT_MACROS)

/* 7.18.2.1  Limits of exact-width integer types */
#define INT8_MIN (-128)
#define INT16_MIN (-32768)
#define INT32_MIN (-2147483647 - 1)
#define INT64_MIN  (-9223372036854775807LL - 1)

#define INT8_MAX 127
#define INT16_MAX 32767
#define INT32_MAX 2147483647
#define INT64_MAX 9223372036854775807LL

#define UINT8_MAX 0xff /* 255U */
#define UINT16_MAX 0xffff /* 65535U */
#define UINT32_MAX 0xffffffff  /* 4294967295U */
#define UINT64_MAX 0xffffffffffffffffULL /* 18446744073709551615ULL */

/* 7.18.2.2  Limits of minimum-width integer types */
#define INT_LEAST8_MIN INT8_MIN
#define INT_LEAST16_MIN INT16_MIN
#define INT_LEAST32_MIN INT32_MIN
#define INT_LEAST64_MIN INT64_MIN

#define INT_LEAST8_MAX INT8_MAX
#define INT_LEAST16_MAX INT16_MAX
#define INT_LEAST32_MAX INT32_MAX
#define INT_LEAST64_MAX INT64_MAX

#define UINT_LEAST8_MAX UINT8_MAX
#define UINT_LEAST16_MAX UINT16_MAX
#define UINT_LEAST32_MAX UINT32_MAX
#define UINT_LEAST64_MAX UINT64_MAX

/* 7.18.2.3  Limits of fastest minimum-width integer types */
#define INT_FAST8_MIN INT8_MIN
#define INT_FAST16_MIN INT16_MIN
#define INT_FAST32_MIN INT32_MIN
#define INT_FAST64_MIN INT64_MIN

#define INT_FAST8_MAX INT8_MAX
#define INT_FAST16_MAX INT16_MAX
#define INT_FAST32_MAX INT32_MAX
#define INT_FAST64_MAX INT64_MAX

#define UINT_FAST8_MAX UINT8_MAX
#define UINT_FAST16_MAX UINT16_MAX
#define UINT_FAST32_MAX UINT32_MAX
#define UINT_FAST64_MAX UINT64_MAX

/* 7.18.2.4  Limits of integer types capable of holding
    object pointers */
#if defined(_WIN64) || defined(__LP64__)
#define INTPTR_MIN INT64_MIN
#define INTPTR_MAX INT64_MAX
#define UINTPTR_MAX UINT64_MAX
#else
#define INTPTR_MIN INT32_MIN
#define INTPTR_MAX INT32_MAX
#define UINTPTR_MAX UINT32_MAX
#endif

/* 7.18.2.5  Limits of greatest-width integer types */
#define INTMAX_MIN INT64_MIN
#define INTMAX_MAX INT64_MAX
#define UINTMAX_MAX UINT64_MAX

/* 7.18.3  Limits of other integer types */
#if defined(_WIN64) || defined(__LP64__)
#define PTRDIFF_MIN INT64_MIN
#define PTRDIFF_MAX INT64_MAX
#else
#define PTRDIFF_MIN INT32_MIN
#define PTRDIFF_MAX INT32_MAX
#endif

#define SIG_ATOMIC_MIN INT32_MIN
#define SIG_ATOMIC_MAX INT32_MAX

#ifndef SIZE_MAX
#if defined(_WIN64) || defined(__LP64__)
#define SIZE_MAX UINT64_MAX
#else
#define SIZE_MAX UINT32_MAX
#endif
#endif

#ifndef WCHAR_MIN  /* also in wchar.h */
#define WCHAR_MIN 0
#define WCHAR_MAX ((wchar_t)-1) /* UINT16_MAX */
#endif

/*
 * wint_t is unsigned short for compatibility with MS runtime
 */
#define WINT_MIN 0
#define WINT_MAX ((wint_t)-1) /* UINT16_MAX */

#endif /* !defined ( __cplusplus) || defined __STDC_LIMIT_MACROS */


/* 7.18.4  Macros for integer constants */
#if !defined ( __cplusplus) || defined (__STDC_CONSTANT_MACROS)

/* 7.18.4.1  Macros for minimum-width integer constants

    According to Douglas Gwyn <gwyn@arl.mil>:
	"This spec was changed in ISO/IEC 9899:1999 TC1; in ISO/IEC
	9899:1999 as initially published, the expansion was required
	to be an integer constant of precisely matching type, which
	is impossible to accomplish for the shorter types on most
	platforms, because C99 provides no standard way to designate
	an integer constant with width less than that of type int.
	TC1 changed this to require just an integer constant
	*expression* with *promoted* type."

	The trick used here is from Clive D W Feather.
*/
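
/* Illustrative expansion (added): INT16_C(0x1234) becomes
   (INT_LEAST16_MAX-INT_LEAST16_MAX+(0x1234)); the subtraction contributes
   zero but forces the usual arithmetic conversions, so the result is an
   integer constant expression of promoted type, exactly what TC1 requires. */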

#define INT8_C(val) (INT_LEAST8_MAX-INT_LEAST8_MAX+(val))
#define INT16_C(val) (INT_LEAST16_MAX-INT_LEAST16_MAX+(val))
#define INT32_C(val) (INT_LEAST32_MAX-INT_LEAST32_MAX+(val))
#define INT64_C(val) (INT_LEAST64_MAX-INT_LEAST64_MAX+(val))

#define UINT8_C(val) (UINT_LEAST8_MAX-UINT_LEAST8_MAX+(val))
#define UINT16_C(val) (UINT_LEAST16_MAX-UINT_LEAST16_MAX+(val))
#define UINT32_C(val) (UINT_LEAST32_MAX-UINT_LEAST32_MAX+(val))
#define UINT64_C(val) (UINT_LEAST64_MAX-UINT_LEAST64_MAX+(val))

/* 7.18.4.2  Macros for greatest-width integer constants */
#define INTMAX_C(val) (INTMAX_MAX-INTMAX_MAX+(val))
#define UINTMAX_C(val) (UINTMAX_MAX-UINTMAX_MAX+(val))

#endif  /* !defined ( __cplusplus) || defined __STDC_CONSTANT_MACROS */

#endif
x264-snapshot-20120103-2245-stable/extras/inttypes.h
// ISO C9x  compliant inttypes.h for Microsoft Visual Studio
// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124
//
//  Copyright (c) 2006 Alexander Chemeris
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
//   1. Redistributions of source code must retain the above copyright notice,
//      this list of conditions and the following disclaimer.
//
//   2. Redistributions in binary form must reproduce the above copyright
//      notice, this list of conditions and the following disclaimer in the
//      documentation and/or other materials provided with the distribution.
//
//   3. The name of the author may be used to endorse or promote products
//      derived from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
// EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
///////////////////////////////////////////////////////////////////////////////

#ifndef _MSC_VER // [
#error "Use this header only with Microsoft Visual C++ compilers!"
#endif // _MSC_VER ]

#ifndef _MSC_INTTYPES_H_ // [
#define _MSC_INTTYPES_H_

#if _MSC_VER > 1000
#pragma once
#endif

#include "stdint.h"

// 7.8 Format conversion of integer types

typedef struct {
   intmax_t quot;
   intmax_t rem;
} imaxdiv_t;

// 7.8.1 Macros for format specifiers
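
// Usage sketch (illustrative, added): these expand to MSVC's I32/I64 length
// modifiers, so e.g. an int64_t can be printed portably with
//   printf( "pts: %" PRId64 "\n", pts );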

#if !defined(__cplusplus) || defined(__STDC_FORMAT_MACROS) // [   See footnote 185 at page 198

// The fprintf macros for signed integers are:
#define PRId8       "d"
#define PRIi8       "i"
#define PRIdLEAST8  "d"
#define PRIiLEAST8  "i"
#define PRIdFAST8   "d"
#define PRIiFAST8   "i"

#define PRId16       "hd"
#define PRIi16       "hi"
#define PRIdLEAST16  "hd"
#define PRIiLEAST16  "hi"
#define PRIdFAST16   "hd"
#define PRIiFAST16   "hi"

#define PRId32       "I32d"
#define PRIi32       "I32i"
#define PRIdLEAST32  "I32d"
#define PRIiLEAST32  "I32i"
#define PRIdFAST32   "I32d"
#define PRIiFAST32   "I32i"

#define PRId64       "I64d"
#define PRIi64       "I64i"
#define PRIdLEAST64  "I64d"
#define PRIiLEAST64  "I64i"
#define PRIdFAST64   "I64d"
#define PRIiFAST64   "I64i"

#define PRIdMAX     "I64d"
#define PRIiMAX     "I64i"

#define PRIdPTR     "Id"
#define PRIiPTR     "Ii"

// The fprintf macros for unsigned integers are:
#define PRIo8       "o"
#define PRIu8       "u"
#define PRIx8       "x"
#define PRIX8       "X"
#define PRIoLEAST8  "o"
#define PRIuLEAST8  "u"
#define PRIxLEAST8  "x"
#define PRIXLEAST8  "X"
#define PRIoFAST8   "o"
#define PRIuFAST8   "u"
#define PRIxFAST8   "x"
#define PRIXFAST8   "X"

#define PRIo16       "ho"
#define PRIu16       "hu"
#define PRIx16       "hx"
#define PRIX16       "hX"
#define PRIoLEAST16  "ho"
#define PRIuLEAST16  "hu"
#define PRIxLEAST16  "hx"
#define PRIXLEAST16  "hX"
#define PRIoFAST16   "ho"
#define PRIuFAST16   "hu"
#define PRIxFAST16   "hx"
#define PRIXFAST16   "hX"

#define PRIo32       "I32o"
#define PRIu32       "I32u"
#define PRIx32       "I32x"
#define PRIX32       "I32X"
#define PRIoLEAST32  "I32o"
#define PRIuLEAST32  "I32u"
#define PRIxLEAST32  "I32x"
#define PRIXLEAST32  "I32X"
#define PRIoFAST32   "I32o"
#define PRIuFAST32   "I32u"
#define PRIxFAST32   "I32x"
#define PRIXFAST32   "I32X"

#define PRIo64       "I64o"
#define PRIu64       "I64u"
#define PRIx64       "I64x"
#define PRIX64       "I64X"
#define PRIoLEAST64  "I64o"
#define PRIuLEAST64  "I64u"
#define PRIxLEAST64  "I64x"
#define PRIXLEAST64  "I64X"
#define PRIoFAST64   "I64o"
#define PRIuFAST64   "I64u"
#define PRIxFAST64   "I64x"
#define PRIXFAST64   "I64X"

#define PRIoMAX     "I64o"
#define PRIuMAX     "I64u"
#define PRIxMAX     "I64x"
#define PRIXMAX     "I64X"

#define PRIoPTR     "Io"
#define PRIuPTR     "Iu"
#define PRIxPTR     "Ix"
#define PRIXPTR     "IX"

// The fscanf macros for signed integers are:
#define SCNd16       "hd"
#define SCNi16       "hi"
#define SCNdLEAST16  "hd"
#define SCNiLEAST16  "hi"
#define SCNdFAST16   "hd"
#define SCNiFAST16   "hi"

#define SCNd32       "ld"
#define SCNi32       "li"
#define SCNdLEAST32  "ld"
#define SCNiLEAST32  "li"
#define SCNdFAST32   "ld"
#define SCNiFAST32   "li"

#define SCNd64       "I64d"
#define SCNi64       "I64i"
#define SCNdLEAST64  "I64d"
#define SCNiLEAST64  "I64i"
#define SCNdFAST64   "I64d"
#define SCNiFAST64   "I64i"

#define SCNdMAX     "I64d"
#define SCNiMAX     "I64i"

#ifdef _WIN64 // [
#  define SCNdPTR     "I64d"
#  define SCNiPTR     "I64i"
#else  // _WIN64 ][
#  define SCNdPTR     "ld"
#  define SCNiPTR     "li"
#endif  // _WIN64 ]

// The fscanf macros for unsigned integers are:
#define SCNo16       "ho"
#define SCNu16       "hu"
#define SCNx16       "hx"
#define SCNX16       "hX"
#define SCNoLEAST16  "ho"
#define SCNuLEAST16  "hu"
#define SCNxLEAST16  "hx"
#define SCNXLEAST16  "hX"
#define SCNoFAST16   "ho"
#define SCNuFAST16   "hu"
#define SCNxFAST16   "hx"
#define SCNXFAST16   "hX"

#define SCNo32       "lo"
#define SCNu32       "lu"
#define SCNx32       "lx"
#define SCNX32       "lX"
#define SCNoLEAST32  "lo"
#define SCNuLEAST32  "lu"
#define SCNxLEAST32  "lx"
#define SCNXLEAST32  "lX"
#define SCNoFAST32   "lo"
#define SCNuFAST32   "lu"
#define SCNxFAST32   "lx"
#define SCNXFAST32   "lX"

#define SCNo64       "I64o"
#define SCNu64       "I64u"
#define SCNx64       "I64x"
#define SCNX64       "I64X"
#define SCNoLEAST64  "I64o"
#define SCNuLEAST64  "I64u"
#define SCNxLEAST64  "I64x"
#define SCNXLEAST64  "I64X"
#define SCNoFAST64   "I64o"
#define SCNuFAST64   "I64u"
#define SCNxFAST64   "I64x"
#define SCNXFAST64   "I64X"

#define SCNoMAX     "I64o"
#define SCNuMAX     "I64u"
#define SCNxMAX     "I64x"
#define SCNXMAX     "I64X"

#ifdef _WIN64 // [
#  define SCNoPTR     "I64o"
#  define SCNuPTR     "I64u"
#  define SCNxPTR     "I64x"
#  define SCNXPTR     "I64X"
#else  // _WIN64 ][
#  define SCNoPTR     "lo"
#  define SCNuPTR     "lu"
#  define SCNxPTR     "lx"
#  define SCNXPTR     "lX"
#endif  // _WIN64 ]

#endif // __STDC_FORMAT_MACROS ]

// 7.8.2 Functions for greatest-width integer types

// 7.8.2.1 The imaxabs function
#define imaxabs _abs64

// 7.8.2.2 The imaxdiv function

// This is modified version of div() function from Microsoft's div.c found
// in %MSVC.NET%\crt\src\div.c
#ifdef STATIC_IMAXDIV // [
static
#else // STATIC_IMAXDIV ][
_inline
#endif // STATIC_IMAXDIV ]
imaxdiv_t __cdecl imaxdiv(intmax_t numer, intmax_t denom)
{
   imaxdiv_t result;

   result.quot = numer / denom;
   result.rem = numer % denom;

   if (numer < 0 && result.rem > 0) {
      // did division wrong; must fix up
      ++result.quot;
      result.rem -= denom;
   }

   return result;
}
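
/* Illustrative note (added): the fix-up above guarantees truncation toward
   zero, e.g. imaxdiv(-7, 2) always yields quot == -3 and rem == -1, even on
   a compiler whose native division of negatives rounds downward. */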

// 7.8.2.3 The strtoimax and strtoumax functions
#define strtoimax _strtoi64
#define strtoumax _strtoui64

// 7.8.2.4 The wcstoimax and wcstoumax functions
#define wcstoimax _wcstoi64
#define wcstoumax _wcstoui64


#endif // _MSC_INTTYPES_H_ ]
x264-snapshot-20120103-2245-stable/extras/getopt.h
/* Declarations for getopt.
   Copyright (C) 1989-1994, 1996-1999, 2001 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software Foundation,
   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111-1307, USA.  */

#ifndef _GETOPT_H

#ifndef __need_getopt
# define _GETOPT_H 1
#endif

/* If __GNU_LIBRARY__ is not already defined, either we are being used
   standalone, or this is the first header included in the source file.
   If we are being used with glibc, we need to include <features.h>, but
   that does not exist if we are standalone.  So: if __GNU_LIBRARY__ is
   not defined, include <ctype.h>, which will pull in <features.h> for us
   if it's from glibc.  (Why ctype.h?  It's guaranteed to exist and it
   doesn't flood the namespace with stuff the way some other headers do.)  */
#if !defined __GNU_LIBRARY__
# include <ctype.h>
#endif

#ifdef	__cplusplus
extern "C" {
#endif

/* For communication from `getopt' to the caller.
   When `getopt' finds an option that takes an argument,
   the argument value is returned here.
   Also, when `ordering' is RETURN_IN_ORDER,
   each non-option ARGV-element is returned here.  */

extern char *optarg;

/* Index in ARGV of the next element to be scanned.
   This is used for communication to and from the caller
   and for communication between successive calls to `getopt'.

   On entry to `getopt', zero means this is the first call; initialize.

   When `getopt' returns -1, this is the index of the first of the
   non-option elements that the caller should itself scan.

   Otherwise, `optind' communicates from one call to the next
   how much of ARGV has been scanned so far.  */

extern int optind;

/* Callers store zero here to inhibit the error message `getopt' prints
   for unrecognized options.  */

extern int opterr;

/* Set to an option character which was unrecognized.  */

extern int optopt;

#ifndef __need_getopt
/* Describe the long-named options requested by the application.
   The LONG_OPTIONS argument to getopt_long or getopt_long_only is a vector
   of `struct option' terminated by an element containing a name which is
   zero.

   The field `has_arg' is:
   no_argument		(or 0) if the option does not take an argument,
   required_argument	(or 1) if the option requires an argument,
   optional_argument 	(or 2) if the option takes an optional argument.

   If the field `flag' is not NULL, it points to a variable that is set
   to the value given in the field `val' when the option is found, but
   left unchanged if the option is not found.

   To have a long-named option do something other than set an `int' to
   a compiled-in constant, such as set a value from `optarg', set the
   option's `flag' field to zero and its `val' field to a nonzero
   value (the equivalent single-letter option character, if there is
   one).  For long options that have a zero `flag' field, `getopt'
   returns the contents of the `val' field.  */

struct option
{
# if (defined __STDC__ && __STDC__) || defined __cplusplus
  const char *name;
# else
  char *name;
# endif
  /* has_arg can't be an enum because some compilers complain about
     type mismatches in all the code that assumes it is an int.  */
  int has_arg;
  int *flag;
  int val;
};

/* Names for the values of the `has_arg' field of `struct option'.  */

# define no_argument		0
# define required_argument	1
# define optional_argument	2
#endif	/* need getopt */


/* Get definitions and prototypes for functions to process the
   arguments in ARGV (ARGC of them, minus the program name) for
   options given in OPTS.

   Return the option character from OPTS just read.  Return -1 when
   there are no more options.  For unrecognized options, or options
   missing arguments, `optopt' is set to the option letter, and '?' is
   returned.

   The OPTS string is a list of characters which are recognized option
   letters, optionally followed by colons, specifying that that letter
   takes an argument, to be placed in `optarg'.

   If a letter in OPTS is followed by two colons, its argument is
   optional.  This behavior is specific to the GNU `getopt'.

   The argument `--' causes premature termination of argument
   scanning, explicitly telling `getopt' that there are no more
   options.

   If OPTS begins with `--', then non-option arguments are treated as
   arguments to the option '\0'.  This behavior is specific to the GNU
   `getopt'.  */
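
/* Minimal usage sketch (illustrative, added): `o:' declares that -o takes an
   argument (returned in `optarg'), and `getopt' returns '?' for unrecognized
   options; `output_file' and `print_help' are placeholders.

     int c;
     while ((c = getopt (argc, argv, "o:h")) != -1)
       switch (c)
         {
         case 'o': output_file = optarg; break;
         case 'h': print_help ();        break;
         default:  return 1;
         }
 */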

#if (defined __STDC__ && __STDC__) || defined __cplusplus
# ifdef __GNU_LIBRARY__
/* Many other libraries have conflicting prototypes for getopt, with
   differences in the consts, in stdlib.h.  To avoid compilation
   errors, only prototype getopt for the GNU C library.  */
extern int getopt (int __argc, char *const *__argv, const char *__shortopts);
# else /* not __GNU_LIBRARY__ */
extern int getopt ();
# endif /* __GNU_LIBRARY__ */

# ifndef __need_getopt
extern int getopt_long (int __argc, char *const *__argv, const char *__shortopts,
		        const struct option *__longopts, int *__longind);
extern int getopt_long_only (int __argc, char *const *__argv,
			     const char *__shortopts,
		             const struct option *__longopts, int *__longind);

/* Internal only.  Users should not call this directly.  */
extern int _getopt_internal (int __argc, char *const *__argv,
			     const char *__shortopts,
		             const struct option *__longopts, int *__longind,
			     int __long_only);
# endif
#else /* not __STDC__ */
extern int getopt ();
# ifndef __need_getopt
extern int getopt_long ();
extern int getopt_long_only ();

extern int _getopt_internal ();
# endif
#endif /* __STDC__ */

#ifdef	__cplusplus
}
#endif

/* Make sure we later can get all the definitions and declarations.  */
#undef __need_getopt

#endif /* getopt.h */
x264-snapshot-20120103-2245-stable/extras/getopt.c
/* Getopt for GNU.
   NOTE: getopt is now part of the C library, so if you don't know what
   "Keep this file name-space clean" means, talk to drepper@gnu.org
   before changing it!
   Copyright (C) 1987,88,89,90,91,92,93,94,95,96,98,99,2000,2001
   	Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, write to the Free
   Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
   02111-1307 USA.  */

/* This tells Alpha OSF/1 not to define a getopt prototype in <stdio.h>.
   Ditto for AIX 3.2 and <stdlib.h>.  */
#ifndef _NO_PROTO
# define _NO_PROTO
#endif

#ifdef HAVE_CONFIG_H
# include <config.h>
#endif

#if !defined __STDC__ || !__STDC__
/* This is a separate conditional since some stdc systems
   reject `defined (const)'.  */
# ifndef const
#  define const
# endif
#endif

#include <stdio.h>

/* Comment out all this code if we are using the GNU C Library, and are not
   actually compiling the library itself.  This code is part of the GNU C
   Library, but also included in many other GNU distributions.  Compiling
   and linking in this code is a waste when using the GNU C library
   (especially if it is a shared library).  Rather than having every GNU
   program understand `configure --with-gnu-libc' and omit the object files,
   it is simpler to just do this in the source for each such file.  */

#define GETOPT_INTERFACE_VERSION 2
#if !defined _LIBC && defined __GLIBC__ && __GLIBC__ >= 2
# include <gnu-versions.h>
# if _GNU_GETOPT_INTERFACE_VERSION == GETOPT_INTERFACE_VERSION
#  define ELIDE_CODE
# endif
#endif

#ifndef ELIDE_CODE


/* This needs to come after some library #include
   to get __GNU_LIBRARY__ defined.  */
#ifdef	__GNU_LIBRARY__
/* Don't include stdlib.h for non-GNU C libraries because some of them
   contain conflicting prototypes for getopt.  */
# include <stdlib.h>
# include <unistd.h>
#endif	/* GNU C library.  */

#ifdef VMS
# include <unixlib.h>
# if HAVE_STRING_H - 0
#  include <string.h>
# endif
#endif

#ifndef _
/* This is for other GNU distributions with internationalized messages.  */
# if (HAVE_LIBINTL_H && ENABLE_NLS) || defined _LIBC
#  include <libintl.h>
#  ifndef _
#   define _(msgid)	gettext (msgid)
#  endif
# else
#  define _(msgid)	(msgid)
# endif
#endif

/* This version of `getopt' appears to the caller like standard Unix `getopt'
   but it behaves differently for the user, since it allows the user
   to intersperse the options with the other arguments.

   As `getopt' works, it permutes the elements of ARGV so that,
   when it is done, all the options precede everything else.  Thus
   all application programs are extended to handle flexible argument order.

   Setting the environment variable POSIXLY_CORRECT disables permutation.
   Then the behavior is completely standard.

   GNU application programs can use a third alternative mode in which
   they can distinguish the relative order of options and other arguments.  */

#include "getopt.h"

/* For communication from `getopt' to the caller.
   When `getopt' finds an option that takes an argument,
   the argument value is returned here.
   Also, when `ordering' is RETURN_IN_ORDER,
   each non-option ARGV-element is returned here.  */

char *optarg;

/* Index in ARGV of the next element to be scanned.
   This is used for communication to and from the caller
   and for communication between successive calls to `getopt'.

   On entry to `getopt', zero means this is the first call; initialize.

   When `getopt' returns -1, this is the index of the first of the
   non-option elements that the caller should itself scan.

   Otherwise, `optind' communicates from one call to the next
   how much of ARGV has been scanned so far.  */

/* 1003.2 says this must be 1 before any call.  */
int optind = 1;

/* Formerly, initialization of getopt depended on optind==0, which
   caused problems with re-calling getopt, as programs generally don't
   know to reset it. */

int __getopt_initialized;

/* The next char to be scanned in the option-element
   in which the last option character we returned was found.
   This allows us to pick up the scan where we left off.

   If this is zero, or a null string, it means resume the scan
   by advancing to the next ARGV-element.  */

static char *nextchar;

/* Callers store zero here to inhibit the error message
   for unrecognized options.  */

int opterr = 1;

/* Set to an option character which was unrecognized.
   This must be initialized on some systems to avoid linking in the
   system's own getopt implementation.  */

int optopt = '?';

/* Describe how to deal with options that follow non-option ARGV-elements.

   If the caller did not specify anything,
   the default is REQUIRE_ORDER if the environment variable
   POSIXLY_CORRECT is defined, PERMUTE otherwise.

   REQUIRE_ORDER means don't recognize them as options;
   stop option processing when the first non-option is seen.
   This is what Unix does.
   This mode of operation is selected by either setting the environment
   variable POSIXLY_CORRECT, or using `+' as the first character
   of the list of option characters.

   PERMUTE is the default.  We permute the contents of ARGV as we scan,
   so that eventually all the non-options are at the end.  This allows options
   to be given in any order, even with programs that were not written to
   expect this.

   RETURN_IN_ORDER is an option available to programs that were written
   to expect options and other ARGV-elements in any order and that care about
   the ordering of the two.  We describe each non-option ARGV-element
   as if it were the argument of an option with character code 1.
   Using `-' as the first character of the list of option characters
   selects this mode of operation.

   The special argument `--' forces an end of option-scanning regardless
   of the value of `ordering'.  In the case of RETURN_IN_ORDER, only
   `--' can cause `getopt' to return -1 with `optind' != ARGC.  */
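
/* For example: with the same option letters, the optstring "ab:" gets
   PERMUTE (or REQUIRE_ORDER if POSIXLY_CORRECT is set), "+ab:" forces
   REQUIRE_ORDER, and "-ab:" selects RETURN_IN_ORDER, in which each
   non-option ARGV-element is reported as the argument of an option
   with character code 1.  */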

static enum
{
  REQUIRE_ORDER, PERMUTE, RETURN_IN_ORDER
} ordering;

/* Value of POSIXLY_CORRECT environment variable.  */
static char *posixly_correct;

#ifdef	__GNU_LIBRARY__
/* We want to avoid inclusion of string.h with non-GNU libraries
   because there are many ways it can cause trouble.
   On some systems, it contains special magic macros that don't work
   in GCC.  */
# include <string.h>
# define my_index	strchr
#else

# if HAVE_STRING_H
#  include <string.h>
# else
#  include <strings.h>
# endif

/* Avoid depending on library functions or files
   whose names are inconsistent.  */

#ifndef getenv
extern char *getenv ();
#endif

static char *
my_index (str, chr)
     const char *str;
     int chr;
{
  while (*str)
    {
      if (*str == chr)
	return (char *) str;
      str++;
    }
  return 0;
}

/* If using GCC, we can safely declare strlen this way.
   If not using GCC, it is ok not to declare it.  */
#ifdef __GNUC__
/* Note that Motorola Delta 68k R3V7 comes with GCC but not stddef.h.
   That was relevant to code that was here before.  */
# if (!defined __STDC__ || !__STDC__) && !defined strlen
/* gcc with -traditional declares the built-in strlen to return int,
   and has done so at least since version 2.4.5. -- rms.  */
extern int strlen (const char *);
# endif /* not __STDC__ */
#endif /* __GNUC__ */

#endif /* not __GNU_LIBRARY__ */

/* Handle permutation of arguments.  */

/* Describe the part of ARGV that contains non-options that have
   been skipped.  `first_nonopt' is the index in ARGV of the first of them;
   `last_nonopt' is the index after the last of them.  */

static int first_nonopt;
static int last_nonopt;

#ifdef _LIBC
/* Stored original parameters.
   XXX This is not a good solution.  We should instead copy the args so
   that we can compare them later.  But we must not use malloc(3).  */
extern int __libc_argc;
extern char **__libc_argv;

/* Bash 2.0 gives us an environment variable containing flags
   indicating ARGV elements that should not be considered arguments.  */

# ifdef USE_NONOPTION_FLAGS
/* Defined in getopt_init.c  */
extern char *__getopt_nonoption_flags;

static int nonoption_flags_max_len;
static int nonoption_flags_len;
# endif

# ifdef USE_NONOPTION_FLAGS
#  define SWAP_FLAGS(ch1, ch2) \
  if (nonoption_flags_len > 0)						      \
    {									      \
      char __tmp = __getopt_nonoption_flags[ch1];			      \
      __getopt_nonoption_flags[ch1] = __getopt_nonoption_flags[ch2];	      \
      __getopt_nonoption_flags[ch2] = __tmp;				      \
    }
# else
#  define SWAP_FLAGS(ch1, ch2)
# endif
#else	/* !_LIBC */
# define SWAP_FLAGS(ch1, ch2)
#endif	/* _LIBC */

/* Exchange two adjacent subsequences of ARGV.
   One subsequence is elements [first_nonopt,last_nonopt)
   which contains all the non-options that have been skipped so far.
   The other is elements [last_nonopt,optind), which contains all
   the options processed since those non-options were skipped.

   `first_nonopt' and `last_nonopt' are relocated so that they describe
   the new indices of the non-options in ARGV after they are moved.  */
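
/* Worked example: with argv = { "prog", "file1", "file2", "-a", "-b" },
   first_nonopt = 1, last_nonopt = 3 and optind = 5, `exchange' produces
   { "prog", "-a", "-b", "file1", "file2" } and relocates
   first_nonopt = 3, last_nonopt = 5.  */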

#if defined __STDC__ && __STDC__
static void exchange (char **);
#endif

static void
exchange (argv)
     char **argv;
{
  int bottom = first_nonopt;
  int middle = last_nonopt;
  int top = optind;
  char *tem;

  /* Exchange the shorter segment with the far end of the longer segment.
     That puts the shorter segment into the right place.
     It leaves the longer segment in the right place overall,
     but it consists of two parts that need to be swapped next.  */

#if defined _LIBC && defined USE_NONOPTION_FLAGS
  /* First make sure the handling of the `__getopt_nonoption_flags'
     string can work normally.  Our top argument must be in the range
     of the string.  */
  if (nonoption_flags_len > 0 && top >= nonoption_flags_max_len)
    {
      /* We must extend the array.  The user plays games with us and
	 presents new arguments.  */
      char *new_str = malloc (top + 1);
      if (new_str == NULL)
	nonoption_flags_len = nonoption_flags_max_len = 0;
      else
	{
	  memset (__mempcpy (new_str, __getopt_nonoption_flags,
			     nonoption_flags_max_len),
		  '\0', top + 1 - nonoption_flags_max_len);
	  nonoption_flags_max_len = top + 1;
	  __getopt_nonoption_flags = new_str;
	}
    }
#endif

  while (top > middle && middle > bottom)
    {
      if (top - middle > middle - bottom)
	{
	  /* Bottom segment is the short one.  */
	  int len = middle - bottom;
	  register int i;

	  /* Swap it with the top part of the top segment.  */
	  for (i = 0; i < len; i++)
	    {
	      tem = argv[bottom + i];
	      argv[bottom + i] = argv[top - (middle - bottom) + i];
	      argv[top - (middle - bottom) + i] = tem;
	      SWAP_FLAGS (bottom + i, top - (middle - bottom) + i);
	    }
	  /* Exclude the moved bottom segment from further swapping.  */
	  top -= len;
	}
      else
	{
	  /* Top segment is the short one.  */
	  int len = top - middle;
	  register int i;

	  /* Swap it with the bottom part of the bottom segment.  */
	  for (i = 0; i < len; i++)
	    {
	      tem = argv[bottom + i];
	      argv[bottom + i] = argv[middle + i];
	      argv[middle + i] = tem;
	      SWAP_FLAGS (bottom + i, middle + i);
	    }
	  /* Exclude the moved top segment from further swapping.  */
	  bottom += len;
	}
    }

  /* Update records for the slots the non-options now occupy.  */

  first_nonopt += (optind - last_nonopt);
  last_nonopt = optind;
}

/* Initialize the internal data when the first call is made.  */

#if defined __STDC__ && __STDC__
static const char *_getopt_initialize (int, char *const *, const char *);
#endif
static const char *
_getopt_initialize (argc, argv, optstring)
     int argc;
     char *const *argv;
     const char *optstring;
{
  /* Start processing options with ARGV-element 1 (since ARGV-element 0
     is the program name); the sequence of previously skipped
     non-option ARGV-elements is empty.  */

  first_nonopt = last_nonopt = optind;

  nextchar = NULL;

  posixly_correct = getenv ("POSIXLY_CORRECT");

  /* Determine how to handle the ordering of options and nonoptions.  */

  if (optstring[0] == '-')
    {
      ordering = RETURN_IN_ORDER;
      ++optstring;
    }
  else if (optstring[0] == '+')
    {
      ordering = REQUIRE_ORDER;
      ++optstring;
    }
  else if (posixly_correct != NULL)
    ordering = REQUIRE_ORDER;
  else
    ordering = PERMUTE;

#if defined _LIBC && defined USE_NONOPTION_FLAGS
  if (posixly_correct == NULL
      && argc == __libc_argc && argv == __libc_argv)
    {
      if (nonoption_flags_max_len == 0)
	{
	  if (__getopt_nonoption_flags == NULL
	      || __getopt_nonoption_flags[0] == '\0')
	    nonoption_flags_max_len = -1;
	  else
	    {
	      const char *orig_str = __getopt_nonoption_flags;
	      int len = nonoption_flags_max_len = strlen (orig_str);
	      if (nonoption_flags_max_len < argc)
		nonoption_flags_max_len = argc;
	      __getopt_nonoption_flags =
		(char *) malloc (nonoption_flags_max_len);
	      if (__getopt_nonoption_flags == NULL)
		nonoption_flags_max_len = -1;
	      else
		memset (__mempcpy (__getopt_nonoption_flags, orig_str, len),
			'\0', nonoption_flags_max_len - len);
	    }
	}
      nonoption_flags_len = nonoption_flags_max_len;
    }
  else
    nonoption_flags_len = 0;
#endif

  return optstring;
}

/* Scan elements of ARGV (whose length is ARGC) for option characters
   given in OPTSTRING.

   If an element of ARGV starts with '-', and is not exactly "-" or "--",
   then it is an option element.  The characters of this element
   (aside from the initial '-') are option characters.  If `getopt'
   is called repeatedly, it returns successively each of the option characters
   from each of the option elements.

   If `getopt' finds another option character, it returns that character,
   updating `optind' and `nextchar' so that the next call to `getopt' can
   resume the scan with the following option character or ARGV-element.

   If there are no more option characters, `getopt' returns -1.
   Then `optind' is the index in ARGV of the first ARGV-element
   that is not an option.  (The ARGV-elements have been permuted
   so that those that are not options now come last.)

   OPTSTRING is a string containing the legitimate option characters.
   If an option character is seen that is not listed in OPTSTRING,
   return '?' after printing an error message.  If you set `opterr' to
   zero, the error message is suppressed but we still return '?'.

   If a char in OPTSTRING is followed by a colon, that means it wants an arg,
   so the following text in the same ARGV-element, or the text of the following
   ARGV-element, is returned in `optarg'.  Two colons mean an option that
   wants an optional arg; if there is text in the current ARGV-element,
   it is returned in `optarg', otherwise `optarg' is set to zero.

   If OPTSTRING starts with `-' or `+', it requests different methods of
   handling the non-option ARGV-elements.
   See the comments about RETURN_IN_ORDER and REQUIRE_ORDER, above.

   Long-named options begin with `--' instead of `-'.
   Their names may be abbreviated as long as the abbreviation is unique
   or is an exact match for some defined option.  If they have an
   argument, it follows the option name in the same ARGV-element, separated
   from the option name by a `=', or else in the next ARGV-element.
   When `getopt' finds a long-named option, it returns 0 if that option's
   `flag' field is nonzero, the value of the option's `val' field
   if the `flag' field is zero.

   The elements of ARGV aren't really const, because we permute them.
   But we pretend they're const in the prototype to be compatible
   with other systems.

   LONGOPTS is a vector of `struct option' terminated by an
   element containing a name which is zero.

   LONGIND returns the index in LONGOPTS of the long-named option found.
   It is only valid when a long-named option has been found by the most
   recent call.

   If LONG_ONLY is nonzero, '-' as well as '--' can introduce
   long-named options.  */
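
/* A hypothetical caller, for illustration (the names `verbose_flag' and
   `longopts' are invented here):

     static int verbose_flag;
     static const struct option longopts[] =
     {
       {"verbose", 0, &verbose_flag, 1},   // flag set; getopt_long returns 0
       {"output",  1, 0, 'o'},             // takes an arg; returns 'o'
       {0, 0, 0, 0}
     };
     int idx;
     int c = getopt_long (argc, argv, "o:", longopts, &idx);

   "--out=x" matches "output" by unambiguous abbreviation and leaves "x"
   in `optarg'; "--verbose" stores 1 through the `flag' pointer and makes
   getopt_long return 0.  */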

int
_getopt_internal (argc, argv, optstring, longopts, longind, long_only)
     int argc;
     char *const *argv;
     const char *optstring;
     const struct option *longopts;
     int *longind;
     int long_only;
{
  int print_errors = opterr;
  if (optstring[0] == ':')
    print_errors = 0;

  if (argc < 1)
    return -1;

  optarg = NULL;

  if (optind == 0 || !__getopt_initialized)
    {
      if (optind == 0)
	optind = 1;	/* Don't scan ARGV[0], the program name.  */
      optstring = _getopt_initialize (argc, argv, optstring);
      __getopt_initialized = 1;
    }

  /* Test whether ARGV[optind] points to a non-option argument.
     Either it does not have option syntax, or there is an environment flag
     from the shell indicating it is not an option.  The latter information
     is only used when this code is used in the GNU libc.  */
#if defined _LIBC && defined USE_NONOPTION_FLAGS
# define NONOPTION_P (argv[optind][0] != '-' || argv[optind][1] == '\0'	      \
		      || (optind < nonoption_flags_len			      \
			  && __getopt_nonoption_flags[optind] == '1'))
#else
# define NONOPTION_P (argv[optind][0] != '-' || argv[optind][1] == '\0')
#endif

  if (nextchar == NULL || *nextchar == '\0')
    {
      /* Advance to the next ARGV-element.  */

      /* Give FIRST_NONOPT & LAST_NONOPT rational values if OPTIND has been
	 moved back by the user (who may also have changed the arguments).  */
      if (last_nonopt > optind)
	last_nonopt = optind;
      if (first_nonopt > optind)
	first_nonopt = optind;

      if (ordering == PERMUTE)
	{
	  /* If we have just processed some options following some non-options,
	     exchange them so that the options come first.  */

	  if (first_nonopt != last_nonopt && last_nonopt != optind)
	    exchange ((char **) argv);
	  else if (last_nonopt != optind)
	    first_nonopt = optind;

	  /* Skip any additional non-options
	     and extend the range of non-options previously skipped.  */

	  while (optind < argc && NONOPTION_P)
	    optind++;
	  last_nonopt = optind;
	}

      /* The special ARGV-element `--' means premature end of options.
	 Skip it like a null option,
	 then exchange with previous non-options as if it were an option,
	 then skip everything else like a non-option.  */

      if (optind != argc && !strcmp (argv[optind], "--"))
	{
	  optind++;

	  if (first_nonopt != last_nonopt && last_nonopt != optind)
	    exchange ((char **) argv);
	  else if (first_nonopt == last_nonopt)
	    first_nonopt = optind;
	  last_nonopt = argc;

	  optind = argc;
	}

      /* If we have done all the ARGV-elements, stop the scan
	 and back over any non-options that we skipped and permuted.  */

      if (optind == argc)
	{
	  /* Set the next-arg-index to point at the non-options
	     that we previously skipped, so the caller will digest them.  */
	  if (first_nonopt != last_nonopt)
	    optind = first_nonopt;
	  return -1;
	}

      /* If we have come to a non-option and did not permute it,
	 either stop the scan or describe it to the caller and pass it by.  */

      if (NONOPTION_P)
	{
	  if (ordering == REQUIRE_ORDER)
	    return -1;
	  optarg = argv[optind++];
	  return 1;
	}

      /* We have found another option-ARGV-element.
	 Skip the initial punctuation.  */

      nextchar = (argv[optind] + 1
		  + (longopts != NULL && argv[optind][1] == '-'));
    }

  /* Decode the current option-ARGV-element.  */

  /* Check whether the ARGV-element is a long option.

     If long_only and the ARGV-element has the form "-f", where f is
     a valid short option, don't consider it an abbreviated form of
     a long option that starts with f.  Otherwise there would be no
     way to give the -f short option.

     On the other hand, if there's a long option "fubar" and
     the ARGV-element is "-fu", do consider that an abbreviation of
     the long option, just like "--fu", and not "-f" with arg "u".

     This distinction seems to be the most useful approach.  */

  if (longopts != NULL
      && (argv[optind][1] == '-'
	  || (long_only && (argv[optind][2] || !my_index (optstring, argv[optind][1])))))
    {
      char *nameend;
      const struct option *p;
      const struct option *pfound = NULL;
      int exact = 0;
      int ambig = 0;
      int indfound = -1;
      int option_index;

      for (nameend = nextchar; *nameend && *nameend != '='; nameend++)
	/* Do nothing.  */ ;

      /* Test all long options for either exact match
	 or abbreviated matches.  */
      for (p = longopts, option_index = 0; p->name; p++, option_index++)
	if (!strncmp (p->name, nextchar, nameend - nextchar))
	  {
	    if ((unsigned int) (nameend - nextchar)
		== (unsigned int) strlen (p->name))
	      {
		/* Exact match found.  */
		pfound = p;
		indfound = option_index;
		exact = 1;
		break;
	      }
	    else if (pfound == NULL)
	      {
		/* First nonexact match found.  */
		pfound = p;
		indfound = option_index;
	      }
	    else if (long_only
		     || pfound->has_arg != p->has_arg
		     || pfound->flag != p->flag
		     || pfound->val != p->val)
	      /* Second or later nonexact match found.  */
	      ambig = 1;
	  }

      if (ambig && !exact)
	{
	  if (print_errors)
	    fprintf (stderr, _("%s: option `%s' is ambiguous\n"),
		     argv[0], argv[optind]);
	  nextchar += strlen (nextchar);
	  optind++;
	  optopt = 0;
	  return '?';
	}

      if (pfound != NULL)
	{
	  option_index = indfound;
	  optind++;
	  if (*nameend)
	    {
	      /* Don't test has_arg with >, because some C compilers don't
		 allow it to be used on enums.  */
	      if (pfound->has_arg)
		optarg = nameend + 1;
	      else
		{
		  if (print_errors)
		    {
		      if (argv[optind - 1][1] == '-')
			/* --option */
			fprintf (stderr,
				 _("%s: option `--%s' doesn't allow an argument\n"),
				 argv[0], pfound->name);
		      else
			/* +option or -option */
			fprintf (stderr,
				 _("%s: option `%c%s' doesn't allow an argument\n"),
				 argv[0], argv[optind - 1][0], pfound->name);
		    }

		  nextchar += strlen (nextchar);

		  optopt = pfound->val;
		  return '?';
		}
	    }
	  else if (pfound->has_arg == 1)
	    {
	      if (optind < argc)
		optarg = argv[optind++];
	      else
		{
		  if (print_errors)
		    fprintf (stderr,
			   _("%s: option `%s' requires an argument\n"),
			   argv[0], argv[optind - 1]);
		  nextchar += strlen (nextchar);
		  optopt = pfound->val;
		  return optstring[0] == ':' ? ':' : '?';
		}
	    }
	  nextchar += strlen (nextchar);
	  if (longind != NULL)
	    *longind = option_index;
	  if (pfound->flag)
	    {
	      *(pfound->flag) = pfound->val;
	      return 0;
	    }
	  return pfound->val;
	}

      /* Can't find it as a long option.  If this is not getopt_long_only,
	 or the option starts with '--' or is not a valid short
	 option, then it's an error.
	 Otherwise interpret it as a short option.  */
      if (!long_only || argv[optind][1] == '-'
	  || my_index (optstring, *nextchar) == NULL)
	{
	  if (print_errors)
	    {
	      if (argv[optind][1] == '-')
		/* --option */
		fprintf (stderr, _("%s: unrecognized option `--%s'\n"),
			 argv[0], nextchar);
	      else
		/* +option or -option */
		fprintf (stderr, _("%s: unrecognized option `%c%s'\n"),
			 argv[0], argv[optind][0], nextchar);
	    }
	  nextchar = (char *) "";
	  optind++;
	  optopt = 0;
	  return '?';
	}
    }

  /* Look at and handle the next short option-character.  */

  {
    char c = *nextchar++;
    char *temp = my_index (optstring, c);

    /* Increment `optind' when we start to process its last character.  */
    if (*nextchar == '\0')
      ++optind;

    if (temp == NULL || c == ':')
      {
	if (print_errors)
	  {
	    if (posixly_correct)
	      /* 1003.2 specifies the format of this message.  */
	      fprintf (stderr, _("%s: illegal option -- %c\n"),
		       argv[0], c);
	    else
	      fprintf (stderr, _("%s: invalid option -- %c\n"),
		       argv[0], c);
	  }
	optopt = c;
	return '?';
      }
    /* Convenience. Treat POSIX -W foo the same as the long option --foo */
    if (temp[0] == 'W' && temp[1] == ';')
      {
	char *nameend;
	const struct option *p;
	const struct option *pfound = NULL;
	int exact = 0;
	int ambig = 0;
	int indfound = 0;
	int option_index;

	/* This is an option that requires an argument.  */
	if (*nextchar != '\0')
	  {
	    optarg = nextchar;
	    /* If we end this ARGV-element by taking the rest as an arg,
	       we must advance to the next element now.  */
	    optind++;
	  }
	else if (optind == argc)
	  {
	    if (print_errors)
	      {
		/* 1003.2 specifies the format of this message.  */
		fprintf (stderr, _("%s: option requires an argument -- %c\n"),
			 argv[0], c);
	      }
	    optopt = c;
	    if (optstring[0] == ':')
	      c = ':';
	    else
	      c = '?';
	    return c;
	  }
	else
	  /* We already incremented `optind' once;
	     increment it again when taking next ARGV-elt as argument.  */
	  optarg = argv[optind++];

	/* optarg is now the argument, see if it's in the
	   table of longopts.  */

	for (nextchar = nameend = optarg; *nameend && *nameend != '='; nameend++)
	  /* Do nothing.  */ ;

	/* Test all long options for either exact match
	   or abbreviated matches.  */
	for (p = longopts, option_index = 0; p->name; p++, option_index++)
	  if (!strncmp (p->name, nextchar, nameend - nextchar))
	    {
	      if ((unsigned int) (nameend - nextchar) == strlen (p->name))
		{
		  /* Exact match found.  */
		  pfound = p;
		  indfound = option_index;
		  exact = 1;
		  break;
		}
	      else if (pfound == NULL)
		{
		  /* First nonexact match found.  */
		  pfound = p;
		  indfound = option_index;
		}
	      else
		/* Second or later nonexact match found.  */
		ambig = 1;
	    }
	if (ambig && !exact)
	  {
	    if (print_errors)
	      fprintf (stderr, _("%s: option `-W %s' is ambiguous\n"),
		       argv[0], argv[optind]);
	    nextchar += strlen (nextchar);
	    optind++;
	    return '?';
	  }
	if (pfound != NULL)
	  {
	    option_index = indfound;
	    if (*nameend)
	      {
		/* Don't test has_arg with >, because some C compilers don't
		   allow it to be used on enums.  */
		if (pfound->has_arg)
		  optarg = nameend + 1;
		else
		  {
		    if (print_errors)
		      fprintf (stderr, _("\
%s: option `-W %s' doesn't allow an argument\n"),
			       argv[0], pfound->name);

		    nextchar += strlen (nextchar);
		    return '?';
		  }
	      }
	    else if (pfound->has_arg == 1)
	      {
		if (optind < argc)
		  optarg = argv[optind++];
		else
		  {
		    if (print_errors)
		      fprintf (stderr,
			       _("%s: option `%s' requires an argument\n"),
			       argv[0], argv[optind - 1]);
		    nextchar += strlen (nextchar);
		    return optstring[0] == ':' ? ':' : '?';
		  }
	      }
	    nextchar += strlen (nextchar);
	    if (longind != NULL)
	      *longind = option_index;
	    if (pfound->flag)
	      {
		*(pfound->flag) = pfound->val;
		return 0;
	      }
	    return pfound->val;
	  }
	nextchar = NULL;
	return 'W';	/* Let the application handle it.   */
      }
    if (temp[1] == ':')
      {
	if (temp[2] == ':')
	  {
	    /* This is an option that accepts an argument optionally.  */
	    if (*nextchar != '\0')
	      {
		optarg = nextchar;
		optind++;
	      }
	    else
	      optarg = NULL;
	    nextchar = NULL;
	  }
	else
	  {
	    /* This is an option that requires an argument.  */
	    if (*nextchar != '\0')
	      {
		optarg = nextchar;
		/* If we end this ARGV-element by taking the rest as an arg,
		   we must advance to the next element now.  */
		optind++;
	      }
	    else if (optind == argc)
	      {
		if (print_errors)
		  {
		    /* 1003.2 specifies the format of this message.  */
		    fprintf (stderr,
			     _("%s: option requires an argument -- %c\n"),
			     argv[0], c);
		  }
		optopt = c;
		if (optstring[0] == ':')
		  c = ':';
		else
		  c = '?';
	      }
	    else
	      /* We already incremented `optind' once;
		 increment it again when taking next ARGV-elt as argument.  */
	      optarg = argv[optind++];
	    nextchar = NULL;
	  }
      }
    return c;
  }
}

int
getopt (argc, argv, optstring)
     int argc;
     char *const *argv;
     const char *optstring;
{
  return _getopt_internal (argc, argv, optstring,
			   (const struct option *) 0,
			   (int *) 0,
			   0);
}

int getopt_long (argc, argv, optstring, long_options, opt_index)
     int argc;
     char *const *argv;
     const char *optstring;
     const struct option *long_options;
     int *opt_index;
{
  return _getopt_internal (argc, argv, optstring, long_options, opt_index, 0);
}

#endif	/* Not ELIDE_CODE.  */

#ifdef TEST

/* Compile with -DTEST to make an executable for use in testing
   the above definition of `getopt'.  */

int
main (argc, argv)
     int argc;
     char **argv;
{
  int c;
  int digit_optind = 0;

  while (1)
    {
      int this_option_optind = optind ? optind : 1;

      c = getopt (argc, argv, "abc:d:0123456789");
      if (c == -1)
	break;

      switch (c)
	{
	case '0':
	case '1':
	case '2':
	case '3':
	case '4':
	case '5':
	case '6':
	case '7':
	case '8':
	case '9':
	  if (digit_optind != 0 && digit_optind != this_option_optind)
	    printf ("digits occur in two different argv-elements.\n");
	  digit_optind = this_option_optind;
	  printf ("option %c\n", c);
	  break;

	case 'a':
	  printf ("option a\n");
	  break;

	case 'b':
	  printf ("option b\n");
	  break;

	case 'c':
	  printf ("option c with value `%s'\n", optarg);
	  break;

	case '?':
	  break;

	default:
	  printf ("?? getopt returned character code 0%o ??\n", c);
	}
    }

  if (optind < argc)
    {
      printf ("non-option ARGV-elements: ");
      while (optind < argc)
	printf ("%s ", argv[optind++]);
      printf ("\n");
    }

  exit (0);
}

#endif /* TEST */
x264-snapshot-20120103-2245-stable/extras/gas-preprocessor.pl0000755000175000001440000001774111700673342023047 0ustar  videolanusers#!/usr/bin/env perl
# by David Conrad
# This code is licensed under GPLv2 or later; go to gnu.org to read it
#  (not that it much matters for an asm preprocessor)
# usage: set your assembler to be something like "perl gas-preprocessor.pl gcc"
use strict;

# Apple's gas is ancient and doesn't support modern preprocessing features like
# .rept and has ugly macro syntax, among other things. Thus, this script
# implements the subset of the gas preprocessor used by x264 and ffmpeg
# that isn't supported by Apple's gas.

my @gcc_cmd = @ARGV;
my @preprocess_c_cmd;

if (grep /\.c$/, @gcc_cmd) {
    # C file (inline asm?) - compile
    @preprocess_c_cmd = (@gcc_cmd, "-S");
} elsif (grep /\.S$/, @gcc_cmd) {
    # asm file, just do C preprocessor
    @preprocess_c_cmd = (@gcc_cmd, "-E");
} else {
    die "Unrecognized input filetype";
}
@gcc_cmd = map { /\.[cS]$/ ? qw(-x assembler -) : $_ } @gcc_cmd;
@preprocess_c_cmd = map { /\.o$/ ? "-" : $_ } @preprocess_c_cmd;

open(ASMFILE, "-|", @preprocess_c_cmd) || die "Error running preprocessor";

my $current_macro = '';
my %macro_lines;
my %macro_args;
my %macro_args_default;

my @pass1_lines;

# pass 1: parse .macro
# note that the handling of arguments is probably overly permissive vs. gas
# but it should be the same for valid cases
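#
# For illustration, a hypothetical input such as
#     .macro mov8 dst, src=r0
#         mov \dst, \src
#     .endm
#     mov8 r1
# is collected below and later expanded by expand_macros() into
#     mov r1, r0
# with the default value standing in for the omitted argument.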
while (<ASMFILE>) {
    # comment out unsupported directives
    s/\.type/@.type/x;
    s/\.func/@.func/x;
    s/\.endfunc/@.endfunc/x;
    s/\.ltorg/@.ltorg/x;
    s/\.size/@.size/x;
    s/\.fpu/@.fpu/x;

    # the syntax for these is a little different
    s/\.global/.globl/x;
    # also catch .section .rodata since the equivalent to .const_data is .section __DATA,__const
    s/(.*)\.rodata/.const_data/x;
    s/\.int/.long/x;
    s/\.float/.single/x;

    # catch unknown section names that aren't mach-o style (with a comma)
    if (/.section ([^,]*)$/) {
        die ".section $1 unsupported; figure out the mach-o section name and add it";
    }

    # macros creating macros are not handled (is that valid?)
    if (/\.macro\s+([\d\w\.]+)\s*(.*)/) {
        $current_macro = $1;

        # commas in the argument list are optional, so only use whitespace as the separator
        my $arglist = $2;
        $arglist =~ s/,/ /g;

        my @args = split(/\s+/, $arglist);
        foreach my $i (0 .. $#args) {
            my @argpair = split(/=/, $args[$i]);
            $macro_args{$current_macro}[$i] = $argpair[0];
            $argpair[0] =~ s/:vararg$//;
            $macro_args_default{$current_macro}{$argpair[0]} = $argpair[1];
        }
        # ensure %macro_lines has the macro name added as a key
        $macro_lines{$current_macro} = [];
    } elsif (/\.endm/) {
        if (!$current_macro) {
            die "ERROR: .endm without .macro";
        }
        $current_macro = '';
    } elsif ($current_macro) {
        push(@{$macro_lines{$current_macro}}, $_);
    } else {
        expand_macros($_);
    }
}

sub expand_macros {
    my $line = $_[0];
    if ($line =~ /(\S+:|)\s*([\w\d\.]+)\s*(.*)/ && exists $macro_lines{$2}) {
        push(@pass1_lines, $1);
        my $macro = $2;

        # commas are optional here too, but are syntactically important because
        # parameters can be blank
        my @arglist = split(/,/, $3);
        my @args;
        foreach (@arglist) {
            my @whitespace_split = split(/\s+/, $_);
            if (!@whitespace_split) {
                push(@args, '');
            } else {
                foreach (@whitespace_split) {
                    if (length($_)) {
                        push(@args, $_);
                    }
                }
            }
        }

        my %replacements;
        if ($macro_args_default{$macro}){
            %replacements = %{$macro_args_default{$macro}};
        }

        # construct hashtable of text to replace
        foreach my $i (0 .. $#args) {
            my $argname = $macro_args{$macro}[$i];

            if ($args[$i] =~ m/=/) {
                # arg=val references the argument name
                # XXX: I'm not sure what the expected behaviour is when a lot
                # of these are mixed with unnamed args
                my @named_arg = split(/=/, $args[$i]);
                $replacements{$named_arg[0]} = $named_arg[1];
            } elsif ($i > $#{$macro_args{$macro}}) {
                # more args given than the macro has named args
                # XXX: is vararg allowed on arguments before the last?
                $argname = $macro_args{$macro}[-1];
                if ($argname =~ s/:vararg$//) {
                    $replacements{$argname} .= ", $args[$i]";
                } else {
                    die "Too many arguments to macro $macro";
                }
            } else {
                $argname =~ s/:vararg$//;
                $replacements{$argname} = $args[$i];
            }
        }

        # apply replacements as regex
        foreach (@{$macro_lines{$macro}}) {
            my $macro_line = $_;
            # do replacements longest-first; this avoids a wrong replacement
            # when one argument name is a subset of another
            foreach (reverse sort {length $a <=> length $b} keys %replacements) {
                $macro_line =~ s/\\$_/$replacements{$_}/g;
            }
            $macro_line =~ s/\\\(\)//g;     # remove \()
            expand_macros($macro_line);
        }
    } else {
        push(@pass1_lines, $line);
    }
}

close(ASMFILE) or exit 1;
open(ASMFILE, "|-", @gcc_cmd) or die "Error running assembler";

my @sections;
my $num_repts;
my $rept_lines;

my %literal_labels;     # for ldr <reg>, =<expr>
my $literal_num = 0;

# pass 2: parse .rept and .if variants
# NOTE: since we don't implement a proper parser, using .rept with a
# variable assigned from .set is not supported
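#
# For example, a hypothetical
#     .rept 2
#     add r0, r0, #1
#     .endr
# is emitted as two copies of the add, and ".ifnb x" is rewritten to
# ".if 1" (or ".if 0" when the argument is blank) so the assembler can
# evaluate it without understanding the textual variant.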
foreach my $line (@pass1_lines) {
    # textual comparison .if
    # this assumes nothing else on the same line
    if ($line =~ /\.ifnb\s+(.*)/) {
        if ($1) {
            $line = ".if 1\n";
        } else {
            $line = ".if 0\n";
        }
    } elsif ($line =~ /\.ifb\s+(.*)/) {
        if ($1) {
            $line = ".if 0\n";
        } else {
            $line = ".if 1\n";
        }
    } elsif ($line =~ /\.ifc\s+(.*)\s*,\s*(.*)/) {
        if ($1 eq $2) {
            $line = ".if 1\n";
        } else {
            $line = ".if 0\n";
        }
    }

    # handle .previous (only with regard to .section not .subsection)
    if ($line =~ /\.(section|text|const_data)/) {
        push(@sections, $line);
    } elsif ($line =~ /\.previous/) {
        if (!$sections[-2]) {
            die ".previous without a previous section";
        }
        $line = $sections[-2];
        push(@sections, $line);
    }

    # handle ldr <reg>, =<expr>
    if ($line =~ /(.*)\s*ldr([\w\s\d]+)\s*,\s*=(.*)/) {
        my $label = $literal_labels{$3};
        if (!$label) {
            $label = ".Literal_$literal_num";
            $literal_num++;
            $literal_labels{$3} = $label;
        }
        $line = "$1 ldr$2, $label\n";
    } elsif ($line =~ /\.ltorg/) {
        foreach my $literal (keys %literal_labels) {
            $line .= "$literal_labels{$literal}:\n .word $literal\n";
        }
        %literal_labels = ();
    }

    # @l -> lo16()  @ha -> ha16()
    $line =~ s/,\s+([^,]+)\@l(\s)/, lo16($1)$2/g;
    $line =~ s/,\s+([^,]+)\@ha(\s)/, ha16($1)$2/g;

    if ($line =~ /\.rept\s+(.*)/) {
        $num_repts = $1;
        $rept_lines = "\n";

        # handle the possibility of repeating another directive on the same line;
        # .endr on the same line is not valid, and I don't know if a non-directive is
        if ($num_repts =~ s/(\.\w+.*)//) {
            $rept_lines .= "$1\n";
        }
        $num_repts = eval($num_repts);
    } elsif ($line =~ /\.endr/) {
        for (1 .. $num_repts) {
            print ASMFILE $rept_lines;
        }
        $rept_lines = '';
    } elsif ($rept_lines) {
        $rept_lines .= $line;
    } else {
        print ASMFILE $line;
    }
}

print ASMFILE ".text\n";
foreach my $literal (keys %literal_labels) {
    print ASMFILE "$literal_labels{$literal}:\n .word $literal\n";
}

close(ASMFILE) or exit 1;
x264-snapshot-20120103-2245-stable/extras/avisynth_c.h0000644000175000001440000010110411700673342021514 0ustar  videolanusers// Avisynth C Interface Version 0.20
// Copyright 2003 Kevin Atkinson

// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit
// http://www.gnu.org/copyleft/gpl.html .
//
// As a special exception, I give you permission to link to the
// Avisynth C interface with independent modules that communicate with
// the Avisynth C interface solely through the interfaces defined in
// avisynth_c.h, regardless of the license terms of these independent
// modules, and to copy and distribute the resulting combined work
// under terms of your choice, provided that every copy of the
// combined work is accompanied by a complete copy of the source code
// of the Avisynth C interface and Avisynth itself (with the version
// used to produce the combined work), being distributed under the
// terms of the GNU General Public License plus this exception.  An
// independent module is a module which is not derived from or based
// on Avisynth C Interface, such as 3rd-party filters, import and
// export plugins, or graphical user interfaces.

// NOTE: this is a partial update of the Avisynth C interface to recognize
// new color spaces added in Avisynth 2.60. By no means is this document
// completely Avisynth 2.60 compliant.

#ifndef __AVISYNTH_C__
#define __AVISYNTH_C__

#ifdef __cplusplus
#  define EXTERN_C extern "C"
#else
#  define EXTERN_C
#endif

#define AVSC_USE_STDCALL 1

#ifndef AVSC_USE_STDCALL
#  define AVSC_CC __cdecl
#else
#  define AVSC_CC __stdcall
#endif

#define AVSC_INLINE static __inline

#ifdef AVISYNTH_C_EXPORTS
#  define AVSC_EXPORT EXTERN_C
#  define AVSC_API(ret, name) EXTERN_C __declspec(dllexport) ret AVSC_CC name
#else
#  define AVSC_EXPORT EXTERN_C __declspec(dllexport)
#  ifndef AVSC_NO_DECLSPEC
#    define AVSC_API(ret, name) EXTERN_C __declspec(dllimport) ret AVSC_CC name
#  else
#    define AVSC_API(ret, name) typedef ret (AVSC_CC *name##_func)
#  endif
#endif

typedef unsigned char BYTE;
#ifdef __GNUC__
typedef long long int INT64;
#else
typedef __int64 INT64;
#endif


/////////////////////////////////////////////////////////////////////
//
// Constants
//

#ifndef __AVISYNTH_H__
enum { AVISYNTH_INTERFACE_VERSION = 4 };
#endif

enum {AVS_SAMPLE_INT8  = 1<<0,
      AVS_SAMPLE_INT16 = 1<<1, 
      AVS_SAMPLE_INT24 = 1<<2,
      AVS_SAMPLE_INT32 = 1<<3,
      AVS_SAMPLE_FLOAT = 1<<4};

enum {AVS_PLANAR_Y=1<<0,
      AVS_PLANAR_U=1<<1,
      AVS_PLANAR_V=1<<2,
      AVS_PLANAR_ALIGNED=1<<3,
      AVS_PLANAR_Y_ALIGNED=AVS_PLANAR_Y|AVS_PLANAR_ALIGNED,
      AVS_PLANAR_U_ALIGNED=AVS_PLANAR_U|AVS_PLANAR_ALIGNED,
      AVS_PLANAR_V_ALIGNED=AVS_PLANAR_V|AVS_PLANAR_ALIGNED,
      AVS_PLANAR_A=1<<4,
      AVS_PLANAR_R=1<<5,
      AVS_PLANAR_G=1<<6,
      AVS_PLANAR_B=1<<7,
      AVS_PLANAR_A_ALIGNED=AVS_PLANAR_A|AVS_PLANAR_ALIGNED,
      AVS_PLANAR_R_ALIGNED=AVS_PLANAR_R|AVS_PLANAR_ALIGNED,
      AVS_PLANAR_G_ALIGNED=AVS_PLANAR_G|AVS_PLANAR_ALIGNED,
      AVS_PLANAR_B_ALIGNED=AVS_PLANAR_B|AVS_PLANAR_ALIGNED};

  // Colorspace properties.
enum {AVS_CS_BGR = 1<<28,
      AVS_CS_YUV = 1<<29,
      AVS_CS_INTERLEAVED = 1<<30,
      AVS_CS_PLANAR = 1<<31,
    
      AVS_CS_SHIFT_SUB_WIDTH   = 0,
      AVS_CS_SHIFT_SUB_HEIGHT  = 1 << 3,
      AVS_CS_SHIFT_SAMPLE_BITS = 1 << 4,

      AVS_CS_SUB_WIDTH_MASK    = 7 << AVS_CS_SHIFT_SUB_WIDTH,
      AVS_CS_SUB_WIDTH_1       = 3 << AVS_CS_SHIFT_SUB_WIDTH, // YV24
      AVS_CS_SUB_WIDTH_2       = 0 << AVS_CS_SHIFT_SUB_WIDTH, // YV12, I420, YV16
      AVS_CS_SUB_WIDTH_4       = 1 << AVS_CS_SHIFT_SUB_WIDTH, // YUV9, YV411

      AVS_CS_VPLANEFIRST       = 1 << 3, // YV12, YV16, YV24, YV411, YUV9
      AVS_CS_UPLANEFIRST       = 1 << 4, // I420

      AVS_CS_SUB_HEIGHT_MASK   = 7 << AVS_CS_SHIFT_SUB_HEIGHT,
      AVS_CS_SUB_HEIGHT_1      = 3 << AVS_CS_SHIFT_SUB_HEIGHT, // YV16, YV24, YV411
      AVS_CS_SUB_HEIGHT_2      = 0 << AVS_CS_SHIFT_SUB_HEIGHT, // YV12, I420
      AVS_CS_SUB_HEIGHT_4      = 1 << AVS_CS_SHIFT_SUB_HEIGHT, // YUV9

      AVS_CS_SAMPLE_BITS_MASK  = 7 << AVS_CS_SHIFT_SAMPLE_BITS,
      AVS_CS_SAMPLE_BITS_8     = 0 << AVS_CS_SHIFT_SAMPLE_BITS,
      AVS_CS_SAMPLE_BITS_16    = 1 << AVS_CS_SHIFT_SAMPLE_BITS,
      AVS_CS_SAMPLE_BITS_32    = 2 << AVS_CS_SHIFT_SAMPLE_BITS,

      AVS_CS_PLANAR_MASK       = AVS_CS_PLANAR | AVS_CS_INTERLEAVED | AVS_CS_YUV | AVS_CS_BGR | AVS_CS_SAMPLE_BITS_MASK | AVS_CS_SUB_HEIGHT_MASK | AVS_CS_SUB_WIDTH_MASK,
      AVS_CS_PLANAR_FILTER     = ~( AVS_CS_VPLANEFIRST | AVS_CS_UPLANEFIRST )};

  // Specific colorformats
enum {
  AVS_CS_UNKNOWN = 0,
  AVS_CS_BGR24 = 1<<0 | AVS_CS_BGR | AVS_CS_INTERLEAVED,
  AVS_CS_BGR32 = 1<<1 | AVS_CS_BGR | AVS_CS_INTERLEAVED,
  AVS_CS_YUY2 = 1<<2 | AVS_CS_YUV | AVS_CS_INTERLEAVED,
  //  AVS_CS_YV12  = 1<<3  Reserved
  //  AVS_CS_I420  = 1<<4  Reserved
  AVS_CS_RAW32 = 1<<5 | AVS_CS_INTERLEAVED,

  AVS_CS_YV24  = AVS_CS_PLANAR | AVS_CS_YUV | AVS_CS_SAMPLE_BITS_8 | AVS_CS_VPLANEFIRST | AVS_CS_SUB_HEIGHT_1 | AVS_CS_SUB_WIDTH_1,  // YVU 4:4:4 planar
  AVS_CS_YV16  = AVS_CS_PLANAR | AVS_CS_YUV | AVS_CS_SAMPLE_BITS_8 | AVS_CS_VPLANEFIRST | AVS_CS_SUB_HEIGHT_1 | AVS_CS_SUB_WIDTH_2,  // YVU 4:2:2 planar
  AVS_CS_YV12  = AVS_CS_PLANAR | AVS_CS_YUV | AVS_CS_SAMPLE_BITS_8 | AVS_CS_VPLANEFIRST | AVS_CS_SUB_HEIGHT_2 | AVS_CS_SUB_WIDTH_2,  // YVU 4:2:0 planar
  AVS_CS_I420  = AVS_CS_PLANAR | AVS_CS_YUV | AVS_CS_SAMPLE_BITS_8 | AVS_CS_UPLANEFIRST | AVS_CS_SUB_HEIGHT_2 | AVS_CS_SUB_WIDTH_2,  // YUV 4:2:0 planar
  AVS_CS_IYUV  = AVS_CS_I420,
  AVS_CS_YV411 = AVS_CS_PLANAR | AVS_CS_YUV | AVS_CS_SAMPLE_BITS_8 | AVS_CS_VPLANEFIRST | AVS_CS_SUB_HEIGHT_1 | AVS_CS_SUB_WIDTH_4,  // YVU 4:1:1 planar
  AVS_CS_YUV9  = AVS_CS_PLANAR | AVS_CS_YUV | AVS_CS_SAMPLE_BITS_8 | AVS_CS_VPLANEFIRST | AVS_CS_SUB_HEIGHT_4 | AVS_CS_SUB_WIDTH_4,  // YVU 4:1:0 planar
  AVS_CS_Y8    = AVS_CS_PLANAR | AVS_CS_INTERLEAVED | AVS_CS_YUV | AVS_CS_SAMPLE_BITS_8                                              // Y   4:0:0 planar
};
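
// Worked example: AVS_CS_YV12 combines AVS_CS_PLANAR | AVS_CS_YUV |
// 8-bit samples | AVS_CS_VPLANEFIRST | 2x vertical and 2x horizontal
// chroma subsampling.  Note that AVS_CS_PLANAR_MASK deliberately omits
// the plane-order bits (AVS_CS_VPLANEFIRST/AVS_CS_UPLANEFIRST), so
// avs_is_yv12() below matches both YV12 and I420, which differ only in
// plane order.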

enum {
  AVS_IT_BFF = 1<<0,
  AVS_IT_TFF = 1<<1,
  AVS_IT_FIELDBASED = 1<<2};

enum {
  AVS_FILTER_TYPE=1,
  AVS_FILTER_INPUT_COLORSPACE=2,
  AVS_FILTER_OUTPUT_TYPE=9,
  AVS_FILTER_NAME=4,
  AVS_FILTER_AUTHOR=5,
  AVS_FILTER_VERSION=6,
  AVS_FILTER_ARGS=7,
  AVS_FILTER_ARGS_INFO=8,
  AVS_FILTER_ARGS_DESCRIPTION=10,
  AVS_FILTER_DESCRIPTION=11};

enum {  //SUBTYPES
  AVS_FILTER_TYPE_AUDIO=1,
  AVS_FILTER_TYPE_VIDEO=2,
  AVS_FILTER_OUTPUT_TYPE_SAME=3,
  AVS_FILTER_OUTPUT_TYPE_DIFFERENT=4};

enum {
  AVS_CACHE_NOTHING=0,
  AVS_CACHE_RANGE=1,
  AVS_CACHE_ALL=2,
  AVS_CACHE_AUDIO=3,
  AVS_CACHE_AUDIO_NONE=4,
  AVS_CACHE_AUDIO_AUTO=5
  };

#define AVS_FRAME_ALIGN 16 

typedef struct AVS_Clip AVS_Clip;
typedef struct AVS_ScriptEnvironment AVS_ScriptEnvironment;

/////////////////////////////////////////////////////////////////////
//
// AVS_VideoInfo
//

// AVS_VideoInfo is laid out identically to VideoInfo
typedef struct AVS_VideoInfo {
  int width, height;    // width=0 means no video
  unsigned fps_numerator, fps_denominator;
  int num_frames;

  int pixel_type;
  
  int audio_samples_per_second;   // 0 means no audio
  int sample_type;
  INT64 num_audio_samples;
  int nchannels;

  // Imagetype properties

  int image_type;
} AVS_VideoInfo;

// useful functions of the above
AVSC_INLINE int avs_has_video(const AVS_VideoInfo * p) 
        { return (p->width!=0); }

AVSC_INLINE int avs_has_audio(const AVS_VideoInfo * p) 
        { return (p->audio_samples_per_second!=0); }

AVSC_INLINE int avs_is_rgb(const AVS_VideoInfo * p) 
        { return !!(p->pixel_type&AVS_CS_BGR); }

AVSC_INLINE int avs_is_rgb24(const AVS_VideoInfo * p) 
        { return (p->pixel_type&AVS_CS_BGR24)==AVS_CS_BGR24; } // Clear out additional properties

AVSC_INLINE int avs_is_rgb32(const AVS_VideoInfo * p) 
        { return (p->pixel_type & AVS_CS_BGR32) == AVS_CS_BGR32 ; }

AVSC_INLINE int avs_is_yuv(const AVS_VideoInfo * p) 
        { return !!(p->pixel_type&AVS_CS_YUV ); }

AVSC_INLINE int avs_is_yuy2(const AVS_VideoInfo * p) 
        { return (p->pixel_type & AVS_CS_YUY2) == AVS_CS_YUY2; }  

AVSC_INLINE int avs_is_yv24(const AVS_VideoInfo * p)
        { return (p->pixel_type & AVS_CS_PLANAR_MASK) == (AVS_CS_YV24  & AVS_CS_PLANAR_FILTER); }

AVSC_INLINE int avs_is_yv16(const AVS_VideoInfo * p)
        { return (p->pixel_type & AVS_CS_PLANAR_MASK) == (AVS_CS_YV16  & AVS_CS_PLANAR_FILTER); }

AVSC_INLINE int avs_is_yv12(const AVS_VideoInfo * p) 
        { return (p->pixel_type & AVS_CS_PLANAR_MASK) == (AVS_CS_YV12  & AVS_CS_PLANAR_FILTER); }

AVSC_INLINE int avs_is_yv411(const AVS_VideoInfo * p)
        { return (p->pixel_type & AVS_CS_PLANAR_MASK) == (AVS_CS_YV411 & AVS_CS_PLANAR_FILTER); }
	
AVSC_INLINE int avs_is_y8(const AVS_VideoInfo * p)
        { return (p->pixel_type & AVS_CS_PLANAR_MASK) == (AVS_CS_Y8    & AVS_CS_PLANAR_FILTER); }

AVSC_INLINE int avs_is_property(const AVS_VideoInfo * p, int property) 
        { return ((p->pixel_type & property)==property ); }

AVSC_INLINE int avs_is_planar(const AVS_VideoInfo * p) 
        { return !!(p->pixel_type & AVS_CS_PLANAR); }

AVSC_INLINE int avs_is_color_space(const AVS_VideoInfo * p, int c_space)
        { return avs_is_planar(p) ? ((p->pixel_type & AVS_CS_PLANAR_MASK) == (c_space & AVS_CS_PLANAR_FILTER)) : ((p->pixel_type & c_space) == c_space); }
        
AVSC_INLINE int avs_is_field_based(const AVS_VideoInfo * p) 
        { return !!(p->image_type & AVS_IT_FIELDBASED); }

AVSC_INLINE int avs_is_parity_known(const AVS_VideoInfo * p) 
        { return ((p->image_type & AVS_IT_FIELDBASED)&&(p->image_type & (AVS_IT_BFF | AVS_IT_TFF))); }

AVSC_INLINE int avs_is_bff(const AVS_VideoInfo * p) 
        { return !!(p->image_type & AVS_IT_BFF); }

AVSC_INLINE int avs_is_tff(const AVS_VideoInfo * p) 
        { return !!(p->image_type & AVS_IT_TFF); }

AVSC_INLINE int avs_bits_per_pixel(const AVS_VideoInfo * p) 
{ 
  switch (p->pixel_type) {
      case AVS_CS_BGR24: return 24;
      case AVS_CS_BGR32: return 32;
      case AVS_CS_YUY2:  return 16;
      case AVS_CS_YV12:
      case AVS_CS_I420:  return 12;
      default:           return 0;
    }
}
AVSC_INLINE int avs_bytes_from_pixels(const AVS_VideoInfo * p, int pixels) 
        { return pixels * (avs_bits_per_pixel(p)>>3); }   // Will work on planar images, but returns the size of the luma plane only

AVSC_INLINE int avs_row_size(const AVS_VideoInfo * p) 
        { return avs_bytes_from_pixels(p,p->width); }  // Also only returns first plane on planar images

AVSC_INLINE int avs_bmp_size(const AVS_VideoInfo * vi)                
        { if (avs_is_planar(vi)) {int p = vi->height * ((avs_row_size(vi)+3) & ~3); p+=p>>1; return p;  } return vi->height * ((avs_row_size(vi)+3) & ~3); }

AVSC_INLINE int avs_samples_per_second(const AVS_VideoInfo * p) 
        { return p->audio_samples_per_second; }


AVSC_INLINE int avs_bytes_per_channel_sample(const AVS_VideoInfo * p) 
{
    switch (p->sample_type) {
      case AVS_SAMPLE_INT8:  return sizeof(signed char);
      case AVS_SAMPLE_INT16: return sizeof(signed short);
      case AVS_SAMPLE_INT24: return 3;
      case AVS_SAMPLE_INT32: return sizeof(signed int);
      case AVS_SAMPLE_FLOAT: return sizeof(float);
      default: return 0;
    }
}
AVSC_INLINE int avs_bytes_per_audio_sample(const AVS_VideoInfo * p)   
        { return p->nchannels*avs_bytes_per_channel_sample(p);}

AVSC_INLINE INT64 avs_audio_samples_from_frames(const AVS_VideoInfo * p, INT64 frames) 
        { return ((INT64)(frames) * p->audio_samples_per_second * p->fps_denominator / p->fps_numerator); }

AVSC_INLINE int avs_frames_from_audio_samples(const AVS_VideoInfo * p, INT64 samples) 
        { return (int)(samples * (INT64)p->fps_numerator / (INT64)p->fps_denominator / (INT64)p->audio_samples_per_second); }

AVSC_INLINE INT64 avs_audio_samples_from_bytes(const AVS_VideoInfo * p, INT64 bytes) 
        { return bytes / avs_bytes_per_audio_sample(p); }

AVSC_INLINE INT64 avs_bytes_from_audio_samples(const AVS_VideoInfo * p, INT64 samples) 
        { return samples * avs_bytes_per_audio_sample(p); }

AVSC_INLINE int avs_audio_channels(const AVS_VideoInfo * p) 
        { return p->nchannels; }

AVSC_INLINE int avs_sample_type(const AVS_VideoInfo * p)
        { return p->sample_type;}

// useful mutator
AVSC_INLINE void avs_set_property(AVS_VideoInfo * p, int property)  
        { p->image_type|=property; }

AVSC_INLINE void avs_clear_property(AVS_VideoInfo * p, int property)  
        { p->image_type&=~property; }

AVSC_INLINE void avs_set_field_based(AVS_VideoInfo * p, int isfieldbased)  
        { if (isfieldbased) p->image_type|=AVS_IT_FIELDBASED; else p->image_type&=~AVS_IT_FIELDBASED; }

AVSC_INLINE void avs_set_fps(AVS_VideoInfo * p, unsigned numerator, unsigned denominator) 
{
    unsigned x=numerator, y=denominator;
    while (y) {   // find gcd
      unsigned t = x%y; x = y; y = t;
    }
    p->fps_numerator = numerator/x;
    p->fps_denominator = denominator/x;
}
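
// Worked example: avs_set_fps(p, 50, 2) reduces to 25/1, while
// avs_set_fps(p, 30000, 1001) is already in lowest terms and is stored
// unchanged (NTSC 29.97 fps).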

AVSC_INLINE int avs_is_same_colorspace(AVS_VideoInfo * x, AVS_VideoInfo * y)
{
        return (x->pixel_type == y->pixel_type)
                || (avs_is_yv12(x) && avs_is_yv12(y));
}

/////////////////////////////////////////////////////////////////////
//
// AVS_VideoFrame
//

// VideoFrameBuffer holds information about a memory block which is used
// for video data.  For efficiency, instances of this class are not deleted
// when the refcount reaches zero; instead they're stored in a linked list
// to be reused.  The instances are deleted when the corresponding AVS
// file is closed.

// AVS_VideoFrameBuffer is laid out identically to VideoFrameBuffer
// DO NOT USE THIS STRUCTURE DIRECTLY
typedef struct AVS_VideoFrameBuffer {
  BYTE * data;
  int data_size;
  // sequence_number is incremented every time the buffer is changed, so
  // that stale views can tell they're no longer valid.
  volatile long sequence_number;

  volatile long refcount;
} AVS_VideoFrameBuffer;

// VideoFrame holds a "window" into a VideoFrameBuffer.

// AVS_VideoFrame is laid out identically to IVideoFrame
// DO NOT USE THIS STRUCTURE DIRECTLY
typedef struct AVS_VideoFrame {
  volatile long refcount;
  AVS_VideoFrameBuffer * vfb;
  int offset, pitch, row_size, height, offsetU, offsetV, pitchUV;  // U&V offsets are from top of picture.
  int row_sizeUV, heightUV;
} AVS_VideoFrame;

// Access functions for AVS_VideoFrame
AVSC_INLINE int avs_get_pitch(const AVS_VideoFrame * p) {
        return p->pitch;}

AVSC_INLINE int avs_get_pitch_p(const AVS_VideoFrame * p, int plane) { 
  switch (plane) {
  case AVS_PLANAR_U: case AVS_PLANAR_V: return p->pitchUV;}
  return p->pitch;}

AVSC_INLINE int avs_get_row_size(const AVS_VideoFrame * p) {
        return p->row_size; }

AVSC_INLINE int avs_get_row_size_p(const AVS_VideoFrame * p, int plane) { 
        int r;
    switch (plane) {
    case AVS_PLANAR_U: case AVS_PLANAR_V: 
                if (p->pitchUV) return p->row_sizeUV; 
                else            return 0;
    case AVS_PLANAR_U_ALIGNED: case AVS_PLANAR_V_ALIGNED: 
                if (p->pitchUV) { 
                        r = (p->row_sizeUV+AVS_FRAME_ALIGN-1)&(~(AVS_FRAME_ALIGN-1)); // Aligned rowsize
                        if (r < p->pitchUV) 
                                return r; 
                        return p->row_sizeUV;
                } else return 0;
    case AVS_PLANAR_Y_ALIGNED:
                r = (p->row_size+AVS_FRAME_ALIGN-1)&(~(AVS_FRAME_ALIGN-1)); // Aligned rowsize
                if (r <= p->pitch) 
                        return r; 
                return p->row_size;
    }
    return p->row_size;
}

AVSC_INLINE int avs_get_height(const AVS_VideoFrame * p) {
        return p->height;}

AVSC_INLINE int avs_get_height_p(const AVS_VideoFrame * p, int plane) {
        switch (plane) {
                case AVS_PLANAR_U: case AVS_PLANAR_V: 
                        if (p->pitchUV) return p->heightUV;
                        return 0;
        }
        return p->height;}

AVSC_INLINE const BYTE* avs_get_read_ptr(const AVS_VideoFrame * p) {
        return p->vfb->data + p->offset;}

AVSC_INLINE const BYTE* avs_get_read_ptr_p(const AVS_VideoFrame * p, int plane) 
{
        switch (plane) {
                case AVS_PLANAR_U: return p->vfb->data + p->offsetU;
                case AVS_PLANAR_V: return p->vfb->data + p->offsetV;
                default:           return p->vfb->data + p->offset;}
}

AVSC_INLINE int avs_is_writable(const AVS_VideoFrame * p) {
        return (p->refcount == 1 && p->vfb->refcount == 1);}

AVSC_INLINE BYTE* avs_get_write_ptr(const AVS_VideoFrame * p) 
{
        if (avs_is_writable(p)) {
                ++p->vfb->sequence_number;
                return p->vfb->data + p->offset;
        } else
                return 0;
}

AVSC_INLINE BYTE* avs_get_write_ptr_p(const AVS_VideoFrame * p, int plane) 
{
        if (plane==AVS_PLANAR_Y && avs_is_writable(p)) {
                ++p->vfb->sequence_number;
                return p->vfb->data + p->offset;
        } else if (plane==AVS_PLANAR_Y) {
                return 0;
        } else {
                switch (plane) {
                        case AVS_PLANAR_U: return p->vfb->data + p->offsetU;
                        case AVS_PLANAR_V: return p->vfb->data + p->offsetV;
                        default:       return p->vfb->data + p->offset;
                }
        }
}


AVSC_API(void, avs_release_video_frame)(AVS_VideoFrame *);
// makes a shallow copy of a video frame
AVSC_API(AVS_VideoFrame *, avs_copy_video_frame)(AVS_VideoFrame *);

#ifndef AVSC_NO_DECLSPEC
AVSC_INLINE void avs_release_frame(AVS_VideoFrame * f)
  {avs_release_video_frame(f);}
AVSC_INLINE AVS_VideoFrame * avs_copy_frame(AVS_VideoFrame * f)
  {return avs_copy_video_frame(f);}
#endif
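
// Usage sketch (illustrative; `clip' is assumed valid and `consume' is a
// placeholder for the caller's own per-row processing):
//
//   AVS_VideoFrame *f = avs_get_frame(clip, 0);   // declared further below
//   const BYTE *y = avs_get_read_ptr_p(f, AVS_PLANAR_Y);
//   int pitch     = avs_get_pitch_p(f, AVS_PLANAR_Y);
//   int row_size  = avs_get_row_size_p(f, AVS_PLANAR_Y);
//   int height    = avs_get_height_p(f, AVS_PLANAR_Y);
//   for (int row = 0; row < height; row++)
//       consume(y + row * pitch, row_size);
//   avs_release_video_frame(f);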

/////////////////////////////////////////////////////////////////////
//
// AVS_Value
//

// Treat AVS_Value as a fat pointer.  That is, use avs_copy_value
// and avs_release_value appropriately, as you would if AVS_Value
// were a pointer.

// To maintain source code compatibility with future versions of the
// avisynth_c API, don't use AVS_Value directly.  Use the helper
// functions below.

// AVS_Value is laid out identically to AVSValue
typedef struct AVS_Value AVS_Value;
struct AVS_Value {
  short type;  // 'a'rray, 'c'lip, 'b'ool, 'i'nt, 'f'loat, 's'tring, 'v'oid, or 'l'ong;
               // some functions use 'e'rror
  short array_size;
  union {
    void * clip; // do not use directly, use avs_take_clip
    char boolean;
    int integer;
    float floating_pt;
    const char * string;
    const AVS_Value * array;
  } d;
};

// AVS_Value should be initialized with avs_void.  It should also be
// set back to avs_void after the value is released with
// avs_release_value.  Consider it the equivalent of setting a
// pointer to NULL.
static const AVS_Value avs_void = {'v'};

AVSC_API(void, avs_copy_value)(AVS_Value * dest, AVS_Value src);
AVSC_API(void, avs_release_value)(AVS_Value);

AVSC_INLINE int avs_defined(AVS_Value v) { return v.type != 'v'; }
AVSC_INLINE int avs_is_clip(AVS_Value v) { return v.type == 'c'; }
AVSC_INLINE int avs_is_bool(AVS_Value v) { return v.type == 'b'; }
AVSC_INLINE int avs_is_int(AVS_Value v) { return v.type == 'i'; }
AVSC_INLINE int avs_is_float(AVS_Value v) { return v.type == 'f' || v.type == 'i'; }
AVSC_INLINE int avs_is_string(AVS_Value v) { return v.type == 's'; }
AVSC_INLINE int avs_is_array(AVS_Value v) { return v.type == 'a'; }
AVSC_INLINE int avs_is_error(AVS_Value v) { return v.type == 'e'; }

AVSC_API(AVS_Clip *, avs_take_clip)(AVS_Value, AVS_ScriptEnvironment *);
AVSC_API(void, avs_set_to_clip)(AVS_Value *, AVS_Clip *);

AVSC_INLINE int avs_as_bool(AVS_Value v) 
        { return v.d.boolean; }   
AVSC_INLINE int avs_as_int(AVS_Value v) 
        { return v.d.integer; }   
AVSC_INLINE const char * avs_as_string(AVS_Value v) 
        { return avs_is_error(v) || avs_is_string(v) ? v.d.string : 0; }
AVSC_INLINE double avs_as_float(AVS_Value v) 
        { return avs_is_int(v) ? v.d.integer : v.d.floating_pt; }
AVSC_INLINE const char * avs_as_error(AVS_Value v) 
        { return avs_is_error(v) ? v.d.string : 0; }
AVSC_INLINE const AVS_Value * avs_as_array(AVS_Value v)
        { return v.d.array; }
AVSC_INLINE int avs_array_size(AVS_Value v) 
        { return avs_is_array(v) ? v.array_size : 1; }
AVSC_INLINE AVS_Value avs_array_elt(AVS_Value v, int index) 
        { return avs_is_array(v) ? v.d.array[index] : v; }

// only use these functions on an AVS_Value that does not already have
// an active value.  Remember, treat AVS_Value as a fat pointer.
AVSC_INLINE AVS_Value avs_new_value_bool(int v0) 
        { AVS_Value v; v.type = 'b'; v.d.boolean = v0 == 0 ? 0 : 1; return v; }   
AVSC_INLINE AVS_Value avs_new_value_int(int v0) 
        { AVS_Value v; v.type = 'i'; v.d.integer = v0; return v; }   
AVSC_INLINE AVS_Value avs_new_value_string(const char * v0) 
        { AVS_Value v; v.type = 's'; v.d.string = v0; return v; }
AVSC_INLINE AVS_Value avs_new_value_float(float v0) 
        { AVS_Value v; v.type = 'f'; v.d.floating_pt = v0; return v;}
AVSC_INLINE AVS_Value avs_new_value_error(const char * v0) 
        { AVS_Value v; v.type = 'e'; v.d.string = v0; return v; }
#ifndef AVSC_NO_DECLSPEC
AVSC_INLINE AVS_Value avs_new_value_clip(AVS_Clip * v0)
        { AVS_Value v; avs_set_to_clip(&v, v0); return v; }
#endif
AVSC_INLINE AVS_Value avs_new_value_array(AVS_Value * v0, int size)
        { AVS_Value v; v.type = 'a'; v.d.array = v0; v.array_size = size; return v; }

/////////////////////////////////////////////////////////////////////
//
// AVS_Clip
//

AVSC_API(void, avs_release_clip)(AVS_Clip *);
AVSC_API(AVS_Clip *, avs_copy_clip)(AVS_Clip *);

AVSC_API(const char *, avs_clip_get_error)(AVS_Clip *); // return 0 if no error

AVSC_API(const AVS_VideoInfo *, avs_get_video_info)(AVS_Clip *);

AVSC_API(int, avs_get_version)(AVS_Clip *);
 
AVSC_API(AVS_VideoFrame *, avs_get_frame)(AVS_Clip *, int n);
// The returned video frame must be released with avs_release_video_frame

AVSC_API(int, avs_get_parity)(AVS_Clip *, int n); 
// return field parity if field_based, else parity of first field in frame

AVSC_API(int, avs_get_audio)(AVS_Clip *, void * buf, 
                             INT64 start, INT64 count); 
// start and count are in samples

AVSC_API(int, avs_set_cache_hints)(AVS_Clip *, 
                                   int cachehints, int frame_range);

// This is the callback type used by avs_add_function
typedef AVS_Value (AVSC_CC * AVS_ApplyFunc)
                        (AVS_ScriptEnvironment *, AVS_Value args, void * user_data);

typedef struct AVS_FilterInfo AVS_FilterInfo;
struct AVS_FilterInfo
{
  // these members should not be modified outside of the AVS_ApplyFunc callback
  AVS_Clip * child;
  AVS_VideoInfo vi;
  AVS_ScriptEnvironment * env;
  AVS_VideoFrame * (AVSC_CC * get_frame)(AVS_FilterInfo *, int n);
  int (AVSC_CC * get_parity)(AVS_FilterInfo *, int n);
  int (AVSC_CC * get_audio)(AVS_FilterInfo *, void * buf, 
                                  INT64 start, INT64 count);
  int (AVSC_CC * set_cache_hints)(AVS_FilterInfo *, int cachehints, 
                                        int frame_range);
  void (AVSC_CC * free_filter)(AVS_FilterInfo *);
  
  // Should be set whenever there is an error to report.
  // It is cleared before any of the above methods are called.
  const char * error;
  // this is to store whatever and may be modified at will
  void * user_data;
};

// Create a new filter.
// fi is set to point to the AVS_FilterInfo so that you can
//   modify it once it is initialized.
// store_child should generally be set to true.  If it is not
//   set, then ALL methods (the function pointers) must be defined.
// If it is set, then you do not need to worry about freeing the
//   child clip.
AVSC_API(AVS_Clip *, avs_new_c_filter)(AVS_ScriptEnvironment * e,
                                       AVS_FilterInfo * * fi,
                                       AVS_Value child, int store_child);
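
/* Minimal filter sketch (illustrative; the names my_get_frame and my_apply
 * are hypothetical).  With store_child set, unassigned callbacks pass
 * through to the child clip:
 *
 *   static AVS_VideoFrame * AVSC_CC my_get_frame( AVS_FilterInfo *fi, int n )
 *   {
 *       AVS_VideoFrame *frame = avs_get_frame( fi->child, n );
 *       // ... process the frame here (after making it writable) ...
 *       return frame;
 *   }
 *
 *   static AVS_Value AVSC_CC my_apply( AVS_ScriptEnvironment *env,
 *                                      AVS_Value args, void *user_data )
 *   {
 *       AVS_FilterInfo *fi;
 *       AVS_Clip *clip = avs_new_c_filter( env, &fi, avs_array_elt( args, 0 ), 1 );
 *       fi->get_frame = my_get_frame;
 *       AVS_Value ret = avs_new_value_clip( clip );
 *       avs_release_clip( clip );
 *       return ret;
 *   }
 */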

/////////////////////////////////////////////////////////////////////
//
// AVS_ScriptEnvironment
//

// For GetCPUFlags.  These are backwards-compatible with those in VirtualDub.
enum {                    
                                /* slowest CPU to support extension */
  AVS_CPU_FORCE        = 0x01,   // N/A
  AVS_CPU_FPU          = 0x02,   // 386/486DX
  AVS_CPU_MMX          = 0x04,   // P55C, K6, PII
  AVS_CPU_INTEGER_SSE  = 0x08,   // PIII, Athlon
  AVS_CPU_SSE          = 0x10,   // PIII, Athlon XP/MP
  AVS_CPU_SSE2         = 0x20,   // PIV, Hammer
  AVS_CPU_3DNOW        = 0x40,   // K6-2
  AVS_CPU_3DNOW_EXT    = 0x80,   // Athlon
  AVS_CPU_X86_64       = 0xA0,   // Hammer (note: equiv. to 3DNow + SSE2, 
                                 // which only Hammer will have anyway)
  AVS_CPUF_SSE3       = 0x100,   //  PIV+, K8 Venice
  AVS_CPUF_SSSE3      = 0x200,   //  Core 2
  AVS_CPUF_SSE4       = 0x400,   //  Penryn, Wolfdale, Yorkfield
  AVS_CPUF_SSE4_1     = 0x400,
  AVS_CPUF_SSE4_2     = 0x800,   //  Nehalem
};

AVSC_API(const char *, avs_get_error)(AVS_ScriptEnvironment *); // return 0 if no error

AVSC_API(long, avs_get_cpu_flags)(AVS_ScriptEnvironment *);
AVSC_API(int, avs_check_version)(AVS_ScriptEnvironment *, int version);
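
/* Sketch: gate an optimized code path on the reported flags, e.g.
 *
 *   long cpu = avs_get_cpu_flags( env );
 *   if( cpu & AVS_CPU_SSE2 )
 *       ; // take an SSE2 path here
 */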

AVSC_API(char *, avs_save_string)(AVS_ScriptEnvironment *, const char* s, int length);
AVSC_API(char *, avs_sprintf)(AVS_ScriptEnvironment *, const char * fmt, ...);

AVSC_API(char *, avs_vsprintf)(AVS_ScriptEnvironment *, const char * fmt, void* val);
 // note: val is really a va_list; I hope everyone typedefs va_list to a pointer

AVSC_API(int, avs_add_function)(AVS_ScriptEnvironment *, 
                                const char * name, const char * params, 
                                AVS_ApplyFunc apply, void * user_data);

AVSC_API(int, avs_function_exists)(AVS_ScriptEnvironment *, const char * name);

AVSC_API(AVS_Value, avs_invoke)(AVS_ScriptEnvironment *, const char * name, 
                               AVS_Value args, const char** arg_names);
// The returned value must be released with avs_release_value
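
/* Invocation sketch (illustrative; the function name and argument layout are
 * just an example, and clip_val is an assumed AVS_Value holding a clip):
 *
 *   AVS_Value args[3] = { clip_val, avs_new_value_int( 640 ),
 *                         avs_new_value_int( 480 ) };
 *   AVS_Value res = avs_invoke( env, "BilinearResize",
 *                               avs_new_value_array( args, 3 ), NULL );
 *   if( avs_is_error( res ) )
 *       ; // inspect avs_as_error( res )
 *   avs_release_value( res );
 */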

AVSC_API(AVS_Value, avs_get_var)(AVS_ScriptEnvironment *, const char* name);
// The returned value must be released with avs_release_value

AVSC_API(int, avs_set_var)(AVS_ScriptEnvironment *, const char* name, AVS_Value val);

AVSC_API(int, avs_set_global_var)(AVS_ScriptEnvironment *, const char* name, const AVS_Value val);

//void avs_push_context(AVS_ScriptEnvironment *, int level=0);
//void avs_pop_context(AVS_ScriptEnvironment *);

AVSC_API(AVS_VideoFrame *, avs_new_video_frame_a)(AVS_ScriptEnvironment *, 
                                          const AVS_VideoInfo * vi, int align);
// align should be at least 16

#ifndef AVSC_NO_DECLSPEC
AVSC_INLINE 
AVS_VideoFrame * avs_new_video_frame(AVS_ScriptEnvironment * env, 
                                     const AVS_VideoInfo * vi)
  {return avs_new_video_frame_a(env,vi,AVS_FRAME_ALIGN);}

AVSC_INLINE 
AVS_VideoFrame * avs_new_frame(AVS_ScriptEnvironment * env, 
                               const AVS_VideoInfo * vi)
  {return avs_new_video_frame_a(env,vi,AVS_FRAME_ALIGN);}
#endif


AVSC_API(int, avs_make_writable)(AVS_ScriptEnvironment *, AVS_VideoFrame * * pvf);

AVSC_API(void, avs_bit_blt)(AVS_ScriptEnvironment *, BYTE* dstp, int dst_pitch, const BYTE* srcp, int src_pitch, int row_size, int height);

typedef void (AVSC_CC *AVS_ShutdownFunc)(void* user_data, AVS_ScriptEnvironment * env);
AVSC_API(void, avs_at_exit)(AVS_ScriptEnvironment *, AVS_ShutdownFunc function, void * user_data);

AVSC_API(AVS_VideoFrame *, avs_subframe)(AVS_ScriptEnvironment *, AVS_VideoFrame * src, int rel_offset, int new_pitch, int new_row_size, int new_height);
// The returned video frame must be released

AVSC_API(int, avs_set_memory_max)(AVS_ScriptEnvironment *, int mem);

AVSC_API(int, avs_set_working_dir)(AVS_ScriptEnvironment *, const char * newdir);

// avisynth.dll exports this; it's a way to use AviSynth as a library,
// without writing an AVS script or going through AVIFile.
AVSC_API(AVS_ScriptEnvironment *, avs_create_script_environment)(int version);

// this symbol is the entry point for the plugin and must
// be defined
AVSC_EXPORT
const char * AVSC_CC avisynth_c_plugin_init(AVS_ScriptEnvironment* env);


AVSC_API(void, avs_delete_script_environment)(AVS_ScriptEnvironment *);


AVSC_API(AVS_VideoFrame *, avs_subframe_planar)(AVS_ScriptEnvironment *, AVS_VideoFrame * src, int rel_offset, int new_pitch, int new_row_size, int new_height, int rel_offsetU, int rel_offsetV, int new_pitchUV);
// The returned video frame must be released

#ifdef AVSC_NO_DECLSPEC
// use LoadLibrary and related functions to dynamically load Avisynth instead of declspec(dllimport)
/*
  The following functions need to have been declared beforehand
  (malloc/free from stdlib.h, the rest from windows.h):

  void* malloc(size_t);
  void free(void*);

  HMODULE LoadLibrary(const char*);
  void* GetProcAddress(HMODULE, const char*);
  BOOL FreeLibrary(HMODULE);
*/


typedef struct AVS_Library AVS_Library;

#define AVSC_DECLARE_FUNC(name) name##_func name

struct AVS_Library {
  HMODULE handle;

  AVSC_DECLARE_FUNC(avs_add_function);
  AVSC_DECLARE_FUNC(avs_at_exit);
  AVSC_DECLARE_FUNC(avs_bit_blt);
  AVSC_DECLARE_FUNC(avs_check_version);
  AVSC_DECLARE_FUNC(avs_clip_get_error);
  AVSC_DECLARE_FUNC(avs_copy_clip);
  AVSC_DECLARE_FUNC(avs_copy_value);
  AVSC_DECLARE_FUNC(avs_copy_video_frame);
  AVSC_DECLARE_FUNC(avs_create_script_environment);
  AVSC_DECLARE_FUNC(avs_delete_script_environment);
  AVSC_DECLARE_FUNC(avs_function_exists);
  AVSC_DECLARE_FUNC(avs_get_audio);
  AVSC_DECLARE_FUNC(avs_get_cpu_flags);
  AVSC_DECLARE_FUNC(avs_get_error);
  AVSC_DECLARE_FUNC(avs_get_frame);
  AVSC_DECLARE_FUNC(avs_get_parity);
  AVSC_DECLARE_FUNC(avs_get_var);
  AVSC_DECLARE_FUNC(avs_get_version);
  AVSC_DECLARE_FUNC(avs_get_video_info);
  AVSC_DECLARE_FUNC(avs_invoke);
  AVSC_DECLARE_FUNC(avs_make_writable);
  AVSC_DECLARE_FUNC(avs_new_c_filter);
  AVSC_DECLARE_FUNC(avs_new_video_frame_a);
  AVSC_DECLARE_FUNC(avs_release_clip);
  AVSC_DECLARE_FUNC(avs_release_value);
  AVSC_DECLARE_FUNC(avs_release_video_frame);
  AVSC_DECLARE_FUNC(avs_save_string);
  AVSC_DECLARE_FUNC(avs_set_cache_hints);
  AVSC_DECLARE_FUNC(avs_set_global_var);
  AVSC_DECLARE_FUNC(avs_set_memory_max);
  AVSC_DECLARE_FUNC(avs_set_to_clip);
  AVSC_DECLARE_FUNC(avs_set_var);
  AVSC_DECLARE_FUNC(avs_set_working_dir);
  AVSC_DECLARE_FUNC(avs_sprintf);
  AVSC_DECLARE_FUNC(avs_subframe);
  AVSC_DECLARE_FUNC(avs_subframe_planar);
  AVSC_DECLARE_FUNC(avs_take_clip);
  AVSC_DECLARE_FUNC(avs_vsprintf);
};

#undef AVSC_DECLARE_FUNC


AVSC_INLINE AVS_Library * avs_load_library() {
  AVS_Library *library = (AVS_Library *)malloc(sizeof(AVS_Library));
  if (library == NULL)
    return NULL;
  library->handle = LoadLibrary("avisynth");
  if (library->handle == NULL)
    goto fail;

#define __AVSC_STRINGIFY(x) #x
#define AVSC_STRINGIFY(x) __AVSC_STRINGIFY(x)
#define AVSC_LOAD_FUNC(name) {\
  library->name = (name##_func) GetProcAddress(library->handle, AVSC_STRINGIFY(name));\
  if (library->name == NULL)\
    goto fail;\
}

  AVSC_LOAD_FUNC(avs_add_function);
  AVSC_LOAD_FUNC(avs_at_exit);
  AVSC_LOAD_FUNC(avs_bit_blt);
  AVSC_LOAD_FUNC(avs_check_version);
  AVSC_LOAD_FUNC(avs_clip_get_error);
  AVSC_LOAD_FUNC(avs_copy_clip);
  AVSC_LOAD_FUNC(avs_copy_value);
  AVSC_LOAD_FUNC(avs_copy_video_frame);
  AVSC_LOAD_FUNC(avs_create_script_environment);
  AVSC_LOAD_FUNC(avs_delete_script_environment);
  AVSC_LOAD_FUNC(avs_function_exists);
  AVSC_LOAD_FUNC(avs_get_audio);
  AVSC_LOAD_FUNC(avs_get_cpu_flags);
  AVSC_LOAD_FUNC(avs_get_error);
  AVSC_LOAD_FUNC(avs_get_frame);
  AVSC_LOAD_FUNC(avs_get_parity);
  AVSC_LOAD_FUNC(avs_get_var);
  AVSC_LOAD_FUNC(avs_get_version);
  AVSC_LOAD_FUNC(avs_get_video_info);
  AVSC_LOAD_FUNC(avs_invoke);
  AVSC_LOAD_FUNC(avs_make_writable);
  AVSC_LOAD_FUNC(avs_new_c_filter);
  AVSC_LOAD_FUNC(avs_new_video_frame_a);
  AVSC_LOAD_FUNC(avs_release_clip);
  AVSC_LOAD_FUNC(avs_release_value);
  AVSC_LOAD_FUNC(avs_release_video_frame);
  AVSC_LOAD_FUNC(avs_save_string);
  AVSC_LOAD_FUNC(avs_set_cache_hints);
  AVSC_LOAD_FUNC(avs_set_global_var);
  AVSC_LOAD_FUNC(avs_set_memory_max);
  AVSC_LOAD_FUNC(avs_set_to_clip);
  AVSC_LOAD_FUNC(avs_set_var);
  AVSC_LOAD_FUNC(avs_set_working_dir);
  AVSC_LOAD_FUNC(avs_sprintf);
  AVSC_LOAD_FUNC(avs_subframe);
  AVSC_LOAD_FUNC(avs_subframe_planar);
  AVSC_LOAD_FUNC(avs_take_clip);
  AVSC_LOAD_FUNC(avs_vsprintf);

#undef __AVSC_STRINGIFY
#undef AVSC_STRINGIFY
#undef AVSC_LOAD_FUNC

  return library;

fail:
  free(library);
  return NULL;
}

AVSC_INLINE void avs_free_library(AVS_Library *library) {
  if (library == NULL)
    return;
  FreeLibrary(library->handle);
  free(library);
}
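
/* Usage sketch (illustrative; AVS_INTERFACE_25 is assumed to be the interface
 * version constant defined earlier in this header):
 *
 *   AVS_Library *lib = avs_load_library();
 *   if( lib )
 *   {
 *       AVS_ScriptEnvironment *env =
 *           lib->avs_create_script_environment( AVS_INTERFACE_25 );
 *       // ... lib->avs_invoke( env, ... ), etc. ...
 *       lib->avs_delete_script_environment( env );
 *       avs_free_library( lib );
 *   }
 */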
#endif

#endif
x264-snapshot-20120103-2245-stable/encoder/0000755000175000001440000000000011700673342017310 5ustar  videolanusersx264-snapshot-20120103-2245-stable/encoder/rdo.c0000644000175000001440000011125511700673342020245 0ustar  videolanusers/*****************************************************************************
 * rdo.c: rate-distortion optimization
 *****************************************************************************
 * Copyright (C) 2005-2011 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *          Jason Garrett-Glaser <darkshikari@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

/* duplicate all the writer functions, just calculating bit cost
 * instead of writing the bitstream.
 * TODO: use these for fast 1st pass too. */

#define RDO_SKIP_BS 1

/* Transition and size tables for abs<9 MVD and residual coding */
/* Consist of i_prefix-1 1s, one zero, and a bypass sign bit */
static uint8_t cabac_transition_unary[15][128];
static uint16_t cabac_size_unary[15][128];
/* Transition and size tables for abs>9 MVD */
/* Consist of 5 1s and a bypass sign bit */
static uint8_t cabac_transition_5ones[128];
static uint16_t cabac_size_5ones[128];

/* CAVLC: produces exactly the same bit count as a normal encode */
/* this probably still leaves some unnecessary computations */
#define bs_write1(s,v)     ((s)->i_bits_encoded += 1)
#define bs_write(s,n,v)    ((s)->i_bits_encoded += (n))
#define bs_write_ue(s,v)   ((s)->i_bits_encoded += bs_size_ue(v))
#define bs_write_se(s,v)   ((s)->i_bits_encoded += bs_size_se(v))
#define bs_write_te(s,v,l) ((s)->i_bits_encoded += bs_size_te(v,l))
#define x264_macroblock_write_cavlc  static x264_macroblock_size_cavlc
#include "cavlc.c"

/* CABAC: not exactly the same. x264_cabac_size_decision() keeps track of
 * fractional bits, but only finite precision. */
#undef  x264_cabac_encode_decision
#undef  x264_cabac_encode_decision_noup
#undef  x264_cabac_encode_bypass
#undef  x264_cabac_encode_terminal
#define x264_cabac_encode_decision(c,x,v) x264_cabac_size_decision(c,x,v)
#define x264_cabac_encode_decision_noup(c,x,v) x264_cabac_size_decision_noup(c,x,v)
#define x264_cabac_encode_terminal(c)     ((c)->f8_bits_encoded += 7)
#define x264_cabac_encode_bypass(c,v)     ((c)->f8_bits_encoded += 256)
#define x264_cabac_encode_ue_bypass(c,e,v) ((c)->f8_bits_encoded += (bs_size_ue_big(v+(1<<e)-1)-e)<<8)
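/* (f8_bits_encoded counts in 1/256-bit units, so the 256 above is exactly
 * one bit per bypass-coded bin.) */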
#define x264_macroblock_write_cabac  static x264_macroblock_size_cabac
#include "cabac.c"

#define COPY_CABAC h->mc.memcpy_aligned( &cabac_tmp.f8_bits_encoded, &h->cabac.f8_bits_encoded, \
        sizeof(x264_cabac_t) - offsetof(x264_cabac_t,f8_bits_encoded) - (CHROMA444 ? 0 : (1024+12)-460) )
#define COPY_CABAC_PART( pos, size )\
        memcpy( &cb->state[pos], &h->cabac.state[pos], size )
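
/* The fenc hadamard/satd caches below store score+1 so that a zero entry
 * means "not computed yet"; hence the +1 on store and the -1 on a hit. */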

static ALWAYS_INLINE uint64_t cached_hadamard( x264_t *h, int size, int x, int y )
{
    static const uint8_t hadamard_shift_x[4] = {4,   4,   3,   3};
    static const uint8_t hadamard_shift_y[4] = {4-0, 3-0, 4-1, 3-1};
    static const uint8_t  hadamard_offset[4] = {0,   1,   3,   5};
    int cache_index = (x >> hadamard_shift_x[size]) + (y >> hadamard_shift_y[size])
                    + hadamard_offset[size];
    uint64_t res = h->mb.pic.fenc_hadamard_cache[cache_index];
    if( res )
        return res - 1;
    else
    {
        pixel *fenc = h->mb.pic.p_fenc[0] + x + y*FENC_STRIDE;
        res = h->pixf.hadamard_ac[size]( fenc, FENC_STRIDE );
        h->mb.pic.fenc_hadamard_cache[cache_index] = res + 1;
        return res;
    }
}

static ALWAYS_INLINE int cached_satd( x264_t *h, int size, int x, int y )
{
    static const uint8_t satd_shift_x[3] = {3,   2,   2};
    static const uint8_t satd_shift_y[3] = {2-1, 3-2, 2-2};
    static const uint8_t  satd_offset[3] = {0,   8,   16};
    ALIGNED_16( static pixel zero[16] ) = {0};
    int cache_index = (x >> satd_shift_x[size - PIXEL_8x4]) + (y >> satd_shift_y[size - PIXEL_8x4])
                    + satd_offset[size - PIXEL_8x4];
    int res = h->mb.pic.fenc_satd_cache[cache_index];
    if( res )
        return res - 1;
    else
    {
        pixel *fenc = h->mb.pic.p_fenc[0] + x + y*FENC_STRIDE;
        int dc = h->pixf.sad[size]( fenc, FENC_STRIDE, zero, 0 ) >> 1;
        res = h->pixf.satd[size]( fenc, FENC_STRIDE, zero, 0 ) - dc;
        h->mb.pic.fenc_satd_cache[cache_index] = res + 1;
        return res;
    }
}

/* Psy RD distortion metric: SSD plus "Absolute Difference of Complexities" */
/* SATD and SA8D are used to measure block complexity. */
/* Differences in both the SATD and SA8D scores are used to avoid bias from the DCT size.  Using SATD */
/* only, for example, results in overuse of the 8x8 DCT, while the opposite occurs when using only SA8D. */

/* FIXME:  Is there a better metric than averaged SATD/SA8D difference for complexity difference? */
/* The Hadamard transform is recursive, so SATD+SA8D can be computed faster by exploiting this fact. */
/* This optimization can also be used in non-RD transform decision. */

static inline int ssd_plane( x264_t *h, int size, int p, int x, int y )
{
    ALIGNED_16( static pixel zero[16] ) = {0};
    int satd = 0;
    pixel *fdec = h->mb.pic.p_fdec[p] + x + y*FDEC_STRIDE;
    pixel *fenc = h->mb.pic.p_fenc[p] + x + y*FENC_STRIDE;
    if( p == 0 && h->mb.i_psy_rd )
    {
        /* If the plane is smaller than 8x8, we can't do an SA8D; this probably isn't a big problem. */
        if( size <= PIXEL_8x8 )
        {
            uint64_t fdec_acs = h->pixf.hadamard_ac[size]( fdec, FDEC_STRIDE );
            uint64_t fenc_acs = cached_hadamard( h, size, x, y );
            satd = abs((int32_t)fdec_acs - (int32_t)fenc_acs)
                 + abs((int32_t)(fdec_acs>>32) - (int32_t)(fenc_acs>>32));
            satd >>= 1;
        }
        else
        {
            int dc = h->pixf.sad[size]( fdec, FDEC_STRIDE, zero, 0 ) >> 1;
            satd = abs(h->pixf.satd[size]( fdec, FDEC_STRIDE, zero, 0 ) - dc - cached_satd( h, size, x, y ));
        }
        satd = (satd * h->mb.i_psy_rd * h->mb.i_psy_rd_lambda + 128) >> 8;
    }
    return h->pixf.ssd[size](fenc, FENC_STRIDE, fdec, FDEC_STRIDE) + satd;
}

static inline int ssd_mb( x264_t *h )
{
    int chroma_size = h->luma2chroma_pixel[PIXEL_16x16];
    int chroma_ssd = ssd_plane(h, chroma_size, 1, 0, 0) + ssd_plane(h, chroma_size, 2, 0, 0);
    chroma_ssd = ((uint64_t)chroma_ssd * h->mb.i_chroma_lambda2_offset + 128) >> 8;
    return ssd_plane(h, PIXEL_16x16, 0, 0, 0) + chroma_ssd;
}

static int x264_rd_cost_mb( x264_t *h, int i_lambda2 )
{
    int b_transform_bak = h->mb.b_transform_8x8;
    int i_ssd;
    int i_bits;
    int type_bak = h->mb.i_type;

    x264_macroblock_encode( h );

    if( h->mb.b_deblock_rdo )
        x264_macroblock_deblock( h );

    i_ssd = ssd_mb( h );

    if( IS_SKIP( h->mb.i_type ) )
    {
        i_bits = (1 * i_lambda2 + 128) >> 8;
    }
    else if( h->param.b_cabac )
    {
        x264_cabac_t cabac_tmp;
        COPY_CABAC;
        x264_macroblock_size_cabac( h, &cabac_tmp );
        i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 32768 ) >> 16;
    }
    else
    {
        x264_macroblock_size_cavlc( h );
        i_bits = ( h->out.bs.i_bits_encoded * i_lambda2 + 128 ) >> 8;
    }

    h->mb.b_transform_8x8 = b_transform_bak;
    h->mb.i_type = type_bak;

    return i_ssd + i_bits;
}

/* partition RD functions use 8 bits more precision to avoid large rounding errors at low QPs */
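/* e.g. x264_rd_cost_mb returns i_ssd + ((bits * i_lambda2) >> 8), while the
 * functions below return (i_ssd<<8) + bits * i_lambda2: the same quantity
 * scaled by 256, keeping 8 extra fractional bits through the comparison. */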

static uint64_t x264_rd_cost_subpart( x264_t *h, int i_lambda2, int i4, int i_pixel )
{
    uint64_t i_ssd, i_bits;

    x264_macroblock_encode_p4x4( h, i4 );
    if( i_pixel == PIXEL_8x4 )
        x264_macroblock_encode_p4x4( h, i4+1 );
    if( i_pixel == PIXEL_4x8 )
        x264_macroblock_encode_p4x4( h, i4+2 );

    i_ssd = ssd_plane( h, i_pixel, 0, block_idx_x[i4]*4, block_idx_y[i4]*4 );
    if( CHROMA444 )
    {
        int chromassd = ssd_plane( h, i_pixel, 1, block_idx_x[i4]*4, block_idx_y[i4]*4 )
                      + ssd_plane( h, i_pixel, 2, block_idx_x[i4]*4, block_idx_y[i4]*4 );
        chromassd = ((uint64_t)chromassd * h->mb.i_chroma_lambda2_offset + 128) >> 8;
        i_ssd += chromassd;
    }

    if( h->param.b_cabac )
    {
        x264_cabac_t cabac_tmp;
        COPY_CABAC;
        x264_subpartition_size_cabac( h, &cabac_tmp, i4, i_pixel );
        i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
    }
    else
        i_bits = x264_subpartition_size_cavlc( h, i4, i_pixel );

    return (i_ssd<<8) + i_bits;
}

uint64_t x264_rd_cost_part( x264_t *h, int i_lambda2, int i4, int i_pixel )
{
    uint64_t i_ssd, i_bits;
    int i8 = i4 >> 2;

    if( i_pixel == PIXEL_16x16 )
    {
        int i_cost = x264_rd_cost_mb( h, i_lambda2 );
        return i_cost;
    }

    if( i_pixel > PIXEL_8x8 )
        return x264_rd_cost_subpart( h, i_lambda2, i4, i_pixel );

    h->mb.i_cbp_luma = 0;

    x264_macroblock_encode_p8x8( h, i8 );
    if( i_pixel == PIXEL_16x8 )
        x264_macroblock_encode_p8x8( h, i8+1 );
    if( i_pixel == PIXEL_8x16 )
        x264_macroblock_encode_p8x8( h, i8+2 );

    int ssd_x = 8*(i8&1);
    int ssd_y = 8*(i8>>1);
    i_ssd = ssd_plane( h, i_pixel, 0, ssd_x, ssd_y );
    int chromapix = h->luma2chroma_pixel[i_pixel];
    int chromassd = ssd_plane( h, chromapix, 1, ssd_x>>CHROMA_H_SHIFT, ssd_y>>CHROMA_V_SHIFT )
                  + ssd_plane( h, chromapix, 2, ssd_x>>CHROMA_H_SHIFT, ssd_y>>CHROMA_V_SHIFT );
    i_ssd += ((uint64_t)chromassd * h->mb.i_chroma_lambda2_offset + 128) >> 8;

    if( h->param.b_cabac )
    {
        x264_cabac_t cabac_tmp;
        COPY_CABAC;
        x264_partition_size_cabac( h, &cabac_tmp, i8, i_pixel );
        i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
    }
    else
        i_bits = x264_partition_size_cavlc( h, i8, i_pixel ) * i_lambda2;

    return (i_ssd<<8) + i_bits;
}

static uint64_t x264_rd_cost_i8x8( x264_t *h, int i_lambda2, int i8, int i_mode, pixel edge[4][32] )
{
    uint64_t i_ssd, i_bits;
    int plane_count = CHROMA444 ? 3 : 1;
    int i_qp = h->mb.i_qp;
    h->mb.i_cbp_luma &= ~(1<<i8);
    h->mb.b_transform_8x8 = 1;

    for( int p = 0; p < plane_count; p++ )
    {
        x264_mb_encode_i8x8( h, p, i8, i_qp, i_mode, edge[p], 1 );
        i_qp = h->mb.i_chroma_qp;
    }

    i_ssd = ssd_plane( h, PIXEL_8x8, 0, (i8&1)*8, (i8>>1)*8 );
    if( CHROMA444 )
    {
        int chromassd = ssd_plane( h, PIXEL_8x8, 1, (i8&1)*8, (i8>>1)*8 )
                      + ssd_plane( h, PIXEL_8x8, 2, (i8&1)*8, (i8>>1)*8 );
        chromassd = ((uint64_t)chromassd * h->mb.i_chroma_lambda2_offset + 128) >> 8;
        i_ssd += chromassd;
    }

    if( h->param.b_cabac )
    {
        x264_cabac_t cabac_tmp;
        COPY_CABAC;
        x264_partition_i8x8_size_cabac( h, &cabac_tmp, i8, i_mode );
        i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
    }
    else
        i_bits = x264_partition_i8x8_size_cavlc( h, i8, i_mode ) * i_lambda2;

    return (i_ssd<<8) + i_bits;
}

static uint64_t x264_rd_cost_i4x4( x264_t *h, int i_lambda2, int i4, int i_mode )
{
    uint64_t i_ssd, i_bits;
    int plane_count = CHROMA444 ? 3 : 1;
    int i_qp = h->mb.i_qp;

    for( int p = 0; p < plane_count; p++ )
    {
        x264_mb_encode_i4x4( h, p, i4, i_qp, i_mode, 1 );
        i_qp = h->mb.i_chroma_qp;
    }

    i_ssd = ssd_plane( h, PIXEL_4x4, 0, block_idx_x[i4]*4, block_idx_y[i4]*4 );
    if( CHROMA444 )
    {
        int chromassd = ssd_plane( h, PIXEL_4x4, 1, block_idx_x[i4]*4, block_idx_y[i4]*4 )
                      + ssd_plane( h, PIXEL_4x4, 2, block_idx_x[i4]*4, block_idx_y[i4]*4 );
        chromassd = ((uint64_t)chromassd * h->mb.i_chroma_lambda2_offset + 128) >> 8;
        i_ssd += chromassd;
    }

    if( h->param.b_cabac )
    {
        x264_cabac_t cabac_tmp;
        COPY_CABAC;
        x264_partition_i4x4_size_cabac( h, &cabac_tmp, i4, i_mode );
        i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
    }
    else
        i_bits = x264_partition_i4x4_size_cavlc( h, i4, i_mode ) * i_lambda2;

    return (i_ssd<<8) + i_bits;
}

static uint64_t x264_rd_cost_chroma( x264_t *h, int i_lambda2, int i_mode, int b_dct )
{
    uint64_t i_ssd, i_bits;

    if( b_dct )
        x264_mb_encode_chroma( h, 0, h->mb.i_chroma_qp );

    int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
    i_ssd = ssd_plane( h, chromapix, 1, 0, 0 )
          + ssd_plane( h, chromapix, 2, 0, 0 );

    h->mb.i_chroma_pred_mode = i_mode;

    if( h->param.b_cabac )
    {
        x264_cabac_t cabac_tmp;
        COPY_CABAC;
        x264_chroma_size_cabac( h, &cabac_tmp );
        i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
    }
    else
        i_bits = x264_chroma_size_cavlc( h ) * i_lambda2;

    return (i_ssd<<8) + i_bits;
}
/****************************************************************************
 * Trellis RD quantization
 ****************************************************************************/

#define TRELLIS_SCORE_MAX ((uint64_t)1<<50)
#define CABAC_SIZE_BITS 8
#define SSD_WEIGHT_BITS 5
#define LAMBDA_BITS 4

/* precalculate the cost of coding various combinations of bits in a single context */
void x264_rdo_init( void )
{
    for( int i_prefix = 0; i_prefix < 15; i_prefix++ )
    {
        for( int i_ctx = 0; i_ctx < 128; i_ctx++ )
        {
            int f8_bits = 0;
            uint8_t ctx = i_ctx;

            for( int i = 1; i < i_prefix; i++ )
                f8_bits += x264_cabac_size_decision2( &ctx, 1 );
            if( i_prefix > 0 && i_prefix < 14 )
                f8_bits += x264_cabac_size_decision2( &ctx, 0 );
            f8_bits += 1 << CABAC_SIZE_BITS; //sign

            cabac_size_unary[i_prefix][i_ctx] = f8_bits;
            cabac_transition_unary[i_prefix][i_ctx] = ctx;
        }
    }
    for( int i_ctx = 0; i_ctx < 128; i_ctx++ )
    {
        int f8_bits = 0;
        uint8_t ctx = i_ctx;

        for( int i = 0; i < 5; i++ )
            f8_bits += x264_cabac_size_decision2( &ctx, 1 );
        f8_bits += 1 << CABAC_SIZE_BITS; //sign

        cabac_size_5ones[i_ctx] = f8_bits;
        cabac_transition_5ones[i_ctx] = ctx;
    }
}

typedef struct
{
    int64_t score;
    int level_idx; // index into level_tree[]
    uint8_t cabac_state[10]; //just the contexts relevant to coding abs_level_m1
} trellis_node_t;

// TODO:
// save cabac state between blocks?
// use trellis' RD score instead of x264_mb_decimate_score?
// code 8x8 sig/last flags forwards with deadzone and save the contexts at
//   each position?
// change weights when using CQMs?

// possible optimizations:
// make scores fit in 32bit
// save quantized coefs during rd, to avoid a duplicate trellis in the final encode
// if trellissing all MBRD modes, finish SSD calculation so we can skip all of
//   the normal dequant/idct/ssd/cabac

// the unquant_mf here is not the same as dequant_mf:
// in normal operation (dct->quant->dequant->idct) the dct and idct are not
// normalized. quant/dequant absorb those scaling factors.
// in this function, we just do (quant->unquant) and want the output to be
// comparable to the input. so unquant is the direct inverse of quant,
// and uses the dct scaling factors, not the idct ones.
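//
// illustrative round trip (16-bit quant scale, 8-bit unquant scale),
// matching the q and unquant_abs_level computations below:
//   level = ( f + abs(coef) * quant_mf[zigzag[i]] ) >> 16;
//   recon = ( unquant_mf[zigzag[i]] * level + 128 ) >> 8;  // ~ abs(coef)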

static ALWAYS_INLINE
int quant_trellis_cabac( x264_t *h, dctcoef *dct,
                         const udctcoef *quant_mf, const int *unquant_mf,
                         const uint16_t *coef_weight, const uint8_t *zigzag,
                         int ctx_block_cat, int i_lambda2, int b_ac,
                         int b_chroma, int dc, int i_coefs, int idx )
{
    udctcoef abs_coefs[64];
    int8_t signs[64];
    trellis_node_t nodes[2][8];
    trellis_node_t *nodes_cur = nodes[0];
    trellis_node_t *nodes_prev = nodes[1];
    trellis_node_t *bnode;
    const int b_interlaced = MB_INTERLACED;
    uint8_t *cabac_state_sig = &h->cabac.state[ significant_coeff_flag_offset[b_interlaced][ctx_block_cat] ];
    uint8_t *cabac_state_last = &h->cabac.state[ last_coeff_flag_offset[b_interlaced][ctx_block_cat] ];
    const uint8_t *levelgt1_ctx = b_chroma && dc ? coeff_abs_levelgt1_ctx_chroma_dc : coeff_abs_levelgt1_ctx;
    const int f = 1 << 15; // no deadzone
    int i_last_nnz;
    int i;

    // (# of coefs) * (# of ctx) * (# of levels tried) = 1024
    // we don't need to keep all of those: (# of coefs) * (# of ctx) would be enough,
    // but it takes more time to remove dead states than you gain in reduced memory.
    struct
    {
        uint16_t abs_level;
        uint16_t next;
    } level_tree[64*8*2];
    int i_levels_used = 1;

    /* init coefs */
    for( i = i_coefs-1; i >= b_ac; i-- )
        if( (unsigned)(dct[zigzag[i]] * (dc?quant_mf[0]>>1:quant_mf[zigzag[i]]) + f-1) >= 2*f )
            break;

    if( i < b_ac )
    {
        /* We only need to zero an empty 4x4 block. 8x8 can be
           implicitly emptied via zero nnz, as can dc. */
        if( i_coefs == 16 && !dc )
            memset( dct, 0, 16 * sizeof(dctcoef) );
        return 0;
    }

    i_last_nnz = i;
    idx &= i_coefs == 64 ? 3 : 15;

    for( ; i >= b_ac; i-- )
    {
        int coef = dct[zigzag[i]];
        abs_coefs[i] = abs(coef);
        signs[i] = coef>>31 | 1;
    }

    /* init trellis */
    for( int j = 1; j < 8; j++ )
        nodes_cur[j].score = TRELLIS_SCORE_MAX;
    nodes_cur[0].score = 0;
    nodes_cur[0].level_idx = 0;
    level_tree[0].abs_level = 0;
    level_tree[0].next = 0;

    // coefs are processed in reverse order, because that's how the abs value is coded.
    // last_coef and significant_coef flags are normally coded in forward order, but
    // we have to reverse them to match the levels.
    // in 4x4 blocks, last_coef and significant_coef use a separate context for each
    // position, so the order doesn't matter, and we don't even have to update their contexts.
    // in 8x8 blocks, some positions share contexts, so we'll just have to hope that
    // cabac isn't too sensitive.

    memcpy( nodes_cur[0].cabac_state, &h->cabac.state[ coeff_abs_level_m1_offset[ctx_block_cat] ], 10 );

    for( i = i_last_nnz; i >= b_ac; i-- )
    {
        int i_coef = abs_coefs[i];
        int q = ( f + i_coef * (dc?quant_mf[0]>>1:quant_mf[zigzag[i]]) ) >> 16;
        int cost_sig[2], cost_last[2];
        trellis_node_t n;

        // skip 0s: this doesn't affect the output, but saves some unnecessary computation.
        if( q == 0 )
        {
            // no need to calculate ssd of 0s: it's the same in all nodes.
            // no need to modify level_tree for ctx=0: it starts with an infinite loop of 0s.
            int sigindex = !dc && i_coefs == 64 ? significant_coeff_flag_offset_8x8[b_interlaced][i] :
                           b_chroma && dc && i_coefs == 8 ? coeff_flag_offset_chroma_422_dc[i] : i;
            const uint32_t cost_sig0 = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 0 )
                                     * (uint64_t)i_lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
            for( int j = 1; j < 8; j++ )
            {
                if( nodes_cur[j].score != TRELLIS_SCORE_MAX )
                {
#define SET_LEVEL(n,l) \
                    level_tree[i_levels_used].abs_level = l; \
                    level_tree[i_levels_used].next = n.level_idx; \
                    n.level_idx = i_levels_used; \
                    i_levels_used++;

                    SET_LEVEL( nodes_cur[j], 0 );
                    nodes_cur[j].score += cost_sig0;
                }
            }
            continue;
        }

        XCHG( trellis_node_t*, nodes_cur, nodes_prev );

        for( int j = 0; j < 8; j++ )
            nodes_cur[j].score = TRELLIS_SCORE_MAX;

        if( i < i_coefs-1 )
        {
            int sigindex  = !dc && i_coefs == 64 ? significant_coeff_flag_offset_8x8[b_interlaced][i] :
                            b_chroma && dc && i_coefs == 8 ? coeff_flag_offset_chroma_422_dc[i] : i;
            int lastindex = !dc && i_coefs == 64 ? last_coeff_flag_offset_8x8[i] :
                            b_chroma && dc && i_coefs == 8 ? coeff_flag_offset_chroma_422_dc[i] : i;
            cost_sig[0] = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 0 );
            cost_sig[1] = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 1 );
            cost_last[0] = x264_cabac_size_decision_noup2( &cabac_state_last[lastindex], 0 );
            cost_last[1] = x264_cabac_size_decision_noup2( &cabac_state_last[lastindex], 1 );
        }
        else
        {
            cost_sig[0] = cost_sig[1] = 0;
            cost_last[0] = cost_last[1] = 0;
        }

        // there are a few cases where increasing the coeff magnitude helps,
        // but it's only around .003 dB, and skipping them ~doubles the speed of trellis.
        // could also try q-2: that sometimes helps, but also sometimes decimates blocks
        // that are better left coded, especially at QP > 40.
        for( int abs_level = q; abs_level >= q-1; abs_level-- )
        {
            int unquant_abs_level = (((dc?unquant_mf[0]<<1:unquant_mf[zigzag[i]]) * abs_level + 128) >> 8);
            int d = i_coef - unquant_abs_level;
            int64_t ssd;
            /* Psy trellis: bias in favor of higher AC coefficients in the reconstructed frame. */
            if( h->mb.i_psy_trellis && i && !dc && !b_chroma )
            {
                int orig_coef = (i_coefs == 64) ? h->mb.pic.fenc_dct8[idx][zigzag[i]] : h->mb.pic.fenc_dct4[idx][zigzag[i]];
                int predicted_coef = orig_coef - i_coef * signs[i];
                int psy_value = h->mb.i_psy_trellis * abs(predicted_coef + unquant_abs_level * signs[i]);
                int psy_weight = (i_coefs == 64) ? x264_dct8_weight_tab[zigzag[i]] : x264_dct4_weight_tab[zigzag[i]];
                ssd = (int64_t)d*d * coef_weight[i] - psy_weight * psy_value;
            }
            else
            /* FIXME: for i16x16 dc is this weight optimal? */
                ssd = (int64_t)d*d * (dc?256:coef_weight[i]);

            for( int j = 0; j < 8; j++ )
            {
                int node_ctx = j;
                if( nodes_prev[j].score == TRELLIS_SCORE_MAX )
                    continue;
                n = nodes_prev[j];

                /* code the proposed level, and count how much entropy it would take */
                if( abs_level || node_ctx )
                {
                    unsigned f8_bits = cost_sig[ abs_level != 0 ];
                    if( abs_level )
                    {
                        const int i_prefix = X264_MIN( abs_level - 1, 14 );
                        f8_bits += cost_last[ node_ctx == 0 ];
                        f8_bits += x264_cabac_size_decision2( &n.cabac_state[coeff_abs_level1_ctx[node_ctx]], i_prefix > 0 );
                        if( i_prefix > 0 )
                        {
                            uint8_t *ctx = &n.cabac_state[levelgt1_ctx[node_ctx]];
                            f8_bits += cabac_size_unary[i_prefix][*ctx];
                            *ctx = cabac_transition_unary[i_prefix][*ctx];
                            if( abs_level >= 15 )
                                f8_bits += bs_size_ue_big( abs_level - 15 ) << CABAC_SIZE_BITS;
                            node_ctx = coeff_abs_level_transition[1][node_ctx];
                        }
                        else
                        {
                            f8_bits += 1 << CABAC_SIZE_BITS;
                            node_ctx = coeff_abs_level_transition[0][node_ctx];
                        }
                    }
                    n.score += (uint64_t)f8_bits * i_lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
                }

                if( j || i || dc )
                    n.score += ssd;
                /* Optimize rounding for DC coefficients in DC-only luma 4x4/8x8 blocks. */
                else
                {
                    d = i_coef * signs[0] - ((unquant_abs_level * signs[0] + 8)&~15);
                    n.score += (int64_t)d*d * coef_weight[i];
                }

                /* save the node if it's better than any existing node with the same cabac ctx */
                if( n.score < nodes_cur[node_ctx].score )
                {
                    SET_LEVEL( n, abs_level );
                    nodes_cur[node_ctx] = n;
                }
            }
        }
    }

    /* output levels from the best path through the trellis */
    bnode = &nodes_cur[0];
    for( int j = 1; j < 8; j++ )
        if( nodes_cur[j].score < bnode->score )
            bnode = &nodes_cur[j];

    if( bnode == &nodes_cur[0] )
    {
        if( i_coefs == 16 && !dc )
            memset( dct, 0, 16 * sizeof(dctcoef) );
        return 0;
    }

    int level = bnode->level_idx;
    for( i = b_ac; level; i++ )
    {
        dct[zigzag[i]] = level_tree[level].abs_level * signs[i];
        level = level_tree[level].next;
    }
    for( ; i < i_coefs; i++ )
        dct[zigzag[i]] = 0;

    return 1;
}

/* FIXME: This is a gigantic hack.  See below.
 *
 * CAVLC is much more difficult to trellis than CABAC.
 *
 * CABAC has only three states to track: significance map, last, and the
 * level state machine.
 * CAVLC, by comparison, has five: coeff_token (trailing + total),
 * total_zeroes, zero_run, and the level state machine.
 *
 * I know of no paper that has managed to design a close-to-optimal trellis
 * that covers all five of these and isn't exponential-time.  As a result, this
 * "trellis" isn't: it's just a QNS search.  Patches welcome for something better.
 * It's actually surprisingly fast, albeit not quite optimal.  It's pretty close
 * though; since CAVLC only has 2^16 possible rounding modes (assuming only two
 * roundings as options), a bruteforce search is feasible.  Testing shows
 * that this QNS is reasonably close to optimal in terms of compression.
 *
 * TODO:
 *  Don't bother changing large coefficients when it wouldn't affect bit cost
 *  (e.g. only affecting bypassed suffix bits).
 *  Don't re-run all parts of CAVLC bit cost calculation when not necessary.
 *  e.g. when changing a coefficient from one non-zero value to another in
 *  such a way that trailing ones and suffix length isn't affected. */
static ALWAYS_INLINE
int quant_trellis_cavlc( x264_t *h, dctcoef *dct,
                         const udctcoef *quant_mf, const int *unquant_mf,
                         const uint16_t *coef_weight, const uint8_t *zigzag,
                         int ctx_block_cat, int i_lambda2, int b_ac,
                         int b_chroma, int dc, int i_coefs, int idx, int b_8x8 )
{
    ALIGNED_16( dctcoef quant_coefs[2][16] );
    ALIGNED_16( dctcoef coefs[16] ) = {0};
    int delta_distortion[16];
    int64_t score = 1ULL<<62;
    int i, j;
    const int f = 1<<15;
    int nC = b_chroma && dc ? 3 + (i_coefs>>2)
                            : ct_index[x264_mb_predict_non_zero_code( h, !b_chroma && dc ? (idx - LUMA_DC)*16 : idx )];

    /* Code for handling 8x8dct -> 4x4dct CAVLC munging.  Input/output use a different
     * step/start/end than internal processing. */
    int step = 1;
    int start = b_ac;
    int end = i_coefs - 1;
    if( b_8x8 )
    {
        start = idx&3;
        end = 60 + start;
        step = 4;
    }
    idx &= 15;

    i_lambda2 <<= LAMBDA_BITS;

    /* Find last non-zero coefficient. */
    for( i = end; i >= start; i -= step )
        if( (unsigned)(dct[zigzag[i]] * (dc?quant_mf[0]>>1:quant_mf[zigzag[i]]) + f-1) >= 2*f )
            break;

    if( i < start )
        goto zeroblock;

    /* Prepare for QNS search: calculate distortion caused by each DCT coefficient
     * rounding to be searched.
     *
     * We only search two roundings (nearest and nearest-1) like in CABAC trellis,
     * so we just store the difference in distortion between them. */
    int i_last_nnz = b_8x8 ? i >> 2 : i;
    int coef_mask = 0;
    int round_mask = 0;
    for( i = b_ac, j = start; i <= i_last_nnz; i++, j += step )
    {
        int coef = dct[zigzag[j]];
        int abs_coef = abs(coef);
        int sign = coef < 0 ? -1 : 1;
        int nearest_quant = ( f + abs_coef * (dc?quant_mf[0]>>1:quant_mf[zigzag[j]]) ) >> 16;
        quant_coefs[1][i] = quant_coefs[0][i] = sign * nearest_quant;
        coefs[i] = quant_coefs[1][i];
        if( nearest_quant )
        {
            /* We initialize the trellis with a deadzone halfway between nearest rounding
             * and always-round-down.  This gives much better results than initializing to either
             * extreme.
             * FIXME: should we initialize to the deadzones used by deadzone quant? */
            int deadzone_quant = ( f/2 + abs_coef * (dc?quant_mf[0]>>1:quant_mf[zigzag[j]]) ) >> 16;
            int unquant1 = (((dc?unquant_mf[0]<<1:unquant_mf[zigzag[j]]) * (nearest_quant-0) + 128) >> 8);
            int unquant0 = (((dc?unquant_mf[0]<<1:unquant_mf[zigzag[j]]) * (nearest_quant-1) + 128) >> 8);
            int d1 = abs_coef - unquant1;
            int d0 = abs_coef - unquant0;
            delta_distortion[i] = (d0*d0 - d1*d1) * (dc?256:coef_weight[j]);

            /* Psy trellis: bias in favor of higher AC coefficients in the reconstructed frame. */
            if( h->mb.i_psy_trellis && j && !dc && !b_chroma )
            {
                int orig_coef = b_8x8 ? h->mb.pic.fenc_dct8[idx>>2][zigzag[j]] : h->mb.pic.fenc_dct4[idx][zigzag[j]];
                int predicted_coef = orig_coef - coef;
                int psy_weight = b_8x8 ? x264_dct8_weight_tab[zigzag[j]] : x264_dct4_weight_tab[zigzag[j]];
                int psy_value0 = h->mb.i_psy_trellis * abs(predicted_coef + unquant0 * sign);
                int psy_value1 = h->mb.i_psy_trellis * abs(predicted_coef + unquant1 * sign);
                delta_distortion[i] += (psy_value0 - psy_value1) * psy_weight;
            }

            quant_coefs[0][i] = sign * (nearest_quant-1);
            if( deadzone_quant != nearest_quant )
                coefs[i] = quant_coefs[0][i];
            else
                round_mask |= 1 << i;
        }
        else
            delta_distortion[i] = 0;
        coef_mask |= (!!coefs[i]) << i;
    }

    /* Calculate the cost of the starting state. */
    h->out.bs.i_bits_encoded = 0;
    if( !coef_mask )
        bs_write_vlc( &h->out.bs, x264_coeff0_token[nC] );
    else
        x264_cavlc_block_residual_internal( h, ctx_block_cat, coefs + b_ac, nC );
    score = (int64_t)h->out.bs.i_bits_encoded * i_lambda2;

    /* QNS loop: pick the change that improves RD the most, apply it, repeat.
     * coef_mask and round_mask are used to simplify tracking of nonzeroness
     * and rounding modes chosen. */
    while( 1 )
    {
        int64_t iter_score = score;
        int iter_distortion_delta = 0;
        int iter_coef = -1;
        int iter_mask = coef_mask;
        int iter_round = round_mask;
        for( i = b_ac; i <= i_last_nnz; i++ )
        {
            if( !delta_distortion[i] )
                continue;

            /* Set up all the variables for this iteration. */
            int cur_round = round_mask ^ (1 << i);
            int round_change = (cur_round >> i)&1;
            int old_coef = coefs[i];
            int new_coef = quant_coefs[round_change][i];
            int cur_mask = (coef_mask&~(1 << i))|(!!new_coef << i);
            int cur_distortion_delta = delta_distortion[i] * (round_change ? -1 : 1);
            int64_t cur_score = cur_distortion_delta;
            coefs[i] = new_coef;

            /* Count up bits. */
            h->out.bs.i_bits_encoded = 0;
            if( !cur_mask )
                bs_write_vlc( &h->out.bs, x264_coeff0_token[nC] );
            else
                x264_cavlc_block_residual_internal( h, ctx_block_cat, coefs + b_ac, nC );
            cur_score += (int64_t)h->out.bs.i_bits_encoded * i_lambda2;

            coefs[i] = old_coef;
            if( cur_score < iter_score )
            {
                iter_score = cur_score;
                iter_coef = i;
                iter_mask = cur_mask;
                iter_round = cur_round;
                iter_distortion_delta = cur_distortion_delta;
            }
        }
        if( iter_coef >= 0 )
        {
            score = iter_score - iter_distortion_delta;
            coef_mask = iter_mask;
            round_mask = iter_round;
            coefs[iter_coef] = quant_coefs[((round_mask >> iter_coef)&1)][iter_coef];
            /* Don't try adjusting coefficients we've already adjusted.
             * Testing suggests this doesn't hurt results -- and sometimes actually helps. */
            delta_distortion[iter_coef] = 0;
        }
        else
            break;
    }

    if( coef_mask )
    {
        for( i = b_ac, j = start; i <= i_last_nnz; i++, j += step )
            dct[zigzag[j]] = coefs[i];
        for( ; j <= end; j += step )
            dct[zigzag[j]] = 0;
        return 1;
    }

zeroblock:
    if( !dc )
    {
        if( b_8x8 )
            for( i = start; i <= end; i+=step )
                dct[zigzag[i]] = 0;
        else
            memset( dct, 0, 16*sizeof(dctcoef) );
    }
    return 0;
}

int x264_quant_luma_dc_trellis( x264_t *h, dctcoef *dct, int i_quant_cat, int i_qp, int ctx_block_cat, int b_intra, int idx )
{
    if( h->param.b_cabac )
        return quant_trellis_cabac( h, dct,
            h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp], NULL, x264_zigzag_scan4[MB_INTERLACED],
            ctx_block_cat, h->mb.i_trellis_lambda2[0][b_intra], 0, 0, 1, 16, idx );

    return quant_trellis_cavlc( h, dct,
        h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp], NULL, x264_zigzag_scan4[MB_INTERLACED],
        DCT_LUMA_DC, h->mb.i_trellis_lambda2[0][b_intra], 0, 0, 1, 16, idx, 0 );
}

static const uint8_t x264_zigzag_scan2x2[4] = { 0, 1, 2, 3 };
static const uint8_t x264_zigzag_scan2x4[8] = { 0, 2, 1, 4, 6, 3, 5, 7 };

int x264_quant_chroma_dc_trellis( x264_t *h, dctcoef *dct, int i_qp, int b_intra, int idx )
{
    const uint8_t *zigzag;
    int num_coefs;
    int quant_cat = CQM_4IC+1 - b_intra;

    if( CHROMA_FORMAT == CHROMA_422 )
    {
        zigzag = x264_zigzag_scan2x4;
        num_coefs = 8;
    }
    else
    {
        zigzag = x264_zigzag_scan2x2;
        num_coefs = 4;
    }

    if( h->param.b_cabac )
        return quant_trellis_cabac( h, dct,
            h->quant4_mf[quant_cat][i_qp], h->unquant4_mf[quant_cat][i_qp], NULL, zigzag,
            DCT_CHROMA_DC, h->mb.i_trellis_lambda2[1][b_intra], 0, 1, 1, num_coefs, idx );

    return quant_trellis_cavlc( h, dct,
        h->quant4_mf[quant_cat][i_qp], h->unquant4_mf[quant_cat][i_qp], NULL, zigzag,
        DCT_CHROMA_DC, h->mb.i_trellis_lambda2[1][b_intra], 0, 1, 1, num_coefs, idx, 0 );
}

int x264_quant_4x4_trellis( x264_t *h, dctcoef *dct, int i_quant_cat,
                            int i_qp, int ctx_block_cat, int b_intra, int b_chroma, int idx )
{
    static const uint8_t ctx_ac[14] = {0,1,0,0,1,0,0,1,0,0,0,1,0,0};
    int b_ac = ctx_ac[ctx_block_cat];
    if( h->param.b_cabac )
        return quant_trellis_cabac( h, dct,
            h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp],
            x264_dct4_weight2_zigzag[MB_INTERLACED],
            x264_zigzag_scan4[MB_INTERLACED],
            ctx_block_cat, h->mb.i_trellis_lambda2[b_chroma][b_intra], b_ac, b_chroma, 0, 16, idx );

    return quant_trellis_cavlc( h, dct,
            h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp],
            x264_dct4_weight2_zigzag[MB_INTERLACED],
            x264_zigzag_scan4[MB_INTERLACED],
            ctx_block_cat, h->mb.i_trellis_lambda2[b_chroma][b_intra], b_ac, b_chroma, 0, 16, idx, 0 );
}

int x264_quant_8x8_trellis( x264_t *h, dctcoef *dct, int i_quant_cat,
                            int i_qp, int ctx_block_cat, int b_intra, int b_chroma, int idx )
{
    if( h->param.b_cabac )
    {
        return quant_trellis_cabac( h, dct,
            h->quant8_mf[i_quant_cat][i_qp], h->unquant8_mf[i_quant_cat][i_qp],
            x264_dct8_weight2_zigzag[MB_INTERLACED],
            x264_zigzag_scan8[MB_INTERLACED],
            ctx_block_cat, h->mb.i_trellis_lambda2[b_chroma][b_intra], 0, b_chroma, 0, 64, idx );
    }

    /* 8x8 CAVLC is split into 4 4x4 blocks */
    int nzaccum = 0;
    for( int i = 0; i < 4; i++ )
    {
        int nz = quant_trellis_cavlc( h, dct,
            h->quant8_mf[i_quant_cat][i_qp], h->unquant8_mf[i_quant_cat][i_qp],
            x264_dct8_weight2_zigzag[MB_INTERLACED],
            x264_zigzag_scan8[MB_INTERLACED],
            DCT_LUMA_4x4, h->mb.i_trellis_lambda2[b_chroma][b_intra], 0, b_chroma, 0, 16, idx*4+i, 1 );
        /* Set up nonzero count for future calls */
        h->mb.cache.non_zero_count[x264_scan8[idx*4+i]] = nz;
        nzaccum |= nz;
    }
    return nzaccum;
}
x264-snapshot-20120103-2245-stable/encoder/encoder.c0000644000175000001440000043743411700673342021112 0ustar  videolanusers/*****************************************************************************
 * encoder.c: top-level encoder functions
 *****************************************************************************
 * Copyright (C) 2003-2011 x264 project
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
 *          Jason Garrett-Glaser <darkshikari@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "common/common.h"

#include "set.h"
#include "analyse.h"
#include "ratecontrol.h"
#include "macroblock.h"
#include "me.h"

#if HAVE_VISUALIZE
#include "common/visualize.h"
#endif

//#define DEBUG_MB_TYPE

#define bs_write_ue bs_write_ue_big

static int x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
                                   x264_nal_t **pp_nal, int *pi_nal,
                                   x264_picture_t *pic_out );

/****************************************************************************
 *
 ******************************* x264 libs **********************************
 *
 ****************************************************************************/
static double x264_psnr( double sqe, double size )
{
    double mse = sqe / (PIXEL_MAX*PIXEL_MAX * size);
    if( mse <= 0.0000000001 ) /* Max 100dB */
        return 100;

    return -10.0 * log10( mse );
}
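
/* e.g. (illustrative) mse = 0.0001 gives -10*log10(0.0001) = 40 dB */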

static double x264_ssim( double ssim )
{
    return -10.0 * log10( 1 - ssim );
}
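
/* e.g. (illustrative) ssim = 0.99 maps to -10*log10(1-0.99) = 20 dB */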

static void x264_frame_dump( x264_t *h )
{
    FILE *f = fopen( h->param.psz_dump_yuv, "r+b" );
    if( !f )
        return;

    /* Write the frame in display order */
    int frame_size = FRAME_SIZE( h->param.i_height * h->param.i_width * sizeof(pixel) );
    fseek( f, (uint64_t)h->fdec->i_frame * frame_size, SEEK_SET );
    for( int p = 0; p < (CHROMA444 ? 3 : 1); p++ )
        for( int y = 0; y < h->param.i_height; y++ )
            fwrite( &h->fdec->plane[p][y*h->fdec->i_stride[p]], sizeof(pixel), h->param.i_width, f );
    if( !CHROMA444 )
    {
        int cw = h->param.i_width>>1;
        int ch = h->param.i_height>>CHROMA_V_SHIFT;
        pixel *planeu = x264_malloc( (cw*ch*2+32)*sizeof(pixel) );
        pixel *planev = planeu + cw*ch + 16;
        h->mc.plane_copy_deinterleave( planeu, cw, planev, cw, h->fdec->plane[1], h->fdec->i_stride[1], cw, ch );
        fwrite( planeu, 1, cw*ch*sizeof(pixel), f );
        fwrite( planev, 1, cw*ch*sizeof(pixel), f );
        x264_free( planeu );
    }
    fclose( f );
}

/* Fill "default" values */
static void x264_slice_header_init( x264_t *h, x264_slice_header_t *sh,
                                    x264_sps_t *sps, x264_pps_t *pps,
                                    int i_idr_pic_id, int i_frame, int i_qp )
{
    x264_param_t *param = &h->param;

    /* First we fill all fields */
    sh->sps = sps;
    sh->pps = pps;

    sh->i_first_mb  = 0;
    sh->i_last_mb   = h->mb.i_mb_count - 1;
    sh->i_pps_id    = pps->i_id;

    sh->i_frame_num = i_frame;

    sh->b_mbaff = PARAM_INTERLACED;
    sh->b_field_pic = 0;    /* no field support for now */
    sh->b_bottom_field = 0; /* not yet used */

    sh->i_idr_pic_id = i_idr_pic_id;

    /* poc stuff, fixed later */
    sh->i_poc = 0;
    sh->i_delta_poc_bottom = 0;
    sh->i_delta_poc[0] = 0;
    sh->i_delta_poc[1] = 0;

    sh->i_redundant_pic_cnt = 0;

    h->mb.b_direct_auto_write = h->param.analyse.i_direct_mv_pred == X264_DIRECT_PRED_AUTO
                                && h->param.i_bframe
                                && ( h->param.rc.b_stat_write || !h->param.rc.b_stat_read );

    if( !h->mb.b_direct_auto_read && sh->i_type == SLICE_TYPE_B )
    {
        if( h->fref[1][0]->i_poc_l0ref0 == h->fref[0][0]->i_poc )
        {
            if( h->mb.b_direct_auto_write )
                sh->b_direct_spatial_mv_pred = ( h->stat.i_direct_score[1] > h->stat.i_direct_score[0] );
            else
                sh->b_direct_spatial_mv_pred = ( param->analyse.i_direct_mv_pred == X264_DIRECT_PRED_SPATIAL );
        }
        else
        {
            h->mb.b_direct_auto_write = 0;
            sh->b_direct_spatial_mv_pred = 1;
        }
    }
    /* else b_direct_spatial_mv_pred was read from the 2pass statsfile */

    sh->b_num_ref_idx_override = 0;
    sh->i_num_ref_idx_l0_active = 1;
    sh->i_num_ref_idx_l1_active = 1;

    sh->b_ref_pic_list_reordering[0] = h->b_ref_reorder[0];
    sh->b_ref_pic_list_reordering[1] = h->b_ref_reorder[1];

    /* If the ref list isn't in the default order, construct reordering header */
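    /* Example (illustrative): with pred_frame_num = 5 and a reference whose
     * frame_num is 3, diff = -2, so idc = 0 ("subtract") and arg = 1. */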
    for( int list = 0; list < 2; list++ )
    {
        if( sh->b_ref_pic_list_reordering[list] )
        {
            int pred_frame_num = i_frame;
            for( int i = 0; i < h->i_ref[list]; i++ )
            {
                int diff = h->fref[list][i]->i_frame_num - pred_frame_num;
                sh->ref_pic_list_order[list][i].idc = ( diff > 0 );
                sh->ref_pic_list_order[list][i].arg = (abs(diff) - 1) & ((1 << sps->i_log2_max_frame_num) - 1);
                pred_frame_num = h->fref[list][i]->i_frame_num;
            }
        }
    }

    sh->i_cabac_init_idc = param->i_cabac_init_idc;

    sh->i_qp = SPEC_QP(i_qp);
    sh->i_qp_delta = sh->i_qp - pps->i_pic_init_qp;
    sh->b_sp_for_swidth = 0;
    sh->i_qs_delta = 0;

    int deblock_thresh = i_qp + 2 * X264_MIN(param->i_deblocking_filter_alphac0, param->i_deblocking_filter_beta);
    /* If effective qp <= 15, deblocking would have no effect anyway */
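    /* e.g. (illustrative) i_qp = 20 with alphac0 = beta = -3 gives
     * deblock_thresh = 14 <= 15, so the check below sets idc = 1 (disabled) */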
    if( param->b_deblocking_filter && (h->mb.b_variable_qp || 15 < deblock_thresh ) )
        sh->i_disable_deblocking_filter_idc = param->b_sliced_threads ? 2 : 0;
    else
        sh->i_disable_deblocking_filter_idc = 1;
    sh->i_alpha_c0_offset = param->i_deblocking_filter_alphac0 << 1;
    sh->i_beta_offset = param->i_deblocking_filter_beta << 1;
}

static void x264_slice_header_write( bs_t *s, x264_slice_header_t *sh, int i_nal_ref_idc )
{
    if( sh->b_mbaff )
    {
        int first_x = sh->i_first_mb % sh->sps->i_mb_width;
        int first_y = sh->i_first_mb / sh->sps->i_mb_width;
        assert( (first_y&1) == 0 );
        bs_write_ue( s, (2*first_x + sh->sps->i_mb_width*(first_y&~1) + (first_y&1)) >> 1 );
    }
    else
        bs_write_ue( s, sh->i_first_mb );

    bs_write_ue( s, sh->i_type + 5 );   /* same type things */
    bs_write_ue( s, sh->i_pps_id );
    bs_write( s, sh->sps->i_log2_max_frame_num, sh->i_frame_num & ((1<<sh->sps->i_log2_max_frame_num)-1) );

    if( !sh->sps->b_frame_mbs_only )
    {
        bs_write1( s, sh->b_field_pic );
        if( sh->b_field_pic )
            bs_write1( s, sh->b_bottom_field );
    }

    if( sh->i_idr_pic_id >= 0 ) /* NAL IDR */
        bs_write_ue( s, sh->i_idr_pic_id );

    if( sh->sps->i_poc_type == 0 )
    {
        bs_write( s, sh->sps->i_log2_max_poc_lsb, sh->i_poc & ((1<<sh->sps->i_log2_max_poc_lsb)-1) );
        if( sh->pps->b_pic_order && !sh->b_field_pic )
            bs_write_se( s, sh->i_delta_poc_bottom );
    }

    if( sh->pps->b_redundant_pic_cnt )
        bs_write_ue( s, sh->i_redundant_pic_cnt );

    if( sh->i_type == SLICE_TYPE_B )
        bs_write1( s, sh->b_direct_spatial_mv_pred );

    if( sh->i_type == SLICE_TYPE_P || sh->i_type == SLICE_TYPE_B )
    {
        bs_write1( s, sh->b_num_ref_idx_override );
        if( sh->b_num_ref_idx_override )
        {
            bs_write_ue( s, sh->i_num_ref_idx_l0_active - 1 );
            if( sh->i_type == SLICE_TYPE_B )
                bs_write_ue( s, sh->i_num_ref_idx_l1_active - 1 );
        }
    }

    /* ref pic list reordering */
    if( sh->i_type != SLICE_TYPE_I )
    {
        bs_write1( s, sh->b_ref_pic_list_reordering[0] );
        if( sh->b_ref_pic_list_reordering[0] )
        {
            for( int i = 0; i < sh->i_num_ref_idx_l0_active; i++ )
            {
                bs_write_ue( s, sh->ref_pic_list_order[0][i].idc );
                bs_write_ue( s, sh->ref_pic_list_order[0][i].arg );
            }
            bs_write_ue( s, 3 );  /* reordering_of_pic_nums_idc = 3: end of reordering list */
        }
    }
    if( sh->i_type == SLICE_TYPE_B )
    {
        bs_write1( s, sh->b_ref_pic_list_reordering[1] );
        if( sh->b_ref_pic_list_reordering[1] )
        {
            for( int i = 0; i < sh->i_num_ref_idx_l1_active; i++ )
            {
                bs_write_ue( s, sh->ref_pic_list_order[1][i].idc );
                bs_write_ue( s, sh->ref_pic_list_order[1][i].arg );
            }
            bs_write_ue( s, 3 );  /* reordering_of_pic_nums_idc = 3: end of reordering list */
        }
    }

    if( sh->pps->b_weighted_pred && sh->i_type == SLICE_TYPE_P )
    {
        /* pred_weight_table() */
        bs_write_ue( s, sh->weight[0][0].i_denom );
        bs_write_ue( s, sh->weight[0][1].i_denom );
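        /* Illustrative note (not in the original source): with
         * luma_log2_weight_denom = 5, a luma weight of 1.5x is coded as
         * i_scale = 48, since 48 / (1 << 5) = 1.5. */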
        for( int i = 0; i < sh->i_num_ref_idx_l0_active; i++ )
        {
            int luma_weight_l0_flag = !!sh->weight[i][0].weightfn;
            int chroma_weight_l0_flag = !!sh->weight[i][1].weightfn || !!sh->weight[i][2].weightfn;
            bs_write1( s, luma_weight_l0_flag );
            if( luma_weight_l0_flag )
            {
                bs_write_se( s, sh->weight[i][0].i_scale );
                bs_write_se( s, sh->weight[i][0].i_offset );
            }
            bs_write1( s, chroma_weight_l0_flag );
            if( chroma_weight_l0_flag )
            {
                for( int j = 1; j < 3; j++ )
                {
                    bs_write_se( s, sh->weight[i][j].i_scale );
                    bs_write_se( s, sh->weight[i][j].i_offset );
                }
            }
        }
    }
    else if( sh->pps->b_weighted_bipred == 1 && sh->i_type == SLICE_TYPE_B )
    {
      /* TODO */
    }

    if( i_nal_ref_idc != 0 )
    {
        if( sh->i_idr_pic_id >= 0 )
        {
            bs_write1( s, 0 );  /* no output of prior pics flag */
            bs_write1( s, 0 );  /* long term reference flag */
        }
        else
        {
            bs_write1( s, sh->i_mmco_command_count > 0 ); /* adaptive_ref_pic_marking_mode_flag */
            if( sh->i_mmco_command_count > 0 )
            {
                for( int i = 0; i < sh->i_mmco_command_count; i++ )
                {
                    bs_write_ue( s, 1 ); /* mark short term ref as unused */
                    bs_write_ue( s, sh->mmco[i].i_difference_of_pic_nums - 1 );
                }
                bs_write_ue( s, 0 ); /* end command list */
            }
        }
    }

    if( sh->pps->b_cabac && sh->i_type != SLICE_TYPE_I )
        bs_write_ue( s, sh->i_cabac_init_idc );

    bs_write_se( s, sh->i_qp_delta );      /* slice qp delta */

    if( sh->pps->b_deblocking_filter_control )
    {
        bs_write_ue( s, sh->i_disable_deblocking_filter_idc );
        if( sh->i_disable_deblocking_filter_idc != 1 )
        {
            bs_write_se( s, sh->i_alpha_c0_offset >> 1 );
            bs_write_se( s, sh->i_beta_offset >> 1 );
        }
    }
}

/* If we are within a reasonable distance of the end of the memory allocated for the bitstream, */
/* reallocate, adding an arbitrary amount of space (100 kilobytes). */
static int x264_bitstream_check_buffer( x264_t *h )
{
    uint8_t *bs_bak = h->out.p_bitstream;
    int max_mb_size = 2500 << SLICE_MBAFF;
    if( (h->param.b_cabac && (h->cabac.p_end - h->cabac.p < max_mb_size)) ||
        (h->out.bs.p_end - h->out.bs.p < max_mb_size) )
    {
        h->out.i_bitstream += 100000;
        CHECKED_MALLOC( h->out.p_bitstream, h->out.i_bitstream );
        /* Copy the old buffer's contents; the size is rounded down to a 16-byte multiple for the aligned copy. */
        h->mc.memcpy_aligned( h->out.p_bitstream, bs_bak, (h->out.i_bitstream - 100000) & ~15 );
        intptr_t delta = h->out.p_bitstream - bs_bak;

        h->out.bs.p_start += delta;
        h->out.bs.p += delta;
        h->out.bs.p_end = h->out.p_bitstream + h->out.i_bitstream;

        h->cabac.p_start += delta;
        h->cabac.p += delta;
        h->cabac.p_end = h->out.p_bitstream + h->out.i_bitstream;

        /* NAL payload pointers point into p_bitstream, so rebase them as well. */
        for( int i = 0; i <= h->out.i_nal; i++ )
            h->out.nal[i].p_payload += delta;
        x264_free( bs_bak );
    }
    return 0;
fail:
    x264_free( bs_bak );
    return -1;
}

#if HAVE_THREAD
static void x264_encoder_thread_init( x264_t *h )
{
    if( h->param.i_sync_lookahead )
        x264_lower_thread_priority( 10 );

#if HAVE_MMX
    /* Misalign mask has to be set separately for each thread. */
    if( h->param.cpu&X264_CPU_SSE_MISALIGN )
        x264_cpu_mask_misalign_sse();
#endif
}
#endif

/****************************************************************************
 *
 ****************************************************************************
 ***************************** External API *********************************
 ****************************************************************************
 *
 ****************************************************************************/

static int x264_validate_parameters( x264_t *h, int b_open )
{
#if HAVE_MMX
#ifdef __SSE__
    if( b_open && !(x264_cpu_detect() & X264_CPU_SSE) )
    {
        x264_log( h, X264_LOG_ERROR, "your cpu does not support SSE1, but x264 was compiled with asm support\n");
#else
    if( b_open && !(x264_cpu_detect() & X264_CPU_MMX2) )
    {
        x264_log( h, X264_LOG_ERROR, "your cpu does not support MMXEXT, but x264 was compiled with asm support\n");
#endif
        x264_log( h, X264_LOG_ERROR, "to run x264, recompile without asm support (configure --disable-asm)\n");
        return -1;
    }
#endif

#if HAVE_INTERLACED
    h->param.b_interlaced = !!PARAM_INTERLACED;
#else
    if( h->param.b_interlaced )
    {
        x264_log( h, X264_LOG_ERROR, "not compiled with interlaced support\n" );
        return -1;
    }
#endif

    if( h->param.i_width <= 0 || h->param.i_height <= 0 )
    {
        x264_log( h, X264_LOG_ERROR, "invalid width x height (%dx%d)\n",
                  h->param.i_width, h->param.i_height );
        return -1;
    }

    int i_csp = h->param.i_csp & X264_CSP_MASK;
#if X264_CHROMA_FORMAT
    if( CHROMA_FORMAT != CHROMA_420 && i_csp >= X264_CSP_I420 && i_csp <= X264_CSP_NV12 )
    {
        x264_log( h, X264_LOG_ERROR, "not compiled with 4:2:0 support\n" );
        return -1;
    }
    else if( CHROMA_FORMAT != CHROMA_422 && i_csp >= X264_CSP_I422 && i_csp <= X264_CSP_NV16 )
    {
        x264_log( h, X264_LOG_ERROR, "not compiled with 4:2:2 support\n" );
        return -1;
    }
    else if( CHROMA_FORMAT != CHROMA_444 && i_csp >= X264_CSP_I444 && i_csp <= X264_CSP_RGB )
    {
        x264_log( h, X264_LOG_ERROR, "not compiled with 4:4:4 support\n" );
        return -1;
    }
#endif
    if( i_csp <= X264_CSP_NONE || i_csp >= X264_CSP_MAX )
    {
        x264_log( h, X264_LOG_ERROR, "invalid CSP (only I420/YV12/NV12/I422/YV16/NV16/I444/YV24/BGR/BGRA/RGB supported)\n" );
        return -1;
    }

    if( i_csp < X264_CSP_I444 && h->param.i_width % 2 )
    {
        x264_log( h, X264_LOG_ERROR, "width not divisible by 2 (%dx%d)\n",
                  h->param.i_width, h->param.i_height );
        return -1;
    }

    if( i_csp < X264_CSP_I422 && PARAM_INTERLACED && h->param.i_height % 4 )
    {
        x264_log( h, X264_LOG_ERROR, "height not divisible by 4 (%dx%d)\n",
                  h->param.i_width, h->param.i_height );
        return -1;
    }

    if( (i_csp < X264_CSP_I422 || PARAM_INTERLACED) && h->param.i_height % 2 )
    {
        x264_log( h, X264_LOG_ERROR, "height not divisible by 2 (%dx%d)\n",
                  h->param.i_width, h->param.i_height );
        return -1;
    }

    if( (h->param.crop_rect.i_left + h->param.crop_rect.i_right ) >= h->param.i_width ||
        (h->param.crop_rect.i_top  + h->param.crop_rect.i_bottom) >= h->param.i_height )
    {
        x264_log( h, X264_LOG_ERROR, "invalid crop-rect %u,%u,%u,%u\n", h->param.crop_rect.i_left,
                  h->param.crop_rect.i_top, h->param.crop_rect.i_right,  h->param.crop_rect.i_bottom );
        return -1;
    }

    if( h->param.i_threads == X264_THREADS_AUTO )
        h->param.i_threads = x264_cpu_num_processors() * (h->param.b_sliced_threads?2:3)/2;
    h->param.i_threads = x264_clip3( h->param.i_threads, 1, X264_THREAD_MAX );
    if( h->param.i_threads > 1 )
    {
#if !HAVE_THREAD
        x264_log( h, X264_LOG_WARNING, "not compiled with thread support!\n");
        h->param.i_threads = 1;
#endif
        /* Avoid absurdly small thread slices as they can reduce performance
         * and VBV compliance.  Capped at an arbitrary 4 rows per thread. */
        if( h->param.b_sliced_threads )
        {
            int max_threads = (h->param.i_height+15)/16 / 4;
            h->param.i_threads = X264_MIN( h->param.i_threads, max_threads );
        }
    }
    else
        h->param.b_sliced_threads = 0;
    h->i_thread_frames = h->param.b_sliced_threads ? 1 : h->param.i_threads;
    if( h->i_thread_frames > 1 )
        h->param.nalu_process = NULL;

    h->param.i_keyint_max = x264_clip3( h->param.i_keyint_max, 1, X264_KEYINT_MAX_INFINITE );
    if( h->param.i_keyint_max == 1 )
    {
        h->param.b_intra_refresh = 0;
        h->param.analyse.i_weighted_pred = 0;
    }

    h->param.i_frame_packing = x264_clip3( h->param.i_frame_packing, -1, 5 );

    /* Detect default ffmpeg settings and terminate with an error. */
    if( b_open )
    {
        int score = 0;
        score += h->param.analyse.i_me_range == 0;
        score += h->param.rc.i_qp_step == 3;
        score += h->param.i_keyint_max == 12;
        score += h->param.rc.i_qp_min == 2;
        score += h->param.rc.i_qp_max == 31;
        score += h->param.rc.f_qcompress == 0.5;
        score += fabs(h->param.rc.f_ip_factor - 1.25) < 0.01;
        score += fabs(h->param.rc.f_pb_factor - 1.25) < 0.01;
        score += h->param.analyse.inter == 0 && h->param.analyse.i_subpel_refine == 8;
        if( score >= 5 )
        {
            x264_log( h, X264_LOG_ERROR, "broken ffmpeg default settings detected\n" );
            x264_log( h, X264_LOG_ERROR, "use an encoding preset (e.g. -vpre medium)\n" );
            x264_log( h, X264_LOG_ERROR, "preset usage: -vpre <speed> -vpre <profile>\n" );
            x264_log( h, X264_LOG_ERROR, "speed presets are listed in x264 --help\n" );
            x264_log( h, X264_LOG_ERROR, "profile is optional; x264 defaults to high\n" );
            return -1;
        }
    }

    if( h->param.rc.i_rc_method < 0 || h->param.rc.i_rc_method > 2 )
    {
        x264_log( h, X264_LOG_ERROR, "no ratecontrol method specified\n" );
        return -1;
    }
    h->param.rc.f_rf_constant = x264_clip3f( h->param.rc.f_rf_constant, -QP_BD_OFFSET, 51 );
    h->param.rc.f_rf_constant_max = x264_clip3f( h->param.rc.f_rf_constant_max, -QP_BD_OFFSET, 51 );
    h->param.rc.i_qp_constant = x264_clip3( h->param.rc.i_qp_constant, 0, QP_MAX );
    h->param.analyse.i_subpel_refine = x264_clip3( h->param.analyse.i_subpel_refine, 0, 11 );
    h->param.rc.f_ip_factor = X264_MAX( h->param.rc.f_ip_factor, 0.01f );
    h->param.rc.f_pb_factor = X264_MAX( h->param.rc.f_pb_factor, 0.01f );
    if( h->param.rc.i_rc_method == X264_RC_CRF )
    {
        h->param.rc.i_qp_constant = h->param.rc.f_rf_constant + QP_BD_OFFSET;
        h->param.rc.i_bitrate = 0;
    }
    if( (h->param.rc.i_rc_method == X264_RC_CQP || h->param.rc.i_rc_method == X264_RC_CRF)
        && h->param.rc.i_qp_constant == 0 )
    {
        h->mb.b_lossless = 1;
        h->param.i_cqm_preset = X264_CQM_FLAT;
        h->param.psz_cqm_file = NULL;
        h->param.rc.i_rc_method = X264_RC_CQP;
        h->param.rc.f_ip_factor = 1;
        h->param.rc.f_pb_factor = 1;
        h->param.analyse.b_psnr = 0;
        h->param.analyse.b_ssim = 0;
        h->param.analyse.i_chroma_qp_offset = 0;
        h->param.analyse.i_trellis = 0;
        h->param.analyse.b_fast_pskip = 0;
        h->param.analyse.i_noise_reduction = 0;
        h->param.analyse.b_psy = 0;
        h->param.i_bframe = 0;
        /* 8x8dct is not useful without RD in CAVLC lossless */
        if( !h->param.b_cabac && h->param.analyse.i_subpel_refine < 6 )
            h->param.analyse.b_transform_8x8 = 0;
    }
    if( h->param.rc.i_rc_method == X264_RC_CQP )
    {
        float qp_p = h->param.rc.i_qp_constant;
        float qp_i = qp_p - 6*log2f( h->param.rc.f_ip_factor );
        float qp_b = qp_p + 6*log2f( h->param.rc.f_pb_factor );
        h->param.rc.i_qp_min = x264_clip3( (int)(X264_MIN3( qp_p, qp_i, qp_b )), 0, QP_MAX );
        h->param.rc.i_qp_max = x264_clip3( (int)(X264_MAX3( qp_p, qp_i, qp_b ) + .999), 0, QP_MAX );
        h->param.rc.i_aq_mode = 0;
        h->param.rc.b_mb_tree = 0;
        h->param.rc.i_bitrate = 0;
    }
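    /* Worked example (not in the original source): with i_qp_constant = 23,
     * f_ip_factor = 1.4 and f_pb_factor = 1.3, qp_i = 23 - 6*log2(1.4) ~= 20.1
     * and qp_b = 23 + 6*log2(1.3) ~= 25.3, so the clip above yields
     * i_qp_min = 20 and i_qp_max = 26. */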
    h->param.rc.i_qp_max = x264_clip3( h->param.rc.i_qp_max, 0, QP_MAX );
    h->param.rc.i_qp_min = x264_clip3( h->param.rc.i_qp_min, 0, h->param.rc.i_qp_max );
    h->param.rc.i_qp_step = x264_clip3( h->param.rc.i_qp_step, 0, QP_MAX );
    h->param.rc.i_bitrate = x264_clip3( h->param.rc.i_bitrate, 0, 2000000 );
    h->param.rc.i_vbv_buffer_size = x264_clip3( h->param.rc.i_vbv_buffer_size, 0, 2000000 );
    h->param.rc.i_vbv_max_bitrate = x264_clip3( h->param.rc.i_vbv_max_bitrate, 0, 2000000 );
    h->param.rc.f_vbv_buffer_init = x264_clip3f( h->param.rc.f_vbv_buffer_init, 0, 2000000 );
    if( h->param.rc.i_vbv_buffer_size )
    {
        if( h->param.rc.i_rc_method == X264_RC_CQP )
        {
            x264_log( h, X264_LOG_WARNING, "VBV is incompatible with constant QP, ignored.\n" );
            h->param.rc.i_vbv_max_bitrate = 0;
            h->param.rc.i_vbv_buffer_size = 0;
        }
        else if( h->param.rc.i_vbv_max_bitrate == 0 )
        {
            if( h->param.rc.i_rc_method == X264_RC_ABR )
            {
                x264_log( h, X264_LOG_WARNING, "VBV maxrate unspecified, assuming CBR\n" );
                h->param.rc.i_vbv_max_bitrate = h->param.rc.i_bitrate;
            }
            else
            {
                x264_log( h, X264_LOG_WARNING, "VBV bufsize set but maxrate unspecified, ignored\n" );
                h->param.rc.i_vbv_buffer_size = 0;
            }
        }
        else if( h->param.rc.i_vbv_max_bitrate < h->param.rc.i_bitrate &&
                 h->param.rc.i_rc_method == X264_RC_ABR )
        {
            x264_log( h, X264_LOG_WARNING, "max bitrate less than average bitrate, assuming CBR\n" );
            h->param.rc.i_vbv_max_bitrate = h->param.rc.i_bitrate;
        }
    }
    else if( h->param.rc.i_vbv_max_bitrate )
    {
        x264_log( h, X264_LOG_WARNING, "VBV maxrate specified, but no bufsize, ignored\n" );
        h->param.rc.i_vbv_max_bitrate = 0;
    }

    h->param.i_slice_max_size = X264_MAX( h->param.i_slice_max_size, 0 );
    h->param.i_slice_max_mbs = X264_MAX( h->param.i_slice_max_mbs, 0 );

    int max_slices = (h->param.i_height+((16<<PARAM_INTERLACED)-1))/(16<<PARAM_INTERLACED);
    if( h->param.b_sliced_threads )
        h->param.i_slice_count = x264_clip3( h->param.i_threads, 0, max_slices );
    else
    {
        h->param.i_slice_count = x264_clip3( h->param.i_slice_count, 0, max_slices );
        if( h->param.i_slice_max_mbs || h->param.i_slice_max_size )
            h->param.i_slice_count = 0;
    }

    if( h->param.b_bluray_compat )
    {
        h->param.i_bframe_pyramid = X264_MIN( X264_B_PYRAMID_STRICT, h->param.i_bframe_pyramid );
        h->param.i_bframe = X264_MIN( h->param.i_bframe, 3 );
        h->param.b_aud = 1;
        h->param.i_nal_hrd = X264_MAX( h->param.i_nal_hrd, X264_NAL_HRD_VBR );
        h->param.i_slice_max_size = 0;
        h->param.i_slice_max_mbs = 0;
        h->param.b_intra_refresh = 0;
        h->param.i_frame_reference = X264_MIN( h->param.i_frame_reference, 6 );
        h->param.i_dpb_size = X264_MIN( h->param.i_dpb_size, 6 );
        /* Due to the proliferation of broken players that don't handle dupes properly. */
        h->param.analyse.i_weighted_pred = X264_MIN( h->param.analyse.i_weighted_pred, X264_WEIGHTP_SIMPLE );
        if( h->param.b_fake_interlaced )
            h->param.b_pic_struct = 1;
    }

    h->param.i_frame_reference = x264_clip3( h->param.i_frame_reference, 1, X264_REF_MAX );
    h->param.i_dpb_size = x264_clip3( h->param.i_dpb_size, 1, X264_REF_MAX );
    if( h->param.i_scenecut_threshold < 0 )
        h->param.i_scenecut_threshold = 0;
    h->param.analyse.i_direct_mv_pred = x264_clip3( h->param.analyse.i_direct_mv_pred, X264_DIRECT_PRED_NONE, X264_DIRECT_PRED_AUTO );
    if( !h->param.analyse.i_subpel_refine && h->param.analyse.i_direct_mv_pred > X264_DIRECT_PRED_SPATIAL )
    {
        x264_log( h, X264_LOG_WARNING, "subme=0 + direct=temporal is not supported\n" );
        h->param.analyse.i_direct_mv_pred = X264_DIRECT_PRED_SPATIAL;
    }
    h->param.i_bframe = x264_clip3( h->param.i_bframe, 0, X264_MIN( X264_BFRAME_MAX, h->param.i_keyint_max-1 ) );
    h->param.i_bframe_bias = x264_clip3( h->param.i_bframe_bias, -90, 100 );
    if( h->param.i_bframe <= 1 )
        h->param.i_bframe_pyramid = X264_B_PYRAMID_NONE;
    h->param.i_bframe_pyramid = x264_clip3( h->param.i_bframe_pyramid, X264_B_PYRAMID_NONE, X264_B_PYRAMID_NORMAL );
    h->param.i_bframe_adaptive = x264_clip3( h->param.i_bframe_adaptive, X264_B_ADAPT_NONE, X264_B_ADAPT_TRELLIS );
    if( !h->param.i_bframe )
    {
        h->param.i_bframe_adaptive = X264_B_ADAPT_NONE;
        h->param.analyse.i_direct_mv_pred = 0;
        h->param.analyse.b_weighted_bipred = 0;
        h->param.b_open_gop = 0;
    }
    if( h->param.b_intra_refresh && h->param.i_bframe_pyramid == X264_B_PYRAMID_NORMAL )
    {
        x264_log( h, X264_LOG_WARNING, "b-pyramid normal + intra-refresh is not supported\n" );
        h->param.i_bframe_pyramid = X264_B_PYRAMID_STRICT;
    }
    if( h->param.b_intra_refresh && (h->param.i_frame_reference > 1 || h->param.i_dpb_size > 1) )
    {
        x264_log( h, X264_LOG_WARNING, "ref > 1 + intra-refresh is not supported\n" );
        h->param.i_frame_reference = 1;
        h->param.i_dpb_size = 1;
    }
    if( h->param.b_intra_refresh && h->param.b_open_gop )
    {
        x264_log( h, X264_LOG_WARNING, "intra-refresh is not compatible with open-gop\n" );
        h->param.b_open_gop = 0;
    }
    float fps = h->param.i_fps_num > 0 && h->param.i_fps_den > 0 ? (float) h->param.i_fps_num / h->param.i_fps_den : 25.0;
    if( h->param.i_keyint_min == X264_KEYINT_MIN_AUTO )
        h->param.i_keyint_min = X264_MIN( h->param.i_keyint_max / 10, fps );
    h->param.i_keyint_min = x264_clip3( h->param.i_keyint_min, 1, h->param.i_keyint_max/2+1 );
    h->param.rc.i_lookahead = x264_clip3( h->param.rc.i_lookahead, 0, X264_LOOKAHEAD_MAX );
    {
        int maxrate = X264_MAX( h->param.rc.i_vbv_max_bitrate, h->param.rc.i_bitrate );
        float bufsize = maxrate ? (float)h->param.rc.i_vbv_buffer_size / maxrate : 0;
        h->param.rc.i_lookahead = X264_MIN( h->param.rc.i_lookahead, X264_MAX( h->param.i_keyint_max, bufsize*fps ) );
    }
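    /* Worked example (not in the original source): a 5000 kbit/s maxrate with
     * a 10000 kbit buffer gives bufsize = 2 seconds; at 25 fps with keyint 40,
     * the lookahead is capped at max(40, 2*25) = 50 frames. */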

    if( !h->param.i_timebase_num || !h->param.i_timebase_den || !(h->param.b_vfr_input || h->param.b_pulldown) )
    {
        h->param.i_timebase_num = h->param.i_fps_den;
        h->param.i_timebase_den = h->param.i_fps_num;
    }

    h->param.rc.f_qcompress = x264_clip3f( h->param.rc.f_qcompress, 0.0, 1.0 );
    if( h->param.i_keyint_max == 1 || h->param.rc.f_qcompress == 1 )
        h->param.rc.b_mb_tree = 0;
    if( (!h->param.b_intra_refresh && h->param.i_keyint_max != X264_KEYINT_MAX_INFINITE) &&
        !h->param.rc.i_lookahead && h->param.rc.b_mb_tree )
    {
        x264_log( h, X264_LOG_WARNING, "lookaheadless mb-tree requires intra refresh or infinite keyint\n" );
        h->param.rc.b_mb_tree = 0;
    }
    if( h->param.rc.b_stat_read )
        h->param.rc.i_lookahead = 0;
#if HAVE_THREAD
    if( h->param.i_sync_lookahead < 0 )
        h->param.i_sync_lookahead = h->param.i_bframe + 1;
    h->param.i_sync_lookahead = X264_MIN( h->param.i_sync_lookahead, X264_LOOKAHEAD_MAX );
    if( h->param.rc.b_stat_read || h->i_thread_frames == 1 )
        h->param.i_sync_lookahead = 0;
#else
    h->param.i_sync_lookahead = 0;
#endif

    h->param.i_deblocking_filter_alphac0 = x264_clip3( h->param.i_deblocking_filter_alphac0, -6, 6 );
    h->param.i_deblocking_filter_beta    = x264_clip3( h->param.i_deblocking_filter_beta, -6, 6 );
    h->param.analyse.i_luma_deadzone[0] = x264_clip3( h->param.analyse.i_luma_deadzone[0], 0, 32 );
    h->param.analyse.i_luma_deadzone[1] = x264_clip3( h->param.analyse.i_luma_deadzone[1], 0, 32 );

    h->param.i_cabac_init_idc = x264_clip3( h->param.i_cabac_init_idc, 0, 2 );

    if( h->param.i_cqm_preset < X264_CQM_FLAT || h->param.i_cqm_preset > X264_CQM_CUSTOM )
        h->param.i_cqm_preset = X264_CQM_FLAT;

    if( h->param.analyse.i_me_method < X264_ME_DIA ||
        h->param.analyse.i_me_method > X264_ME_TESA )
        h->param.analyse.i_me_method = X264_ME_HEX;
    h->param.analyse.i_me_range = x264_clip3( h->param.analyse.i_me_range, 4, 1024 );
    if( h->param.analyse.i_me_range > 16 && h->param.analyse.i_me_method <= X264_ME_HEX )
        h->param.analyse.i_me_range = 16;
    if( h->param.analyse.i_me_method == X264_ME_TESA &&
        (h->mb.b_lossless || h->param.analyse.i_subpel_refine <= 1) )
        h->param.analyse.i_me_method = X264_ME_ESA;
    h->param.analyse.b_mixed_references = h->param.analyse.b_mixed_references && h->param.i_frame_reference > 1;
    h->param.analyse.inter &= X264_ANALYSE_PSUB16x16|X264_ANALYSE_PSUB8x8|X264_ANALYSE_BSUB16x16|
                              X264_ANALYSE_I4x4|X264_ANALYSE_I8x8;
    h->param.analyse.intra &= X264_ANALYSE_I4x4|X264_ANALYSE_I8x8;
    if( !(h->param.analyse.inter & X264_ANALYSE_PSUB16x16) )
        h->param.analyse.inter &= ~X264_ANALYSE_PSUB8x8;
    if( !h->param.analyse.b_transform_8x8 )
    {
        h->param.analyse.inter &= ~X264_ANALYSE_I8x8;
        h->param.analyse.intra &= ~X264_ANALYSE_I8x8;
    }
    h->param.analyse.i_trellis = x264_clip3( h->param.analyse.i_trellis, 0, 2 );
    h->param.rc.i_aq_mode = x264_clip3( h->param.rc.i_aq_mode, 0, 2 );
    h->param.rc.f_aq_strength = x264_clip3f( h->param.rc.f_aq_strength, 0, 3 );
    if( h->param.rc.f_aq_strength == 0 )
        h->param.rc.i_aq_mode = 0;

    if( h->param.i_log_level < X264_LOG_INFO )
    {
        h->param.analyse.b_psnr = 0;
        h->param.analyse.b_ssim = 0;
    }
    /* Warn users trying to measure PSNR/SSIM with psy opts on. */
    if( b_open && (h->param.analyse.b_psnr || h->param.analyse.b_ssim) )
    {
        char *s = NULL;

        if( h->param.analyse.b_psy )
        {
            s = h->param.analyse.b_psnr ? "psnr" : "ssim";
            x264_log( h, X264_LOG_WARNING, "--%s used with psy on: results will be invalid!\n", s );
        }
        else if( !h->param.rc.i_aq_mode && h->param.analyse.b_ssim )
        {
            x264_log( h, X264_LOG_WARNING, "--ssim used with AQ off: results will be invalid!\n" );
            s = "ssim";
        }
        else if(  h->param.rc.i_aq_mode && h->param.analyse.b_psnr )
        {
            x264_log( h, X264_LOG_WARNING, "--psnr used with AQ on: results will be invalid!\n" );
            s = "psnr";
        }
        if( s )
            x264_log( h, X264_LOG_WARNING, "--tune %s should be used if attempting to benchmark %s!\n", s, s );
    }

    if( !h->param.analyse.b_psy )
    {
        h->param.analyse.f_psy_rd = 0;
        h->param.analyse.f_psy_trellis = 0;
    }
    h->param.analyse.f_psy_rd = x264_clip3f( h->param.analyse.f_psy_rd, 0, 10 );
    h->param.analyse.f_psy_trellis = x264_clip3f( h->param.analyse.f_psy_trellis, 0, 10 );
    h->mb.i_psy_rd = h->param.analyse.i_subpel_refine >= 6 ? FIX8( h->param.analyse.f_psy_rd ) : 0;
    h->mb.i_psy_trellis = h->param.analyse.i_trellis ? FIX8( h->param.analyse.f_psy_trellis / 4 ) : 0;
    h->param.analyse.i_chroma_qp_offset = x264_clip3(h->param.analyse.i_chroma_qp_offset, -32, 32);
    /* In 4:4:4 mode, chroma has four times the samples it has in 4:2:0, so we can halve its quality (+6 QP doubles the quantizer step). */
    if( b_open && i_csp >= X264_CSP_I444 && i_csp < X264_CSP_BGR && h->param.analyse.b_psy )
        h->param.analyse.i_chroma_qp_offset += 6;
    /* Psy RDO increases overall quantizers to improve the quality of luma; this
     * indirectly hurts chroma quality, so we lower the chroma QP offset to compensate. */
    if( b_open && h->mb.i_psy_rd )
        h->param.analyse.i_chroma_qp_offset -= h->param.analyse.f_psy_rd < 0.25 ? 1 : 2;
    /* Psy trellis has a similar effect. */
    if( b_open && h->mb.i_psy_trellis )
        h->param.analyse.i_chroma_qp_offset -= h->param.analyse.f_psy_trellis < 0.25 ? 1 : 2;
    h->param.analyse.i_chroma_qp_offset = x264_clip3(h->param.analyse.i_chroma_qp_offset, -12, 12);
    /* MB-tree requires AQ to be on, even if the strength is zero. */
    if( !h->param.rc.i_aq_mode && h->param.rc.b_mb_tree )
    {
        h->param.rc.i_aq_mode = 1;
        h->param.rc.f_aq_strength = 0;
    }
    h->param.analyse.i_noise_reduction = x264_clip3( h->param.analyse.i_noise_reduction, 0, 1<<16 );
    if( h->param.analyse.i_subpel_refine >= 10 && (h->param.analyse.i_trellis != 2 || !h->param.rc.i_aq_mode) )
        h->param.analyse.i_subpel_refine = 9;

    {
        const x264_level_t *l = x264_levels;
        if( h->param.i_level_idc < 0 )
        {
            int maxrate_bak = h->param.rc.i_vbv_max_bitrate;
            if( h->param.rc.i_rc_method == X264_RC_ABR && h->param.rc.i_vbv_buffer_size <= 0 )
                h->param.rc.i_vbv_max_bitrate = h->param.rc.i_bitrate * 2;
            x264_sps_init( h->sps, h->param.i_sps_id, &h->param );
            /* Pick the lowest level that passes validation. */
            do h->param.i_level_idc = l->level_idc;
                while( l[1].level_idc && x264_validate_levels( h, 0 ) && l++ );
            h->param.rc.i_vbv_max_bitrate = maxrate_bak;
        }
        else
        {
            while( l->level_idc && l->level_idc != h->param.i_level_idc )
                l++;
            if( l->level_idc == 0 )
            {
                x264_log( h, X264_LOG_ERROR, "invalid level_idc: %d\n", h->param.i_level_idc );
                return -1;
            }
        }
        if( h->param.analyse.i_mv_range <= 0 )
            h->param.analyse.i_mv_range = l->mv_range >> PARAM_INTERLACED;
        else
            h->param.analyse.i_mv_range = x264_clip3(h->param.analyse.i_mv_range, 32, 512 >> PARAM_INTERLACED);
    }

    h->param.analyse.i_weighted_pred = x264_clip3( h->param.analyse.i_weighted_pred, X264_WEIGHTP_NONE, X264_WEIGHTP_SMART );

    if( PARAM_INTERLACED )
    {
        if( h->param.analyse.i_me_method >= X264_ME_ESA )
        {
            x264_log( h, X264_LOG_WARNING, "interlace + me=esa is not implemented\n" );
            h->param.analyse.i_me_method = X264_ME_UMH;
        }
        if( h->param.analyse.i_weighted_pred > 0 )
        {
            x264_log( h, X264_LOG_WARNING, "interlace + weightp is not implemented\n" );
            h->param.analyse.i_weighted_pred = X264_WEIGHTP_NONE;
        }
    }

    if( !h->param.analyse.i_weighted_pred && h->param.rc.b_mb_tree && h->param.analyse.b_psy )
        h->param.analyse.i_weighted_pred = X264_WEIGHTP_FAKE;

    if( h->i_thread_frames > 1 )
    {
        int r = h->param.analyse.i_mv_range_thread;
        int r2;
        if( r <= 0 )
        {
            // half of the available space is reserved and divided evenly among the threads,
            // the rest is allocated to whichever thread is far enough ahead to use it.
            // reserving more space increases quality for some videos, but costs more time
            // in thread synchronization.
            int max_range = (h->param.i_height + X264_THREAD_HEIGHT) / h->i_thread_frames - X264_THREAD_HEIGHT;
            r = max_range / 2;
        }
        r = X264_MAX( r, h->param.analyse.i_me_range );
        r = X264_MIN( r, h->param.analyse.i_mv_range );
        // round up to use the whole mb row
        r2 = (r & ~15) + ((-X264_THREAD_HEIGHT) & 15);
        if( r2 < r )
            r2 += 16;
        x264_log( h, X264_LOG_DEBUG, "using mv_range_thread = %d\n", r2 );
        h->param.analyse.i_mv_range_thread = r2;
    }
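    /* Worked example (not in the original source, assuming X264_THREAD_HEIGHT
     * is 24 as elsewhere in this tree): 1080 rows across 4 frame threads gives
     * max_range = (1080+24)/4 - 24 = 252, so r = 126 (if me_range <= 126 <=
     * mv_range); r2 = (126 & ~15) + ((-24) & 15) = 120, and since 120 < 126
     * it is bumped to 136. */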

    if( h->param.rc.f_rate_tolerance < 0 )
        h->param.rc.f_rate_tolerance = 0;
    if( h->param.rc.f_qblur < 0 )
        h->param.rc.f_qblur = 0;
    if( h->param.rc.f_complexity_blur < 0 )
        h->param.rc.f_complexity_blur = 0;

    h->param.i_sps_id &= 31;

    if( PARAM_INTERLACED )
        h->param.b_pic_struct = 1;

    h->param.i_nal_hrd = x264_clip3( h->param.i_nal_hrd, X264_NAL_HRD_NONE, X264_NAL_HRD_CBR );

    if( h->param.i_nal_hrd && !h->param.rc.i_vbv_buffer_size )
    {
        x264_log( h, X264_LOG_WARNING, "NAL HRD parameters require VBV parameters\n" );
        h->param.i_nal_hrd = X264_NAL_HRD_NONE;
    }

    if( h->param.i_nal_hrd == X264_NAL_HRD_CBR &&
       (h->param.rc.i_bitrate != h->param.rc.i_vbv_max_bitrate || !h->param.rc.i_vbv_max_bitrate) )
    {
        x264_log( h, X264_LOG_WARNING, "CBR HRD requires constant bitrate\n" );
        h->param.i_nal_hrd = X264_NAL_HRD_VBR;
    }

    /* ensure the booleans are 0 or 1 so they can be used in math */
#define BOOLIFY(x) h->param.x = !!h->param.x
    BOOLIFY( b_cabac );
    BOOLIFY( b_constrained_intra );
    BOOLIFY( b_deblocking_filter );
    BOOLIFY( b_deterministic );
    BOOLIFY( b_sliced_threads );
    BOOLIFY( b_interlaced );
    BOOLIFY( b_intra_refresh );
    BOOLIFY( b_visualize );
    BOOLIFY( b_aud );
    BOOLIFY( b_repeat_headers );
    BOOLIFY( b_annexb );
    BOOLIFY( b_vfr_input );
    BOOLIFY( b_pulldown );
    BOOLIFY( b_tff );
    BOOLIFY( b_pic_struct );
    BOOLIFY( b_fake_interlaced );
    BOOLIFY( b_open_gop );
    BOOLIFY( b_bluray_compat );
    BOOLIFY( analyse.b_transform_8x8 );
    BOOLIFY( analyse.b_weighted_bipred );
    BOOLIFY( analyse.b_chroma_me );
    BOOLIFY( analyse.b_mixed_references );
    BOOLIFY( analyse.b_fast_pskip );
    BOOLIFY( analyse.b_dct_decimate );
    BOOLIFY( analyse.b_psy );
    BOOLIFY( analyse.b_psnr );
    BOOLIFY( analyse.b_ssim );
    BOOLIFY( rc.b_stat_write );
    BOOLIFY( rc.b_stat_read );
    BOOLIFY( rc.b_mb_tree );
#undef BOOLIFY

    return 0;
}

static void mbcmp_init( x264_t *h )
{
    int satd = !h->mb.b_lossless && h->param.analyse.i_subpel_refine > 1;
    memcpy( h->pixf.mbcmp, satd ? h->pixf.satd : h->pixf.sad_aligned, sizeof(h->pixf.mbcmp) );
    memcpy( h->pixf.mbcmp_unaligned, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.mbcmp_unaligned) );
    h->pixf.intra_mbcmp_x3_16x16 = satd ? h->pixf.intra_satd_x3_16x16 : h->pixf.intra_sad_x3_16x16;
    h->pixf.intra_mbcmp_x3_8x16c = satd ? h->pixf.intra_satd_x3_8x16c : h->pixf.intra_sad_x3_8x16c;
    h->pixf.intra_mbcmp_x3_8x8c  = satd ? h->pixf.intra_satd_x3_8x8c  : h->pixf.intra_sad_x3_8x8c;
    h->pixf.intra_mbcmp_x3_8x8 = satd ? h->pixf.intra_sa8d_x3_8x8 : h->pixf.intra_sad_x3_8x8;
    h->pixf.intra_mbcmp_x3_4x4 = satd ? h->pixf.intra_satd_x3_4x4 : h->pixf.intra_sad_x3_4x4;
    h->pixf.intra_mbcmp_x9_4x4 = h->param.b_cpu_independent || h->mb.b_lossless ? NULL
                               : satd ? h->pixf.intra_satd_x9_4x4 : h->pixf.intra_sad_x9_4x4;
    h->pixf.intra_mbcmp_x9_8x8 = h->param.b_cpu_independent || h->mb.b_lossless ? NULL
                               : satd ? h->pixf.intra_sa8d_x9_8x8 : h->pixf.intra_sad_x9_8x8;
    satd &= h->param.analyse.i_me_method == X264_ME_TESA;
    memcpy( h->pixf.fpelcmp, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.fpelcmp) );
    memcpy( h->pixf.fpelcmp_x3, satd ? h->pixf.satd_x3 : h->pixf.sad_x3, sizeof(h->pixf.fpelcmp_x3) );
    memcpy( h->pixf.fpelcmp_x4, satd ? h->pixf.satd_x4 : h->pixf.sad_x4, sizeof(h->pixf.fpelcmp_x4) );
}
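
/* Note (not part of the original source): mbcmp/mbcmp_unaligned feed mode
 * decision, while fpelcmp feeds fullpel motion search; per the satd &= check
 * above, only me=tesa uses SATD for the latter, since an exhaustive search
 * can afford the more expensive metric. */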

static void chroma_dsp_init( x264_t *h )
{
    memcpy( h->luma2chroma_pixel, x264_luma2chroma_pixel[CHROMA_FORMAT], sizeof(h->luma2chroma_pixel) );

    switch( CHROMA_FORMAT )
    {
        case CHROMA_420:
            memcpy( h->predict_chroma, h->predict_8x8c, sizeof(h->predict_chroma) );
            h->mc.prefetch_fenc = h->mc.prefetch_fenc_420;
            h->loopf.deblock_chroma[0] = h->loopf.deblock_h_chroma_420;
            h->loopf.deblock_chroma_intra[0] = h->loopf.deblock_h_chroma_420_intra;
            h->loopf.deblock_chroma_mbaff = h->loopf.deblock_chroma_420_mbaff;
            h->loopf.deblock_chroma_intra_mbaff = h->loopf.deblock_chroma_420_intra_mbaff;
            h->pixf.intra_mbcmp_x3_chroma = h->pixf.intra_mbcmp_x3_8x8c;
            h->quantf.coeff_last[DCT_CHROMA_DC] = h->quantf.coeff_last4;
            h->quantf.coeff_level_run[DCT_CHROMA_DC] = h->quantf.coeff_level_run4;
            break;
        case CHROMA_422:
            memcpy( h->predict_chroma, h->predict_8x16c, sizeof(h->predict_chroma) );
            h->mc.prefetch_fenc = h->mc.prefetch_fenc_422;
            h->loopf.deblock_chroma[0] = h->loopf.deblock_h_chroma_422;
            h->loopf.deblock_chroma_intra[0] = h->loopf.deblock_h_chroma_422_intra;
            h->loopf.deblock_chroma_mbaff = h->loopf.deblock_chroma_422_mbaff;
            h->loopf.deblock_chroma_intra_mbaff = h->loopf.deblock_chroma_422_intra_mbaff;
            h->pixf.intra_mbcmp_x3_chroma = h->pixf.intra_mbcmp_x3_8x16c;
            h->quantf.coeff_last[DCT_CHROMA_DC] = h->quantf.coeff_last8;
            h->quantf.coeff_level_run[DCT_CHROMA_DC] = h->quantf.coeff_level_run8;
            break;
        case CHROMA_444:
            h->mc.prefetch_fenc = h->mc.prefetch_fenc_422; /* FIXME: doesn't cover V plane */
            h->loopf.deblock_chroma_mbaff = h->loopf.deblock_luma_mbaff;
            h->loopf.deblock_chroma_intra_mbaff = h->loopf.deblock_luma_intra_mbaff;
            break;
    }
}

static void x264_set_aspect_ratio( x264_t *h, x264_param_t *param, int initial )
{
    /* VUI */
    if( param->vui.i_sar_width > 0 && param->vui.i_sar_height > 0 )
    {
        uint32_t i_w = param->vui.i_sar_width;
        uint32_t i_h = param->vui.i_sar_height;
        uint32_t old_w = h->param.vui.i_sar_width;
        uint32_t old_h = h->param.vui.i_sar_height;

        x264_reduce_fraction( &i_w, &i_h );

        /* The VUI extended-SAR fields (sar_width/sar_height) are 16-bit, so halve until both fit. */
        while( i_w > 65535 || i_h > 65535 )
        {
            i_w /= 2;
            i_h /= 2;
        }

        x264_reduce_fraction( &i_w, &i_h );

        if( i_w != old_w || i_h != old_h || initial )
        {
            h->param.vui.i_sar_width = 0;
            h->param.vui.i_sar_height = 0;
            if( i_w == 0 || i_h == 0 )
                x264_log( h, X264_LOG_WARNING, "cannot create valid sample aspect ratio\n" );
            else
            {
                x264_log( h, initial?X264_LOG_INFO:X264_LOG_DEBUG, "using SAR=%d/%d\n", i_w, i_h );
                h->param.vui.i_sar_width = i_w;
                h->param.vui.i_sar_height = i_h;
            }
            x264_sps_init( h->sps, h->param.i_sps_id, &h->param );
        }
    }
}

/****************************************************************************
 * x264_encoder_open:
 ****************************************************************************/
x264_t *x264_encoder_open( x264_param_t *param )
{
    x264_t *h;
    char buf[1000], *p;
    int qp, i_slicetype_length;

    CHECKED_MALLOCZERO( h, sizeof(x264_t) );

    /* Create a copy of param */
    memcpy( &h->param, param, sizeof(x264_param_t) );

    if( param->param_free )
        param->param_free( param );

    if( x264_threading_init() )
    {
        x264_log( h, X264_LOG_ERROR, "unable to initialize threading\n" );
        goto fail;
    }

    if( x264_validate_parameters( h, 1 ) < 0 )
        goto fail;

    if( h->param.psz_cqm_file )
        if( x264_cqm_parse_file( h, h->param.psz_cqm_file ) < 0 )
            goto fail;

    if( h->param.rc.psz_stat_out )
        h->param.rc.psz_stat_out = strdup( h->param.rc.psz_stat_out );
    if( h->param.rc.psz_stat_in )
        h->param.rc.psz_stat_in = strdup( h->param.rc.psz_stat_in );

    x264_reduce_fraction( &h->param.i_fps_num, &h->param.i_fps_den );
    x264_reduce_fraction( &h->param.i_timebase_num, &h->param.i_timebase_den );

    /* Init x264_t */
    h->i_frame = -1;
    h->i_frame_num = 0;
    h->i_idr_pic_id = 0;

    if( (uint64_t)h->param.i_timebase_den * 2 > UINT32_MAX )
    {
        x264_log( h, X264_LOG_ERROR, "Effective timebase denominator %u exceeds H.264 maximum\n", h->param.i_timebase_den );
        goto fail;
    }

    x264_sps_init( h->sps, h->param.i_sps_id, &h->param );
    x264_pps_init( h->pps, h->param.i_sps_id, &h->param, h->sps );

    x264_set_aspect_ratio( h, &h->param, 1 );

    x264_validate_levels( h, 1 );

    h->chroma_qp_table = i_chroma_qp_table + 12 + h->pps->i_chroma_qp_index_offset;

    if( x264_cqm_init( h ) < 0 )
        goto fail;

    h->mb.i_mb_width = h->sps->i_mb_width;
    h->mb.i_mb_height = h->sps->i_mb_height;
    h->mb.i_mb_count = h->mb.i_mb_width * h->mb.i_mb_height;

    h->mb.chroma_h_shift = CHROMA_FORMAT == CHROMA_420 || CHROMA_FORMAT == CHROMA_422;
    h->mb.chroma_v_shift = CHROMA_FORMAT == CHROMA_420;

    /* Adaptive MBAFF and subme 0 are not supported as we require halving motion
     * vectors during prediction, resulting in hpel mvs.
     * The chosen solution is to make MBAFF non-adaptive in this case. */
    h->mb.b_adaptive_mbaff = PARAM_INTERLACED && h->param.analyse.i_subpel_refine;

    /* Init frames. */
    if( h->param.i_bframe_adaptive == X264_B_ADAPT_TRELLIS && !h->param.rc.b_stat_read )
        h->frames.i_delay = X264_MAX(h->param.i_bframe,3)*4;
    else
        h->frames.i_delay = h->param.i_bframe;
    if( h->param.rc.b_mb_tree || h->param.rc.i_vbv_buffer_size )
        h->frames.i_delay = X264_MAX( h->frames.i_delay, h->param.rc.i_lookahead );
    i_slicetype_length = h->frames.i_delay;
    h->frames.i_delay += h->i_thread_frames - 1;
    h->frames.i_delay += h->param.i_sync_lookahead;
    h->frames.i_delay += h->param.b_vfr_input;
    h->frames.i_bframe_delay = h->param.i_bframe ? (h->param.i_bframe_pyramid ? 2 : 1) : 0;

    h->frames.i_max_ref0 = h->param.i_frame_reference;
    h->frames.i_max_ref1 = X264_MIN( h->sps->vui.i_num_reorder_frames, h->param.i_frame_reference );
    h->frames.i_max_dpb  = h->sps->vui.i_max_dec_frame_buffering;
    h->frames.b_have_lowres = !h->param.rc.b_stat_read
        && ( h->param.rc.i_rc_method == X264_RC_ABR
          || h->param.rc.i_rc_method == X264_RC_CRF
          || h->param.i_bframe_adaptive
          || h->param.i_scenecut_threshold
          || h->param.rc.b_mb_tree
          || h->param.analyse.i_weighted_pred );
    h->frames.b_have_lowres |= h->param.rc.b_stat_read && h->param.rc.i_vbv_buffer_size > 0;
    h->frames.b_have_sub8x8_esa = !!(h->param.analyse.inter & X264_ANALYSE_PSUB8x8);

    h->frames.i_last_idr =
    h->frames.i_last_keyframe = - h->param.i_keyint_max;
    h->frames.i_input    = 0;
    h->frames.i_largest_pts = h->frames.i_second_largest_pts = -1;
    h->frames.i_poc_last_open_gop = -1;

    CHECKED_MALLOCZERO( h->frames.unused[0], (h->frames.i_delay + 3) * sizeof(x264_frame_t *) );
    /* Allocate room for max refs plus a few extra just in case. */
    CHECKED_MALLOCZERO( h->frames.unused[1], (h->i_thread_frames + X264_REF_MAX + 4) * sizeof(x264_frame_t *) );
    CHECKED_MALLOCZERO( h->frames.current, (h->param.i_sync_lookahead + h->param.i_bframe
                        + h->i_thread_frames + 3) * sizeof(x264_frame_t *) );
    if( h->param.analyse.i_weighted_pred > 0 )
        CHECKED_MALLOCZERO( h->frames.blank_unused, h->i_thread_frames * 4 * sizeof(x264_frame_t *) );
    h->i_ref[0] = h->i_ref[1] = 0;
    h->i_cpb_delay = h->i_coded_fields = h->i_disp_fields = 0;
    h->i_prev_duration = ((uint64_t)h->param.i_fps_den * h->sps->vui.i_time_scale) / ((uint64_t)h->param.i_fps_num * h->sps->vui.i_num_units_in_tick);
    h->i_disp_fields_last_frame = -1;
    x264_rdo_init();

    /* init CPU functions */
    x264_predict_16x16_init( h->param.cpu, h->predict_16x16 );
    x264_predict_8x8c_init( h->param.cpu, h->predict_8x8c );
    x264_predict_8x16c_init( h->param.cpu, h->predict_8x16c );
    x264_predict_8x8_init( h->param.cpu, h->predict_8x8, &h->predict_8x8_filter );
    x264_predict_4x4_init( h->param.cpu, h->predict_4x4 );
    if( h->param.b_cabac )
        x264_cabac_init( h );
    else
        x264_cavlc_init();
    x264_pixel_init( h->param.cpu, &h->pixf );
    x264_dct_init( h->param.cpu, &h->dctf );
    x264_zigzag_init( h->param.cpu, &h->zigzagf_progressive, &h->zigzagf_interlaced );
    memcpy( &h->zigzagf, PARAM_INTERLACED ? &h->zigzagf_interlaced : &h->zigzagf_progressive, sizeof(h->zigzagf) );
    x264_mc_init( h->param.cpu, &h->mc );
    x264_quant_init( h, h->param.cpu, &h->quantf );
    x264_deblock_init( h->param.cpu, &h->loopf, PARAM_INTERLACED );
    x264_bitstream_init( h->param.cpu, &h->bsf );
    x264_dct_init_weights();

    mbcmp_init( h );
    chroma_dsp_init( h );

    p = buf + sprintf( buf, "using cpu capabilities:" );
    for( int i = 0; x264_cpu_names[i].flags; i++ )
    {
        if( !strcmp(x264_cpu_names[i].name, "SSE2")
            && h->param.cpu & (X264_CPU_SSE2_IS_FAST|X264_CPU_SSE2_IS_SLOW) )
            continue;
        if( !strcmp(x264_cpu_names[i].name, "SSE3")
            && (h->param.cpu & X264_CPU_SSSE3 || !(h->param.cpu & X264_CPU_CACHELINE_64)) )
            continue;
        if( !strcmp(x264_cpu_names[i].name, "SSE4.1")
            && (h->param.cpu & X264_CPU_SSE42) )
            continue;
        if( (h->param.cpu & x264_cpu_names[i].flags) == x264_cpu_names[i].flags
            && (!i || x264_cpu_names[i].flags != x264_cpu_names[i-1].flags) )
            p += sprintf( p, " %s", x264_cpu_names[i].name );
    }
    if( !h->param.cpu )
        p += sprintf( p, " none!" );
    x264_log( h, X264_LOG_INFO, "%s\n", buf );

    float *logs = x264_analyse_prepare_costs( h );
    if( !logs )
        goto fail;
    for( qp = X264_MIN( h->param.rc.i_qp_min, QP_MAX_SPEC ); qp <= h->param.rc.i_qp_max; qp++ )
        if( x264_analyse_init_costs( h, logs, qp ) )
            goto fail;
    if( x264_analyse_init_costs( h, logs, X264_LOOKAHEAD_QP ) )
        goto fail;
    x264_free( logs );

    static const uint16_t cost_mv_correct[7] = { 24, 47, 95, 189, 379, 757, 1515 };
    /* Checks for known miscompilation issues. */
    if( h->cost_mv[X264_LOOKAHEAD_QP][2013] != cost_mv_correct[BIT_DEPTH-8] )
    {
        x264_log( h, X264_LOG_ERROR, "MV cost test failed: x264 has been miscompiled!\n" );
        goto fail;
    }

    /* Must be volatile or else GCC will optimize it out. */
    volatile int temp = 392;
    if( x264_clz( temp ) != 23 ) /* 392 occupies 9 bits, so a correct 32-bit clz returns 32 - 9 = 23 */
    {
        x264_log( h, X264_LOG_ERROR, "CLZ test failed: x264 has been miscompiled!\n" );
#if ARCH_X86 || ARCH_X86_64
        x264_log( h, X264_LOG_ERROR, "Are you attempting to run an SSE4a-targeted build on a CPU that\n" );
        x264_log( h, X264_LOG_ERROR, "doesn't support it?\n" );
#endif
        goto fail;
    }

    h->out.i_nal = 0;
    h->out.i_bitstream = X264_MAX( 1000000, h->param.i_width * h->param.i_height * 4
        * ( h->param.rc.i_rc_method == X264_RC_ABR ? pow( 0.95, h->param.rc.i_qp_min )
          : pow( 0.95, h->param.rc.i_qp_constant ) * X264_MAX( 1, h->param.rc.f_ip_factor )));

    h->nal_buffer_size = h->out.i_bitstream * 3/2 + 4;
    CHECKED_MALLOC( h->nal_buffer, h->nal_buffer_size );

    if( h->param.i_threads > 1 &&
        x264_threadpool_init( &h->threadpool, h->param.i_threads, (void*)x264_encoder_thread_init, h ) )
        goto fail;

    h->thread[0] = h;
    for( int i = 1; i < h->param.i_threads + !!h->param.i_sync_lookahead; i++ )
        CHECKED_MALLOC( h->thread[i], sizeof(x264_t) );

    for( int i = 0; i < h->param.i_threads; i++ )
    {
        int init_nal_count = h->param.i_slice_count + 3;
        int allocate_threadlocal_data = !h->param.b_sliced_threads || !i;
        if( i > 0 )
            *h->thread[i] = *h;

        if( allocate_threadlocal_data )
        {
            h->thread[i]->fdec = x264_frame_pop_unused( h, 1 );
            if( !h->thread[i]->fdec )
                goto fail;
        }
        else
            h->thread[i]->fdec = h->thread[0]->fdec;

        CHECKED_MALLOC( h->thread[i]->out.p_bitstream, h->out.i_bitstream );
        /* Start each thread with room for init_nal_count NAL units; it'll realloc later if needed. */
        CHECKED_MALLOC( h->thread[i]->out.nal, init_nal_count*sizeof(x264_nal_t) );
        h->thread[i]->out.i_nals_allocated = init_nal_count;

        if( allocate_threadlocal_data && x264_macroblock_cache_allocate( h->thread[i] ) < 0 )
            goto fail;
    }

    if( x264_lookahead_init( h, i_slicetype_length ) )
        goto fail;

    for( int i = 0; i < h->param.i_threads; i++ )
        if( x264_macroblock_thread_allocate( h->thread[i], 0 ) < 0 )
            goto fail;

    if( x264_ratecontrol_new( h ) < 0 )
        goto fail;

    if( h->param.i_nal_hrd )
    {
        x264_log( h, X264_LOG_DEBUG, "HRD bitrate: %i bits/sec\n", h->sps->vui.hrd.i_bit_rate_unscaled );
        x264_log( h, X264_LOG_DEBUG, "CPB size: %i bits\n", h->sps->vui.hrd.i_cpb_size_unscaled );
    }

    if( h->param.psz_dump_yuv )
    {
        /* create or truncate the reconstructed video file */
        FILE *f = fopen( h->param.psz_dump_yuv, "w" );
        if( !f )
        {
            x264_log( h, X264_LOG_ERROR, "dump_yuv: can't write to %s\n", h->param.psz_dump_yuv );
            goto fail;
        }
        else if( !x264_is_regular_file( f ) )
        {
            x264_log( h, X264_LOG_ERROR, "dump_yuv: incompatible with non-regular file %s\n", h->param.psz_dump_yuv );
            goto fail;
        }
        fclose( f );
    }

    const char *profile = h->sps->i_profile_idc == PROFILE_BASELINE ? "Constrained Baseline" :
                          h->sps->i_profile_idc == PROFILE_MAIN ? "Main" :
                          h->sps->i_profile_idc == PROFILE_HIGH ? "High" :
                          h->sps->i_profile_idc == PROFILE_HIGH10 ? (h->sps->b_constraint_set3 == 1 ? "High 10 Intra" : "High 10") :
                          h->sps->i_profile_idc == PROFILE_HIGH422 ? (h->sps->b_constraint_set3 == 1 ? "High 4:2:2 Intra" : "High 4:2:2") :
                          h->sps->b_constraint_set3 == 1 ? "High 4:4:4 Intra" : "High 4:4:4 Predictive";
    char level[4];
    snprintf( level, sizeof(level), "%d.%d", h->sps->i_level_idc/10, h->sps->i_level_idc%10 );
    if( h->sps->i_level_idc == 9 || ( h->sps->i_level_idc == 11 && h->sps->b_constraint_set3 &&
        (h->sps->i_profile_idc >= PROFILE_BASELINE && h->sps->i_profile_idc <= PROFILE_EXTENDED) ) )
        strcpy( level, "1b" );

    if( h->sps->i_profile_idc < PROFILE_HIGH10 )
    {
        x264_log( h, X264_LOG_INFO, "profile %s, level %s\n",
            profile, level );
    }
    else
    {
        static const char * const subsampling[4] = { "4:0:0", "4:2:0", "4:2:2", "4:4:4" };
        x264_log( h, X264_LOG_INFO, "profile %s, level %s, %s %d-bit\n",
            profile, level, subsampling[CHROMA_FORMAT], BIT_DEPTH );
    }

    return h;
fail:
    x264_free( h );
    return NULL;
}
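
#if 0
/* Illustrative caller-side sketch (not part of the original source; the
 * resolution, frame rate and preset below are arbitrary examples). Kept
 * inside #if 0 so this file compiles unchanged. */
static x264_t *example_open_720p25( void )
{
    x264_param_t param;
    if( x264_param_default_preset( &param, "medium", NULL ) < 0 )
        return NULL;
    param.i_width   = 1280;
    param.i_height  = 720;
    param.i_fps_num = 25;
    param.i_fps_den = 1;
    return x264_encoder_open( &param );
}
#endif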

/****************************************************************************
 * x264_encoder_reconfig:
 ****************************************************************************/
int x264_encoder_reconfig( x264_t *h, x264_param_t *param )
{
    int rc_reconfig = 0;
    h = h->thread[h->thread[0]->i_thread_phase];
    x264_set_aspect_ratio( h, param, 0 );
#define COPY(var) h->param.var = param->var
    COPY( i_frame_reference ); // but never uses more refs than initially specified
    COPY( i_bframe_bias );
    if( h->param.i_scenecut_threshold )
        COPY( i_scenecut_threshold ); // can't turn it on or off, only vary the threshold
    COPY( b_deblocking_filter );
    COPY( i_deblocking_filter_alphac0 );
    COPY( i_deblocking_filter_beta );
    COPY( i_frame_packing );
    COPY( analyse.inter );
    COPY( analyse.intra );
    COPY( analyse.i_direct_mv_pred );
    /* Scratch buffer prevents me_range from being increased for esa/tesa */
    if( h->param.analyse.i_me_method < X264_ME_ESA || param->analyse.i_me_range < h->param.analyse.i_me_range )
        COPY( analyse.i_me_range );
    COPY( analyse.i_noise_reduction );
    /* We can't switch out of subme=0 during encoding. */
    if( h->param.analyse.i_subpel_refine )
        COPY( analyse.i_subpel_refine );
    COPY( analyse.i_trellis );
    COPY( analyse.b_chroma_me );
    COPY( analyse.b_dct_decimate );
    COPY( analyse.b_fast_pskip );
    COPY( analyse.b_mixed_references );
    COPY( analyse.f_psy_rd );
    COPY( analyse.f_psy_trellis );
    COPY( crop_rect );
    // can only twiddle these if they were enabled to begin with:
    if( h->param.analyse.i_me_method >= X264_ME_ESA || param->analyse.i_me_method < X264_ME_ESA )
        COPY( analyse.i_me_method );
    if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->frames.b_have_sub8x8_esa )
        h->param.analyse.inter &= ~X264_ANALYSE_PSUB8x8;
    if( h->pps->b_transform_8x8_mode )
        COPY( analyse.b_transform_8x8 );
    if( h->frames.i_max_ref1 > 1 )
        COPY( i_bframe_pyramid );
    COPY( i_slice_max_size );
    COPY( i_slice_max_mbs );
    COPY( i_slice_count );
    COPY( b_tff );

    /* VBV can't be turned on if it wasn't on to begin with */
    if( h->param.rc.i_vbv_max_bitrate > 0 && h->param.rc.i_vbv_buffer_size > 0 &&
          param->rc.i_vbv_max_bitrate > 0 &&   param->rc.i_vbv_buffer_size > 0 )
    {
        rc_reconfig |= h->param.rc.i_vbv_max_bitrate != param->rc.i_vbv_max_bitrate;
        rc_reconfig |= h->param.rc.i_vbv_buffer_size != param->rc.i_vbv_buffer_size;
        rc_reconfig |= h->param.rc.i_bitrate != param->rc.i_bitrate;
        COPY( rc.i_vbv_max_bitrate );
        COPY( rc.i_vbv_buffer_size );
        COPY( rc.i_bitrate );
    }
    rc_reconfig |= h->param.rc.f_rf_constant != param->rc.f_rf_constant;
    rc_reconfig |= h->param.rc.f_rf_constant_max != param->rc.f_rf_constant_max;
    COPY( rc.f_rf_constant );
    COPY( rc.f_rf_constant_max );
#undef COPY

    mbcmp_init( h );

    int ret = x264_validate_parameters( h, 0 );

    /* Supported reconfiguration options (1-pass only):
     * vbv-maxrate
     * vbv-bufsize
     * crf
     * bitrate (CBR only) */
    if( !ret && rc_reconfig )
        x264_ratecontrol_init_reconfigurable( h, 0 );

    return ret;
}
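
#if 0
/* Illustrative caller-side sketch (not part of the original source): a typical
 * mid-encode reconfiguration fetches the live parameters, adjusts one of the
 * supported fields and hands them back. Kept inside #if 0 so this file
 * compiles unchanged. */
static int example_raise_crf( x264_t *h )
{
    x264_param_t param;
    x264_encoder_parameters( h, &param );
    param.rc.f_rf_constant += 2; /* trade some quality for bitrate from now on */
    return x264_encoder_reconfig( h, &param );
}
#endif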

/****************************************************************************
 * x264_encoder_parameters:
 ****************************************************************************/
void x264_encoder_parameters( x264_t *h, x264_param_t *param )
{
    memcpy( param, &h->thread[h->i_thread_phase]->param, sizeof(x264_param_t) );
}

/* internal usage */
static void x264_nal_start( x264_t *h, int i_type, int i_ref_idc )
{
    x264_nal_t *nal = &h->out.nal[h->out.i_nal];

    nal->i_ref_idc        = i_ref_idc;
    nal->i_type           = i_type;
    nal->b_long_startcode = 1;

    nal->i_payload= 0;
    nal->p_payload= &h->out.p_bitstream[bs_pos( &h->out.bs ) / 8];
}

/* if number of allocated nals is not enough, re-allocate a larger one. */
static int x264_nal_check_buffer( x264_t *h )
{
    if( h->out.i_nal >= h->out.i_nals_allocated )
    {
        x264_nal_t *new_out = x264_malloc( sizeof(x264_nal_t) * (h->out.i_nals_allocated*2) );
        if( !new_out )
            return -1;
        memcpy( new_out, h->out.nal, sizeof(x264_nal_t) * (h->out.i_nals_allocated) );
        x264_free( h->out.nal );
        h->out.nal = new_out;
        h->out.i_nals_allocated *= 2;
    }
    return 0;
}

static int x264_nal_end( x264_t *h )
{
    x264_nal_t *nal = &h->out.nal[h->out.i_nal];
    uint8_t *end = &h->out.p_bitstream[bs_pos( &h->out.bs ) / 8];
    nal->i_payload = end - nal->p_payload;
    /* nal_escape_mmx reads past the end of the input.
     * While undefined padding wouldn't actually affect the output, it makes valgrind unhappy. */
    memset( end, 0xff, 32 );
    if( h->param.nalu_process )
        h->param.nalu_process( h, nal );
    h->out.i_nal++;

    return x264_nal_check_buffer( h );
}

static int x264_encoder_encapsulate_nals( x264_t *h, int start )
{
    int nal_size = 0, previous_nal_size = 0;

    if( h->param.nalu_process )
    {
        for( int i = start; i < h->out.i_nal; i++ )
            nal_size += h->out.nal[i].i_payload;
        return nal_size;
    }

    for( int i = 0; i < start; i++ )
        previous_nal_size += h->out.nal[i].i_payload;

    for( int i = start; i < h->out.i_nal; i++ )
        nal_size += h->out.nal[i].i_payload;

    /* Worst-case NAL unit escaping: emulation prevention can insert one byte per
     * two payload bytes (hence *3/2), plus up to 4 bytes of start code per NAL;
     * reallocate the buffer if it's too small. */
    int necessary_size = nal_size * 3/2 + h->out.i_nal * 4;
    if( h->nal_buffer_size < necessary_size )
    {
        h->nal_buffer_size = necessary_size * 2;
        uint8_t *buf = x264_malloc( h->nal_buffer_size );
        if( !buf )
            return -1;
        if( previous_nal_size )
            memcpy( buf, h->nal_buffer, previous_nal_size );
        x264_free( h->nal_buffer );
        h->nal_buffer = buf;
    }

    uint8_t *nal_buffer = h->nal_buffer + previous_nal_size;

    for( int i = start; i < h->out.i_nal; i++ )
    {
        h->out.nal[i].b_long_startcode = !i || h->out.nal[i].i_type == NAL_SPS || h->out.nal[i].i_type == NAL_PPS;
        x264_nal_encode( h, nal_buffer, &h->out.nal[i] );
        nal_buffer += h->out.nal[i].i_payload;
    }

    x264_emms();

    return nal_buffer - (h->nal_buffer + previous_nal_size);
}

/****************************************************************************
 * x264_encoder_headers:
 ****************************************************************************/
int x264_encoder_headers( x264_t *h, x264_nal_t **pp_nal, int *pi_nal )
{
    int frame_size = 0;
    /* init bitstream context */
    h->out.i_nal = 0;
    bs_init( &h->out.bs, h->out.p_bitstream, h->out.i_bitstream );

    /* Write SEI, SPS and PPS. */

    /* generate sequence parameters */
    x264_nal_start( h, NAL_SPS, NAL_PRIORITY_HIGHEST );
    x264_sps_write( &h->out.bs, h->sps );
    if( x264_nal_end( h ) )
        return -1;

    /* generate picture parameters */
    x264_nal_start( h, NAL_PPS, NAL_PRIORITY_HIGHEST );
    x264_pps_write( &h->out.bs, h->sps, h->pps );
    if( x264_nal_end( h ) )
        return -1;

    /* identify ourselves */
    x264_nal_start( h, NAL_SEI, NAL_PRIORITY_DISPOSABLE );
    if( x264_sei_version_write( h, &h->out.bs ) )
        return -1;
    if( x264_nal_end( h ) )
        return -1;

    frame_size = x264_encoder_encapsulate_nals( h, 0 );
    if( frame_size < 0 )
        return -1;

    /* now set output*/
    *pi_nal = h->out.i_nal;
    *pp_nal = &h->out.nal[0];
    h->out.i_nal = 0;

    return frame_size;
}
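
#if 0
/* Illustrative caller-side sketch (not part of the original source): a raw
 * Annex-B muxer would emit the SPS/PPS/SEI returned here before the first
 * frame. Kept inside #if 0 so this file compiles unchanged. */
static int example_write_headers( x264_t *h, FILE *f )
{
    x264_nal_t *nal;
    int i_nal;
    int size = x264_encoder_headers( h, &nal, &i_nal );
    if( size < 0 )
        return -1;
    for( int i = 0; i < i_nal; i++ )
        if( fwrite( nal[i].p_payload, 1, nal[i].i_payload, f ) != (size_t)nal[i].i_payload )
            return -1;
    return size;
}
#endif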

/* Check to see whether we have chosen a reference list ordering different
 * from the standard's default. */
static inline void x264_reference_check_reorder( x264_t *h )
{
    /* The reorder check below doesn't catch missing frames, so just
     * force a reorder if any reference frame is corrupt. */
    for( int i = 0; h->frames.reference[i]; i++ )
        if( h->frames.reference[i]->b_corrupt )
        {
            h->b_ref_reorder[0] = 1;
            return;
        }
    for( int list = 0; list <= (h->sh.i_type == SLICE_TYPE_B); list++ )
        for( int i = 0; i < h->i_ref[list] - 1; i++ )
        {
            int framenum_diff = h->fref[list][i+1]->i_frame_num - h->fref[list][i]->i_frame_num;
            int poc_diff = h->fref[list][i+1]->i_poc - h->fref[list][i]->i_poc;
            /* P and B-frames use different default orders: P expects descending frame_num;
             * B expects descending POC in list0 and ascending POC in list1. */
            if( h->sh.i_type == SLICE_TYPE_P ? framenum_diff > 0 : list == 1 ? poc_diff < 0 : poc_diff > 0 )
            {
                h->b_ref_reorder[list] = 1;
                return;
            }
        }
}

/* return -1 on failure, else return the index of the new reference frame */
int x264_weighted_reference_duplicate( x264_t *h, int i_ref, const x264_weight_t *w )
{
    int i = h->i_ref[0];
    int j = 1;
    x264_frame_t *newframe;
    if( i <= 1 ) /* empty list, definitely can't duplicate frame */
        return -1;

    // Duplication is only used in X264_WEIGHTP_SMART
    if( h->param.analyse.i_weighted_pred != X264_WEIGHTP_SMART )
        return -1;

    /* Duplication is a hack to compensate for crappy rounding in motion compensation.
     * With high bit depth, it's not worth doing, so turn it off except in the case of
     * unweighted dupes. */
    if( BIT_DEPTH > 8 && w != x264_weight_none )
        return -1;

    newframe = x264_frame_pop_blank_unused( h );
    if( !newframe )
        return -1;

    //FIXME: probably don't need to copy everything
    *newframe = *h->fref[0][i_ref];
    newframe->i_reference_count = 1;
    newframe->orig = h->fref[0][i_ref];
    newframe->b_duplicate = 1;
    memcpy( h->fenc->weight[j], w, sizeof(h->fenc->weight[i]) );

    /* shift the frames to make space for the dupe. */
    h->b_ref_reorder[0] = 1;
    if( h->i_ref[0] < X264_REF_MAX )
        ++h->i_ref[0];
    h->fref[0][X264_REF_MAX-1] = NULL;
    x264_frame_unshift( &h->fref[0][j], newframe );

    return j;
}

static void x264_weighted_pred_init( x264_t *h )
{
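    /* Copy the per-reference weights chosen by analysis into the slice header,
     * drop any weight that is a no-op (unit scale, zero offset), and for luma
     * pre-scale the full-resolution plane of each weighted reference so that
     * motion estimation can read it directly. */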
    /* No weight analysis happens here; start by pointing every weighted plane
     * at the plain, unweighted reference. */
    for( int i_ref = 0; i_ref < h->i_ref[0]; i_ref++ )
        h->fenc->weighted[i_ref] = h->fref[0][i_ref]->filtered[0][0];

    // FIXME: This only supports weighting of one reference frame
    // and duplicates of that frame.
    h->fenc->i_lines_weighted = 0;

    for( int i_ref = 0; i_ref < (h->i_ref[0] << SLICE_MBAFF); i_ref++ )
        for( int i = 0; i < 3; i++ )
            h->sh.weight[i_ref][i].weightfn = NULL;


    if( h->sh.i_type != SLICE_TYPE_P || h->param.analyse.i_weighted_pred <= 0 )
        return;

    int i_padv = PADV << PARAM_INTERLACED;
    int denom = -1;
    int weightplane[2] = { 0, 0 };
    int buffer_next = 0;
    for( int i = 0; i < 3; i++ )
    {
        for( int j = 0; j < h->i_ref[0]; j++ )
        {
            if( h->fenc->weight[j][i].weightfn )
            {
                h->sh.weight[j][i] = h->fenc->weight[j][i];
                // if weight is useless, don't write it to stream
                if( h->sh.weight[j][i].i_scale == 1<<h->sh.weight[j][i].i_denom && h->sh.weight[j][i].i_offset == 0 )
                    h->sh.weight[j][i].weightfn = NULL;
                else
                {
                    if( !weightplane[!!i] )
                    {
                        weightplane[!!i] = 1;
                        h->sh.weight[0][!!i].i_denom = denom = h->sh.weight[j][i].i_denom;
                        assert( x264_clip3( denom, 0, 7 ) == denom );
                    }

                    assert( h->sh.weight[j][i].i_denom == denom );
                    if( !i )
                    {
                        h->fenc->weighted[j] = h->mb.p_weight_buf[buffer_next++] + h->fenc->i_stride[0] * i_padv + PADH;
                        //scale full resolution frame
                        if( h->param.i_threads == 1 )
                        {
                            pixel *src = h->fref[0][j]->filtered[0][0] - h->fref[0][j]->i_stride[0]*i_padv - PADH;
                            pixel *dst = h->fenc->weighted[j] - h->fenc->i_stride[0]*i_padv - PADH;
                            int stride = h->fenc->i_stride[0];
                            int width = h->fenc->i_width[0] + PADH*2;
                            int height = h->fenc->i_lines[0] + i_padv*2;
                            x264_weight_scale_plane( h, dst, stride, src, stride, width, height, &h->sh.weight[j][0] );
                            h->fenc->i_lines_weighted = height;
                        }
                    }
                }
            }
        }
    }

    if( weightplane[1] )
        for( int i = 0; i < h->i_ref[0]; i++ )
        {
            if( h->sh.weight[i][1].weightfn && !h->sh.weight[i][2].weightfn )
            {
                h->sh.weight[i][2].i_scale = 1 << h->sh.weight[0][1].i_denom;
                h->sh.weight[i][2].i_offset = 0;
            }
            else if( h->sh.weight[i][2].weightfn && !h->sh.weight[i][1].weightfn )
            {
                h->sh.weight[i][1].i_scale = 1 << h->sh.weight[0][1].i_denom;
                h->sh.weight[i][1].i_offset = 0;
            }
        }

    if( !weightplane[0] )
        h->sh.weight[0][0].i_denom = 0;
    if( !weightplane[1] )
        h->sh.weight[0][1].i_denom = 0;
    h->sh.weight[0][2].i_denom = h->sh.weight[0][1].i_denom;
}

static inline int x264_reference_distance( x264_t *h, x264_frame_t *frame )
{
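    /* With frame packing mode 5 (frame-sequential 3D: alternating left/right
     * views), the low bit of i_frame selects the view, so measure distance in
     * view pairs and add 1 when the views differ. E.g. frames 4 and 7:
     * |4 - 6| + (0 != 1) = 3. */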
    if( h->param.i_frame_packing == 5 )
        return abs((h->fenc->i_frame&~1) - (frame->i_frame&~1)) +
                  ((h->fenc->i_frame&1) != (frame->i_frame&1));
    else
        return abs(h->fenc->i_frame - frame->i_frame);
}

static inline void x264_reference_build_list( x264_t *h, int i_poc )
{
    int b_ok;

    /* build ref list 0/1 */
    h->mb.pic.i_fref[0] = h->i_ref[0] = 0;
    h->mb.pic.i_fref[1] = h->i_ref[1] = 0;
    if( h->sh.i_type == SLICE_TYPE_I )
        return;

    for( int i = 0; h->frames.reference[i]; i++ )
    {
        if( h->frames.reference[i]->b_corrupt )
            continue;
        if( h->frames.reference[i]->i_poc < i_poc )
            h->fref[0][h->i_ref[0]++] = h->frames.reference[i];
        else if( h->frames.reference[i]->i_poc > i_poc )
            h->fref[1][h->i_ref[1]++] = h->frames.reference[i];
    }

    /* Order reference lists by distance from the current frame. */
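    /* A simple bubble sort: the lists are almost always already ordered, so
     * the common case is a single verification pass. fref_nearest tracks the
     * closest reference by POC on each side for use elsewhere in the encoder. */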
    for( int list = 0; list < 2; list++ )
    {
        h->fref_nearest[list] = h->fref[list][0];
        do
        {
            b_ok = 1;
            for( int i = 0; i < h->i_ref[list] - 1; i++ )
            {
                if( list ? h->fref[list][i+1]->i_poc < h->fref_nearest[list]->i_poc
                         : h->fref[list][i+1]->i_poc > h->fref_nearest[list]->i_poc )
                    h->fref_nearest[list] = h->fref[list][i+1];
                if( x264_reference_distance( h, h->fref[list][i] ) > x264_reference_distance( h, h->fref[list][i+1] ) )
                {
                    XCHG( x264_frame_t*, h->fref[list][i], h->fref[list][i+1] );
                    b_ok = 0;
                    break;
                }
            }
        } while( !b_ok );
    }

    if( h->sh.i_mmco_remove_from_end )
        for( int i = h->i_ref[0]-1; i >= h->i_ref[0] - h->sh.i_mmco_remove_from_end; i-- )
        {
            int diff = h->i_frame_num - h->fref[0][i]->i_frame_num;
            h->sh.mmco[h->sh.i_mmco_command_count].i_poc = h->fref[0][i]->i_poc;
            h->sh.mmco[h->sh.i_mmco_command_count++].i_difference_of_pic_nums = diff;
        }

    x264_reference_check_reorder( h );

    h->i_ref[1] = X264_MIN( h->i_ref[1], h->frames.i_max_ref1 );
    h->i_ref[0] = X264_MIN( h->i_ref[0], h->frames.i_max_ref0 );
    h->i_ref[0] = X264_MIN( h->i_ref[0], h->param.i_frame_reference ); // if reconfig() has lowered the limit

    /* For Blu-ray compliance, don't reference frames outside of the minigop. */
    if( IS_X264_TYPE_B( h->fenc->i_type ) && h->param.b_bluray_compat )
        h->i_ref[0] = X264_MIN( h->i_ref[0], IS_X264_TYPE_B( h->fref[0][0]->i_type ) + 1 );

    /* add duplicates */
    if( h->fenc->i_type == X264_TYPE_P )
    {
        int idx = -1;
        if( h->param.analyse.i_weighted_pred >= X264_WEIGHTP_SIMPLE )
        {
            x264_weight_t w[3];
            w[1].weightfn = w[2].weightfn = NULL;
            if( h->param.rc.b_stat_read )
                x264_ratecontrol_set_weights( h, h->fenc );

            if( !h->fenc->weight[0][0].weightfn )
            {
                h->fenc->weight[0][0].i_denom = 0;
                SET_WEIGHT( w[0], 1, 1, 0, -1 );
                idx = x264_weighted_reference_duplicate( h, 0, w );
            }
            else
            {
                if( h->fenc->weight[0][0].i_scale == 1<<h->fenc->weight[0][0].i_denom )
                {
                    SET_WEIGHT( h->fenc->weight[0][0], 1, 1, 0, h->fenc->weight[0][0].i_offset );
                }
                x264_weighted_reference_duplicate( h, 0, x264_weight_none );
                if( h->fenc->weight[0][0].i_offset > -128 )
                {
                    w[0] = h->fenc->weight[0][0];
                    w[0].i_offset--;
                    h->mc.weight_cache( h, &w[0] );
                    idx = x264_weighted_reference_duplicate( h, 0, w );
                }
            }
        }
        h->mb.ref_blind_dupe = idx;
    }

    assert( h->i_ref[0] + h->i_ref[1] <= X264_REF_MAX );
    h->mb.pic.i_fref[0] = h->i_ref[0];
    h->mb.pic.i_fref[1] = h->i_ref[1];
}

static void x264_fdec_filter_row( x264_t *h, int mb_y, int b_inloop )
{
    /* mb_y is the mb to be encoded next, not the mb to be filtered here */
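    /* This runs the stages that must trail encoding by one macroblock row:
     * deblock the just-completed row, mirror deblocked pixels between the
     * progressive and field planes when interlacing is enabled, expand borders
     * and build half-pel planes for kept-as-reference frames, wake other frame
     * threads waiting on our progress, and accumulate PSNR/SSIM over the
     * finished pixels. */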
    int b_hpel = h->fdec->b_kept_as_ref;
    int b_deblock = h->sh.i_disable_deblocking_filter_idc != 1;
    int b_end = mb_y == h->i_threadslice_end;
    int b_measure_quality = 1;
    int min_y = mb_y - (1 << SLICE_MBAFF);
    int b_start = min_y == h->i_threadslice_start;
    /* Even in interlaced mode, deblocking never modifies more than 4 pixels
     * above each MB, as bS=4 doesn't happen for the top of interlaced mbpairs. */
    int minpix_y = min_y*16 - 4 * !b_start;
    int maxpix_y = mb_y*16 - 4 * !b_end;
    b_deblock &= b_hpel || h->param.psz_dump_yuv;
    if( h->param.b_sliced_threads && b_start && min_y && !b_inloop )
    {
        b_deblock = 0;         /* We already deblocked on the inloop pass. */
        b_measure_quality = 0; /* We already measured quality on the inloop pass. */
    }
    if( mb_y & SLICE_MBAFF )
        return;
    if( min_y < h->i_threadslice_start )
        return;

    if( b_deblock )
        for( int y = min_y; y < mb_y; y += (1 << SLICE_MBAFF) )
            x264_frame_deblock_row( h, y );

    /* FIXME: Prediction requires different borders for interlaced/progressive mc,
     * but the actual image data is equivalent. For now, maintain this
     * consistency by copying deblocked pixels between planes. */
    if( PARAM_INTERLACED )
        for( int p = 0; p < h->fdec->i_plane; p++ )
            for( int i = minpix_y>>(CHROMA_V_SHIFT && p); i < maxpix_y>>(CHROMA_V_SHIFT && p); i++ )
                memcpy( h->fdec->plane_fld[p] + i*h->fdec->i_stride[p],
                        h->fdec->plane[p]     + i*h->fdec->i_stride[p],
                        h->mb.i_mb_width*16*sizeof(pixel) );

    if( b_hpel )
    {
        int end = mb_y == h->mb.i_mb_height;
        x264_frame_expand_border( h, h->fdec, min_y, end );
        if( h->param.analyse.i_subpel_refine )
        {
            x264_frame_filter( h, h->fdec, min_y, end );
            x264_frame_expand_border_filtered( h, h->fdec, min_y, end );
        }
    }

    if( SLICE_MBAFF )
        for( int i = 0; i < 3; i++ )
        {
            XCHG( pixel *, h->intra_border_backup[0][i], h->intra_border_backup[3][i] );
            XCHG( pixel *, h->intra_border_backup[1][i], h->intra_border_backup[4][i] );
        }

    if( h->i_thread_frames > 1 && h->fdec->b_kept_as_ref )
        x264_frame_cond_broadcast( h->fdec, mb_y*16 + (b_end ? 10000 : -(X264_THREAD_HEIGHT << SLICE_MBAFF)) );

    if( b_measure_quality )
    {
        maxpix_y = X264_MIN( maxpix_y, h->param.i_height );
        if( h->param.analyse.b_psnr )
        {
            for( int p = 0; p < (CHROMA444 ? 3 : 1); p++ )
                h->stat.frame.i_ssd[p] += x264_pixel_ssd_wxh( &h->pixf,
                    h->fdec->plane[p] + minpix_y * h->fdec->i_stride[p], h->fdec->i_stride[p],
                    h->fenc->plane[p] + minpix_y * h->fenc->i_stride[p], h->fenc->i_stride[p],
                    h->param.i_width, maxpix_y-minpix_y );
            if( !CHROMA444 )
            {
                uint64_t ssd_u, ssd_v;
                int v_shift = CHROMA_V_SHIFT;
                x264_pixel_ssd_nv12( &h->pixf,
                    h->fdec->plane[1] + (minpix_y>>v_shift) * h->fdec->i_stride[1], h->fdec->i_stride[1],
                    h->fenc->plane[1] + (minpix_y>>v_shift) * h->fenc->i_stride[1], h->fenc->i_stride[1],
                    h->param.i_width>>1, (maxpix_y-minpix_y)>>v_shift, &ssd_u, &ssd_v );
                h->stat.frame.i_ssd[1] += ssd_u;
                h->stat.frame.i_ssd[2] += ssd_v;
            }
        }

        if( h->param.analyse.b_ssim )
        {
            int ssim_cnt;
            x264_emms();
            /* offset by 2 pixels to avoid alignment of ssim blocks with dct blocks,
             * and overlap by 4 */
            minpix_y += b_start ? 2 : -6;
            h->stat.frame.f_ssim +=
                x264_pixel_ssim_wxh( &h->pixf,
                    h->fdec->plane[0] + 2+minpix_y*h->fdec->i_stride[0], h->fdec->i_stride[0],
                    h->fenc->plane[0] + 2+minpix_y*h->fenc->i_stride[0], h->fenc->i_stride[0],
                    h->param.i_width-2, maxpix_y-minpix_y, h->scratch_buffer, &ssim_cnt );
            h->stat.frame.i_ssim_cnt += ssim_cnt;
        }
    }
}

static inline int x264_reference_update( x264_t *h )
{
    if( !h->fdec->b_kept_as_ref )
    {
        if( h->i_thread_frames > 1 )
        {
            x264_frame_push_unused( h, h->fdec );
            h->fdec = x264_frame_pop_unused( h, 1 );
            if( !h->fdec )
                return -1;
        }
        return 0;
    }

    /* apply mmco from previous frame. */
    for( int i = 0; i < h->sh.i_mmco_command_count; i++ )
        for( int j = 0; h->frames.reference[j]; j++ )
            if( h->frames.reference[j]->i_poc == h->sh.mmco[i].i_poc )
                x264_frame_push_unused( h, x264_frame_shift( &h->frames.reference[j] ) );

    /* move frame in the buffer */
    x264_frame_push( h->frames.reference, h->fdec );
    if( h->frames.reference[h->sps->i_num_ref_frames] )
        x264_frame_push_unused( h, x264_frame_shift( h->frames.reference ) );
    h->fdec = x264_frame_pop_unused( h, 1 );
    if( !h->fdec )
        return -1;
    return 0;
}

static inline void x264_reference_reset( x264_t *h )
{
    while( h->frames.reference[0] )
        x264_frame_push_unused( h, x264_frame_pop( h->frames.reference ) );
    h->fdec->i_poc =
    h->fenc->i_poc = 0;
}

static inline void x264_reference_hierarchy_reset( x264_t *h )
{
    int ref;
    int b_hasdelayframe = 0;

    /* look for delay frames -- chain must only contain frames that are disposable */
    for( int i = 0; h->frames.current[i] && IS_DISPOSABLE( h->frames.current[i]->i_type ); i++ )
        b_hasdelayframe |= h->frames.current[i]->i_coded
                        != h->frames.current[i]->i_frame + h->sps->vui.i_num_reorder_frames;

    /* This function must handle b-pyramid and clear frames for open-gop */
    if( h->param.i_bframe_pyramid != X264_B_PYRAMID_STRICT && !b_hasdelayframe && h->frames.i_poc_last_open_gop == -1 )
        return;

    /* Remove last BREF. There will never be old BREFs in the
     * dpb during a BREF decode when pyramid == STRICT */
    for( ref = 0; h->frames.reference[ref]; ref++ )
    {
        if( ( h->param.i_bframe_pyramid == X264_B_PYRAMID_STRICT
            && h->frames.reference[ref]->i_type == X264_TYPE_BREF )
            || ( h->frames.reference[ref]->i_poc < h->frames.i_poc_last_open_gop
            && h->sh.i_type != SLICE_TYPE_B ) )
        {
            int diff = h->i_frame_num - h->frames.reference[ref]->i_frame_num;
            h->sh.mmco[h->sh.i_mmco_command_count].i_difference_of_pic_nums = diff;
            h->sh.mmco[h->sh.i_mmco_command_count++].i_poc = h->frames.reference[ref]->i_poc;
            x264_frame_push_unused( h, x264_frame_shift( &h->frames.reference[ref] ) );
            h->b_ref_reorder[0] = 1;
            ref--;
        }
    }

    /* Prepare room in the DPB for the delayed display time of the later B-frames */
    if( h->param.i_bframe_pyramid )
        h->sh.i_mmco_remove_from_end = X264_MAX( ref + 2 - h->frames.i_max_dpb, 0 );
}

static inline void x264_slice_init( x264_t *h, int i_nal_type, int i_global_qp )
{
    /* ------------------------ Create slice header  ----------------------- */
    if( i_nal_type == NAL_SLICE_IDR )
    {
        x264_slice_header_init( h, &h->sh, h->sps, h->pps, h->i_idr_pic_id, h->i_frame_num, i_global_qp );

        /* alternate id */
        h->i_idr_pic_id ^= 1;
    }
    else
    {
        x264_slice_header_init( h, &h->sh, h->sps, h->pps, -1, h->i_frame_num, i_global_qp );

        h->sh.i_num_ref_idx_l0_active = h->i_ref[0] <= 0 ? 1 : h->i_ref[0];
        h->sh.i_num_ref_idx_l1_active = h->i_ref[1] <= 0 ? 1 : h->i_ref[1];
        if( h->sh.i_num_ref_idx_l0_active != h->pps->i_num_ref_idx_l0_default_active ||
            (h->sh.i_type == SLICE_TYPE_B && h->sh.i_num_ref_idx_l1_active != h->pps->i_num_ref_idx_l1_default_active) )
        {
            h->sh.b_num_ref_idx_override = 1;
        }
    }

    if( h->fenc->i_type == X264_TYPE_BREF && h->param.b_bluray_compat && h->sh.i_mmco_command_count )
    {
        h->b_sh_backup = 1;
        h->sh_backup = h->sh;
    }

    h->fdec->i_frame_num = h->sh.i_frame_num;

    if( h->sps->i_poc_type == 0 )
    {
        h->sh.i_poc = h->fdec->i_poc;
        if( PARAM_INTERLACED )
        {
            h->sh.i_delta_poc_bottom = h->param.b_tff ? 1 : -1;
            h->sh.i_poc += h->sh.i_delta_poc_bottom == -1;
        }
        else
            h->sh.i_delta_poc_bottom = 0;
        h->fdec->i_delta_poc[0] = h->sh.i_delta_poc_bottom == -1;
        h->fdec->i_delta_poc[1] = h->sh.i_delta_poc_bottom ==  1;
    }
    else
    {
        /* Nothing to do ? */
    }

    x264_macroblock_slice_init( h );
}

static int x264_slice_write( x264_t *h )
{
    int i_skip;
    int mb_xy, i_mb_x, i_mb_y;
    int i_skip_bak = 0; /* Shut up GCC. */
    bs_t UNINIT(bs_bak);
    x264_cabac_t cabac_bak;
    uint8_t cabac_prevbyte_bak = 0; /* Shut up GCC. */
    int mv_bits_bak = 0;
    int tex_bits_bak = 0;
    /* NALUs other than the first use a 3-byte startcode.
     * Add one extra byte for the rbsp, and one more for the final CABAC putbyte.
     * Then add an extra 5 bytes just in case, to account for random NAL escapes and
     * other inaccuracies. */
    int overhead_guess = (NALU_OVERHEAD - (h->param.b_annexb && h->out.i_nal)) + 1 + h->param.b_cabac + 5;
    int slice_max_size = h->param.i_slice_max_size > 0 ? (h->param.i_slice_max_size-overhead_guess)*8 : 0;
    int back_up_bitstream = slice_max_size || (!h->param.b_cabac && h->sps->i_profile_idc < PROFILE_HIGH);
    int starting_bits = bs_pos(&h->out.bs);
    int b_deblock = h->sh.i_disable_deblocking_filter_idc != 1;
    int b_hpel = h->fdec->b_kept_as_ref;
    uint8_t *last_emu_check;
    b_deblock &= b_hpel || h->param.psz_dump_yuv;
    bs_realign( &h->out.bs );

    /* Slice */
    x264_nal_start( h, h->i_nal_type, h->i_nal_ref_idc );
    h->out.nal[h->out.i_nal].i_first_mb = h->sh.i_first_mb;

    /* Slice header */
    x264_macroblock_thread_init( h );

    /* If this isn't the first slice in the threadslice, set the slice QP
     * equal to the last QP in the previous slice for more accurate
     * CABAC initialization. */
    if( h->sh.i_first_mb != h->i_threadslice_start * h->mb.i_mb_width )
    {
        h->sh.i_qp = h->mb.i_last_qp;
        h->sh.i_qp_delta = h->sh.i_qp - h->pps->i_pic_init_qp;
    }

    x264_slice_header_write( &h->out.bs, &h->sh, h->i_nal_ref_idc );
    if( h->param.b_cabac )
    {
        /* alignment needed */
        bs_align_1( &h->out.bs );

        /* init cabac */
        x264_cabac_context_init( h, &h->cabac, h->sh.i_type, x264_clip3( h->sh.i_qp-QP_BD_OFFSET, 0, 51 ), h->sh.i_cabac_init_idc );
        x264_cabac_encode_init ( &h->cabac, h->out.bs.p, h->out.bs.p_end );
        last_emu_check = h->cabac.p;
    }
    else
        last_emu_check = h->out.bs.p;
    h->mb.i_last_qp = h->sh.i_qp;
    h->mb.i_last_dqp = 0;
    h->mb.field_decoding_flag = 0;

    i_mb_y = h->sh.i_first_mb / h->mb.i_mb_width;
    i_mb_x = h->sh.i_first_mb % h->mb.i_mb_width;
    i_skip = 0;

    while( 1 )
    {
        mb_xy = i_mb_x + i_mb_y * h->mb.i_mb_width;
        int mb_spos = bs_pos(&h->out.bs) + x264_cabac_pos(&h->cabac);

        if( !(i_mb_y & SLICE_MBAFF) )
        {
            if( x264_bitstream_check_buffer( h ) )
                return -1;

            if( back_up_bitstream )
            {
                mv_bits_bak = h->stat.frame.i_mv_bits;
                tex_bits_bak = h->stat.frame.i_tex_bits;
                /* We don't need to back up the contexts: flushing the CABAC encoder
                 * has no context dependency, and macroblocks are only re-encoded when
                 * a slice is ended, at which point the contents of all contexts are
                 * thrown away anyway. */
                if( h->param.b_cabac )
                {
                    memcpy( &cabac_bak, &h->cabac, offsetof(x264_cabac_t, f8_bits_encoded) );
                    /* x264's CABAC writer modifies the previous byte during carry, so it has to be
                     * backed up. */
                    cabac_prevbyte_bak = h->cabac.p[-1];
                }
                else
                {
                    bs_bak = h->out.bs;
                    i_skip_bak = i_skip;
                }
            }
        }

        if( i_mb_x == 0 && !h->mb.b_reencode_mb )
            x264_fdec_filter_row( h, i_mb_y, 1 );

        if( PARAM_INTERLACED )
        {
            if( h->mb.b_adaptive_mbaff )
            {
                if( !(i_mb_y&1) )
                {
                    /* FIXME: VSAD is fast but fairly poor at choosing the best interlace type. */
                    h->mb.b_interlaced = x264_field_vsad( h, i_mb_x, i_mb_y );
                    memcpy( &h->zigzagf, MB_INTERLACED ? &h->zigzagf_interlaced : &h->zigzagf_progressive, sizeof(h->zigzagf) );
                    if( !MB_INTERLACED && (i_mb_y+2) == h->mb.i_mb_height )
                        x264_expand_border_mbpair( h, i_mb_x, i_mb_y );
                }
            }
            h->mb.field[mb_xy] = MB_INTERLACED;
        }

        /* load cache */
        if( SLICE_MBAFF )
            x264_macroblock_cache_load_interlaced( h, i_mb_x, i_mb_y );
        else
            x264_macroblock_cache_load_progressive( h, i_mb_x, i_mb_y );

        x264_macroblock_analyse( h );

        /* encode this macroblock -> be careful it can change the mb type to P_SKIP if needed */
reencode:
        x264_macroblock_encode( h );

        if( h->param.b_cabac )
        {
            if( mb_xy > h->sh.i_first_mb && !(SLICE_MBAFF && (i_mb_y&1)) )
                x264_cabac_encode_terminal( &h->cabac );

            if( IS_SKIP( h->mb.i_type ) )
                x264_cabac_mb_skip( h, 1 );
            else
            {
                if( h->sh.i_type != SLICE_TYPE_I )
                    x264_cabac_mb_skip( h, 0 );
                x264_macroblock_write_cabac( h, &h->cabac );
            }
        }
        else
        {
            if( IS_SKIP( h->mb.i_type ) )
                i_skip++;
            else
            {
                if( h->sh.i_type != SLICE_TYPE_I )
                {
                    bs_write_ue( &h->out.bs, i_skip );  /* skip run */
                    i_skip = 0;
                }
                x264_macroblock_write_cavlc( h );
                /* If there was a CAVLC level code overflow, try again at a higher QP. */
                if( h->mb.b_overflow )
                {
                    h->mb.i_chroma_qp = h->chroma_qp_table[++h->mb.i_qp];
                    h->mb.i_skip_intra = 0;
                    h->mb.b_skip_mc = 0;
                    h->mb.b_overflow = 0;
                    h->out.bs = bs_bak;
                    i_skip = i_skip_bak;
                    h->stat.frame.i_mv_bits = mv_bits_bak;
                    h->stat.frame.i_tex_bits = tex_bits_bak;
                    goto reencode;
                }
            }
        }

        int total_bits = bs_pos(&h->out.bs) + x264_cabac_pos(&h->cabac);
        int mb_size = total_bits - mb_spos;

        if( slice_max_size )
        {
            /* Count the skip run, just in case. */
            if( !h->param.b_cabac )
                total_bits += bs_size_ue_big( i_skip );
            /* Check for escape bytes. */
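            /* Each emulation-prevention byte the payload will need costs another
             * 8 bits at encapsulation time, so shrink the budget as we find them
             * rather than recounting the whole slice. */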
            uint8_t *end = h->param.b_cabac ? h->cabac.p : h->out.bs.p;
            for( ; last_emu_check < end - 2; last_emu_check++ )
                if( last_emu_check[0] == 0 && last_emu_check[1] == 0 && last_emu_check[2] <= 3 )
                {
                    slice_max_size -= 8;
                    last_emu_check++;
                }
            /* We'll just re-encode this last macroblock if we go over the max slice size. */
            if( total_bits - starting_bits > slice_max_size && !h->mb.b_reencode_mb )
            {
                if( mb_xy != h->sh.i_first_mb )
                {
                    h->stat.frame.i_mv_bits = mv_bits_bak;
                    h->stat.frame.i_tex_bits = tex_bits_bak;
                    if( h->param.b_cabac )
                    {
                        memcpy( &h->cabac, &cabac_bak, offsetof(x264_cabac_t, f8_bits_encoded) );
                        h->cabac.p[-1] = cabac_prevbyte_bak;
                    }
                    else
                    {
                        h->out.bs = bs_bak;
                        i_skip = i_skip_bak;
                    }
                    h->mb.b_reencode_mb = 1;
                    if( SLICE_MBAFF )
                    {
                        // set to bottom of previous mbpair
                        if( i_mb_x )
                            h->sh.i_last_mb = mb_xy-1+h->mb.i_mb_stride*(!(i_mb_y&1));
                        else
                            h->sh.i_last_mb = (i_mb_y-2+!(i_mb_y&1))*h->mb.i_mb_stride + h->mb.i_mb_width - 1;
                    }
                    else
                        h->sh.i_last_mb = mb_xy-1;
                    break;
                }
                else
                {
                    h->sh.i_last_mb = mb_xy;
                    h->mb.b_reencode_mb = 0;
                }
            }
            else
                h->mb.b_reencode_mb = 0;
        }

#if HAVE_VISUALIZE
        if( h->param.b_visualize )
            x264_visualize_mb( h );
#endif

        /* save cache */
        x264_macroblock_cache_save( h );

        /* accumulate mb stats */
        h->stat.frame.i_mb_count[h->mb.i_type]++;

        int b_intra = IS_INTRA( h->mb.i_type );
        int b_skip = IS_SKIP( h->mb.i_type );
        if( h->param.i_log_level >= X264_LOG_INFO || h->param.rc.b_stat_write )
        {
            if( !b_intra && !b_skip && !IS_DIRECT( h->mb.i_type ) )
            {
                if( h->mb.i_partition != D_8x8 )
                    h->stat.frame.i_mb_partition[h->mb.i_partition] += 4;
                else
                    for( int i = 0; i < 4; i++ )
                        h->stat.frame.i_mb_partition[h->mb.i_sub_partition[i]]++;
                if( h->param.i_frame_reference > 1 )
                    for( int i_list = 0; i_list <= (h->sh.i_type == SLICE_TYPE_B); i_list++ )
                        for( int i = 0; i < 4; i++ )
                        {
                            int i_ref = h->mb.cache.ref[i_list][ x264_scan8[4*i] ];
                            if( i_ref >= 0 )
                                h->stat.frame.i_mb_count_ref[i_list][i_ref] ++;
                        }
            }
        }

        if( h->param.i_log_level >= X264_LOG_INFO )
        {
            if( h->mb.i_cbp_luma | h->mb.i_cbp_chroma )
            {
                if( CHROMA444 )
                {
                    for( int i = 0; i < 4; i++ )
                        if( h->mb.i_cbp_luma & (1 << i) )
                            for( int p = 0; p < 3; p++ )
                            {
                                int s8 = i*4+p*16;
                                int nnz8x8 = M16( &h->mb.cache.non_zero_count[x264_scan8[s8]+0] )
                                           | M16( &h->mb.cache.non_zero_count[x264_scan8[s8]+8] );
                                h->stat.frame.i_mb_cbp[!b_intra + p*2] += !!nnz8x8;
                            }
                }
                else
                {
                    int cbpsum = (h->mb.i_cbp_luma&1) + ((h->mb.i_cbp_luma>>1)&1)
                               + ((h->mb.i_cbp_luma>>2)&1) + (h->mb.i_cbp_luma>>3);
                    h->stat.frame.i_mb_cbp[!b_intra + 0] += cbpsum;
                    h->stat.frame.i_mb_cbp[!b_intra + 2] += !!h->mb.i_cbp_chroma;
                    h->stat.frame.i_mb_cbp[!b_intra + 4] += h->mb.i_cbp_chroma >> 1;
                }
            }
            if( h->mb.i_cbp_luma && !b_intra )
            {
                h->stat.frame.i_mb_count_8x8dct[0] ++;
                h->stat.frame.i_mb_count_8x8dct[1] += h->mb.b_transform_8x8;
            }
            if( b_intra && h->mb.i_type != I_PCM )
            {
                if( h->mb.i_type == I_16x16 )
                    h->stat.frame.i_mb_pred_mode[0][h->mb.i_intra16x16_pred_mode]++;
                else if( h->mb.i_type == I_8x8 )
                    for( int i = 0; i < 16; i += 4 )
                        h->stat.frame.i_mb_pred_mode[1][h->mb.cache.intra4x4_pred_mode[x264_scan8[i]]]++;
                else //if( h->mb.i_type == I_4x4 )
                    for( int i = 0; i < 16; i++ )
                        h->stat.frame.i_mb_pred_mode[2][h->mb.cache.intra4x4_pred_mode[x264_scan8[i]]]++;
                h->stat.frame.i_mb_pred_mode[3][x264_mb_chroma_pred_mode_fix[h->mb.i_chroma_pred_mode]]++;
            }
            h->stat.frame.i_mb_field[b_intra?0:b_skip?2:1] += MB_INTERLACED;
        }

        /* calculate deblock strength values (actual deblocking is done per-row along with hpel) */
        if( b_deblock )
            x264_macroblock_deblock_strength( h );

        x264_ratecontrol_mb( h, mb_size );

        if( mb_xy == h->sh.i_last_mb )
            break;

        if( SLICE_MBAFF )
        {
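            /* Walk macroblocks in vertical-pair order: (x,y), (x,y+1), (x+1,y),
             * (x+1,y+1), ... After a bottom MB (odd y), advance x and flip back
             * to the top row; after a top MB, just drop to the bottom of the
             * pair. The row wrap below then skips to the next pair of rows. */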
            i_mb_x += i_mb_y & 1;
            i_mb_y ^= i_mb_x < h->mb.i_mb_width;
        }
        else
            i_mb_x++;
        if( i_mb_x == h->mb.i_mb_width )
        {
            i_mb_y++;
            i_mb_x = 0;
        }
    }
    h->out.nal[h->out.i_nal].i_last_mb = h->sh.i_last_mb;

    if( h->param.b_cabac )
    {
        x264_cabac_encode_flush( h, &h->cabac );
        h->out.bs.p = h->cabac.p;
    }
    else
    {
        if( i_skip > 0 )
            bs_write_ue( &h->out.bs, i_skip );  /* last skip run */
        /* rbsp_slice_trailing_bits */
        bs_rbsp_trailing( &h->out.bs );
        bs_flush( &h->out.bs );
    }
    if( x264_nal_end( h ) )
        return -1;

    if( h->sh.i_last_mb == (h->i_threadslice_end * h->mb.i_mb_width - 1) )
    {
        h->stat.frame.i_misc_bits = bs_pos( &h->out.bs )
                                  + (h->out.i_nal*NALU_OVERHEAD * 8)
                                  - h->stat.frame.i_tex_bits
                                  - h->stat.frame.i_mv_bits;
        x264_fdec_filter_row( h, h->i_threadslice_end, 1 );
    }

    return 0;
}

static void x264_thread_sync_context( x264_t *dst, x264_t *src )
{
    if( dst == src )
        return;

    // reference counting
    for( x264_frame_t **f = src->frames.reference; *f; f++ )
        (*f)->i_reference_count++;
    for( x264_frame_t **f = dst->frames.reference; *f; f++ )
        x264_frame_push_unused( src, *f );
    src->fdec->i_reference_count++;
    x264_frame_push_unused( src, dst->fdec );

    // copy everything except the per-thread pointers and the constants.
    memcpy( &dst->i_frame, &src->i_frame, offsetof(x264_t, mb.type) - offsetof(x264_t, i_frame) );
    dst->param = src->param;
    dst->stat = src->stat;
    dst->pixf = src->pixf;
}

static void x264_thread_sync_stat( x264_t *dst, x264_t *src )
{
    if( dst == src )
        return;
    memcpy( &dst->stat.i_frame_count, &src->stat.i_frame_count, sizeof(dst->stat) - sizeof(dst->stat.frame) );
}

static void *x264_slices_write( x264_t *h )
{
    int i_slice_num = 0;
    int last_thread_mb = h->sh.i_last_mb;

#if HAVE_VISUALIZE
    if( h->param.b_visualize )
        if( x264_visualize_init( h ) )
            return (void *)-1;
#endif

    /* init stats */
    memset( &h->stat.frame, 0, sizeof(h->stat.frame) );
    h->mb.b_reencode_mb = 0;
    while( h->sh.i_first_mb + SLICE_MBAFF*h->mb.i_mb_stride <= last_thread_mb )
    {
        h->sh.i_last_mb = last_thread_mb;
        if( h->param.i_slice_max_mbs )
        {
            if( SLICE_MBAFF )
            {
                // convert i_first_mb to MBAFF (pair-interleaved) addressing, add
                // slice-max-mbs, then convert back to raster addressing
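                // (in MBAFF a slice must end at the bottom macroblock of a
                // pair, so the endpoint is rounded to a pair boundary)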
                int last_mbaff = 2*(h->sh.i_first_mb % h->mb.i_mb_width)
                    + h->mb.i_mb_width*(h->sh.i_first_mb / h->mb.i_mb_width)
                    + h->param.i_slice_max_mbs - 1;
                int last_x = (last_mbaff % (2*h->mb.i_mb_width))/2;
                int last_y = (last_mbaff / (2*h->mb.i_mb_width))*2 + 1;
                h->sh.i_last_mb = last_x + h->mb.i_mb_stride*last_y;
            }
            else
                h->sh.i_last_mb = h->sh.i_first_mb + h->param.i_slice_max_mbs - 1;
        }
        else if( h->param.i_slice_count && !h->param.b_sliced_threads )
        {
            int height = h->mb.i_mb_height >> PARAM_INTERLACED;
            int width = h->mb.i_mb_width << PARAM_INTERLACED;
            i_slice_num++;
            h->sh.i_last_mb = (height * i_slice_num + h->param.i_slice_count/2) / h->param.i_slice_count * width - 1;
        }
        h->sh.i_last_mb = X264_MIN( h->sh.i_last_mb, last_thread_mb );
        if( x264_stack_align( x264_slice_write, h ) )
            return (void *)-1;
        h->sh.i_first_mb = h->sh.i_last_mb + 1;
        // if i_first_mb is not the last mb in a row then go to the next mb in MBAFF order
        if( SLICE_MBAFF && h->sh.i_first_mb % h->mb.i_mb_width )
            h->sh.i_first_mb -= h->mb.i_mb_stride;
    }

#if HAVE_VISUALIZE
    if( h->param.b_visualize )
    {
        x264_visualize_show( h );
        x264_visualize_close( h );
    }
#endif

    return (void *)0;
}

static int x264_threaded_slices_write( x264_t *h )
{
    /* set first/last mb and sync contexts */
    for( int i = 0; i < h->param.i_threads; i++ )
    {
        x264_t *t = h->thread[i];
        if( i )
        {
            t->param = h->param;
            memcpy( &t->i_frame, &h->i_frame, offsetof(x264_t, rc) - offsetof(x264_t, i_frame) );
        }
        int height = h->mb.i_mb_height >> PARAM_INTERLACED;
        t->i_threadslice_start = ((height *  i    + h->param.i_slice_count/2) / h->param.i_threads) << PARAM_INTERLACED;
        t->i_threadslice_end   = ((height * (i+1) + h->param.i_slice_count/2) / h->param.i_threads) << PARAM_INTERLACED;
        t->sh.i_first_mb = t->i_threadslice_start * h->mb.i_mb_width;
        t->sh.i_last_mb  =   t->i_threadslice_end * h->mb.i_mb_width - 1;
    }

    x264_stack_align( x264_analyse_weight_frame, h, h->mb.i_mb_height*16 + 16 );

    x264_threads_distribute_ratecontrol( h );

    /* dispatch */
    for( int i = 0; i < h->param.i_threads; i++ )
    {
        x264_threadpool_run( h->threadpool, (void*)x264_slices_write, h->thread[i] );
        h->thread[i]->b_thread_active = 1;
    }
    for( int i = 0; i < h->param.i_threads; i++ )
    {
        h->thread[i]->b_thread_active = 0;
        if( (intptr_t)x264_threadpool_wait( h->threadpool, h->thread[i] ) )
            return -1;
    }

    /* Go back and fix up the hpel on the borders between slices. */
    for( int i = 1; i < h->param.i_threads; i++ )
    {
        x264_fdec_filter_row( h->thread[i], h->thread[i]->i_threadslice_start + 1, 0 );
        if( SLICE_MBAFF )
            x264_fdec_filter_row( h->thread[i], h->thread[i]->i_threadslice_start + 2, 0 );
    }

    x264_threads_merge_ratecontrol( h );

    for( int i = 1; i < h->param.i_threads; i++ )
    {
        x264_t *t = h->thread[i];
        for( int j = 0; j < t->out.i_nal; j++ )
        {
            h->out.nal[h->out.i_nal] = t->out.nal[j];
            h->out.i_nal++;
            x264_nal_check_buffer( h );
        }
        /* All entries in stat.frame are ints except for ssd/ssim. */
        for( int j = 0; j < (offsetof(x264_t,stat.frame.i_ssd) - offsetof(x264_t,stat.frame.i_mv_bits)) / sizeof(int); j++ )
            ((int*)&h->stat.frame)[j] += ((int*)&t->stat.frame)[j];
        for( int j = 0; j < 3; j++ )
            h->stat.frame.i_ssd[j] += t->stat.frame.i_ssd[j];
        h->stat.frame.f_ssim += t->stat.frame.f_ssim;
        h->stat.frame.i_ssim_cnt += t->stat.frame.i_ssim_cnt;
    }

    return 0;
}

void x264_encoder_intra_refresh( x264_t *h )
{
    h = h->thread[h->i_thread_phase];
    h->b_queued_intra_refresh = 1;
}

int x264_encoder_invalidate_reference( x264_t *h, int64_t pts )
{
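    /* Mark every reference frame whose PTS is at or after the given timestamp
     * as corrupt, so later frames stop predicting from them; if no valid
     * references remain, x264_encoder_encode forces an IDR instead. If the
     * timestamp predates the last IDR, the stream has already been refreshed
     * and nothing needs marking. */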
    if( h->param.i_bframe )
    {
        x264_log( h, X264_LOG_ERROR, "x264_encoder_invalidate_reference is not supported with B-frames enabled\n" );
        return -1;
    }
    if( h->param.b_intra_refresh )
    {
        x264_log( h, X264_LOG_ERROR, "x264_encoder_invalidate_reference is not supported with intra refresh enabled\n" );
        return -1;
    }
    h = h->thread[h->i_thread_phase];
    if( pts >= h->i_last_idr_pts )
    {
        for( int i = 0; h->frames.reference[i]; i++ )
            if( pts <= h->frames.reference[i]->i_pts )
                h->frames.reference[i]->b_corrupt = 1;
        if( pts <= h->fdec->i_pts )
            h->fdec->b_corrupt = 1;
    }
    return 0;
}

/****************************************************************************
 * x264_encoder_encode:
 *  XXX: i_poc   : the POC of the picture currently being given to the encoder
 *       i_frame : the number of the frame being coded
 *  ex:  type frame poc
 *       I      0   2*0
 *       P      1   2*3
 *       B      2   2*1
 *       B      3   2*2
 *       P      4   2*6
 *       B      5   2*4
 *       B      6   2*5
 ****************************************************************************/
int     x264_encoder_encode( x264_t *h,
                             x264_nal_t **pp_nal, int *pi_nal,
                             x264_picture_t *pic_in,
                             x264_picture_t *pic_out )
{
    x264_t *thread_current, *thread_prev, *thread_oldest;
    int i_nal_type, i_nal_ref_idc, i_global_qp;
    int overhead = NALU_OVERHEAD;

    if( h->i_thread_frames > 1 )
    {
        thread_prev    = h->thread[ h->i_thread_phase ];
        h->i_thread_phase = (h->i_thread_phase + 1) % h->i_thread_frames;
        thread_current = h->thread[ h->i_thread_phase ];
        thread_oldest  = h->thread[ (h->i_thread_phase + 1) % h->i_thread_frames ];
        x264_thread_sync_context( thread_current, thread_prev );
        x264_thread_sync_ratecontrol( thread_current, thread_prev, thread_oldest );
        h = thread_current;
    }
    else
    {
        thread_current =
        thread_oldest  = h;
    }
#if HAVE_MMX
    if( h->param.cpu&X264_CPU_SSE_MISALIGN )
        x264_cpu_mask_misalign_sse();
#endif

    // ok to call this before encoding any frames, since the initial values of fdec have b_kept_as_ref=0
    if( x264_reference_update( h ) )
        return -1;
    h->fdec->i_lines_completed = -1;

    /* no data out */
    *pi_nal = 0;
    *pp_nal = NULL;

    /* ------------------- Setup new frame from picture -------------------- */
    if( pic_in != NULL )
    {
        /* 1: Copy the picture to a frame and move it to a buffer */
        x264_frame_t *fenc = x264_frame_pop_unused( h, 0 );
        if( !fenc )
            return -1;

        if( x264_frame_copy_picture( h, fenc, pic_in ) < 0 )
            return -1;

        if( h->param.i_width != 16 * h->mb.i_mb_width ||
            h->param.i_height != 16 * h->mb.i_mb_height )
            x264_frame_expand_border_mod16( h, fenc );

        fenc->i_frame = h->frames.i_input++;

        if( fenc->i_frame == 0 )
            h->frames.i_first_pts = fenc->i_pts;
        if( h->frames.i_bframe_delay && fenc->i_frame == h->frames.i_bframe_delay )
            h->frames.i_bframe_delay_time = fenc->i_pts - h->frames.i_first_pts;

        if( h->param.b_vfr_input && fenc->i_pts <= h->frames.i_largest_pts )
            x264_log( h, X264_LOG_WARNING, "non-strictly-monotonic PTS\n" );

        h->frames.i_second_largest_pts = h->frames.i_largest_pts;
        h->frames.i_largest_pts = fenc->i_pts;

        if( (fenc->i_pic_struct < PIC_STRUCT_AUTO) || (fenc->i_pic_struct > PIC_STRUCT_TRIPLE) )
            fenc->i_pic_struct = PIC_STRUCT_AUTO;

        if( fenc->i_pic_struct == PIC_STRUCT_AUTO )
        {
#if HAVE_INTERLACED
            int b_interlaced = fenc->param ? fenc->param->b_interlaced : h->param.b_interlaced;
#else
            int b_interlaced = 0;
#endif
            if( b_interlaced )
            {
                int b_tff = fenc->param ? fenc->param->b_tff : h->param.b_tff;
                fenc->i_pic_struct = b_tff ? PIC_STRUCT_TOP_BOTTOM : PIC_STRUCT_BOTTOM_TOP;
            }
            else
                fenc->i_pic_struct = PIC_STRUCT_PROGRESSIVE;
        }

        if( h->param.rc.b_mb_tree && h->param.rc.b_stat_read )
        {
            if( x264_macroblock_tree_read( h, fenc, pic_in->prop.quant_offsets ) )
                return -1;
        }
        else
            x264_stack_align( x264_adaptive_quant_frame, h, fenc, pic_in->prop.quant_offsets );

        if( pic_in->prop.quant_offsets_free )
            pic_in->prop.quant_offsets_free( pic_in->prop.quant_offsets );

        if( h->frames.b_have_lowres )
            x264_frame_init_lowres( h, fenc );

        /* 2: Place the frame into the queue for its slice type decision */
        x264_lookahead_put_frame( h, fenc );

        if( h->frames.i_input <= h->frames.i_delay + 1 - h->i_thread_frames )
        {
            /* Nothing yet to encode, waiting for filling of buffers */
            pic_out->i_type = X264_TYPE_AUTO;
            return 0;
        }
    }
    else
    {
        /* signal the lookahead thread to exit */
        x264_pthread_mutex_lock( &h->lookahead->ifbuf.mutex );
        h->lookahead->b_exit_thread = 1;
        x264_pthread_cond_broadcast( &h->lookahead->ifbuf.cv_fill );
        x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex );
    }

    h->i_frame++;
    /* 3: The picture is analyzed in the lookahead */
    if( !h->frames.current[0] )
        x264_lookahead_get_frames( h );

    if( !h->frames.current[0] && x264_lookahead_is_empty( h ) )
        return x264_encoder_frame_end( thread_oldest, thread_current, pp_nal, pi_nal, pic_out );

    /* ------------------- Get frame to be encoded ------------------------- */
    /* 4: get picture to encode */
    h->fenc = x264_frame_shift( h->frames.current );
    if( h->i_frame == h->i_thread_frames - 1 )
        h->i_reordered_pts_delay = h->fenc->i_reordered_pts;
    if( h->fenc->param )
    {
        x264_encoder_reconfig( h, h->fenc->param );
        if( h->fenc->param->param_free )
            h->fenc->param->param_free( h->fenc->param );
    }

    if( !IS_X264_TYPE_I( h->fenc->i_type ) )
    {
        int valid_refs_left = 0;
        for( int i = 0; h->frames.reference[i]; i++ )
            if( !h->frames.reference[i]->b_corrupt )
                valid_refs_left++;
        /* No valid reference frames left: force an IDR. */
        if( !valid_refs_left )
        {
            h->fenc->b_keyframe = 1;
            h->fenc->i_type = X264_TYPE_IDR;
        }
    }

    if( h->fenc->b_keyframe )
    {
        h->frames.i_last_keyframe = h->fenc->i_frame;
        if( h->fenc->i_type == X264_TYPE_IDR )
        {
            h->i_frame_num = 0;
            h->frames.i_last_idr = h->fenc->i_frame;
        }
    }
    h->sh.i_mmco_command_count =
    h->sh.i_mmco_remove_from_end = 0;
    h->b_ref_reorder[0] =
    h->b_ref_reorder[1] = 0;
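    /* POC counts in units of 2 so that the bottom field of an interlaced frame
     * can take the odd value, and restarts from 0 at each IDR. */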
    h->fdec->i_poc =
    h->fenc->i_poc = 2 * ( h->fenc->i_frame - X264_MAX( h->frames.i_last_idr, 0 ) );

    /* ------------------- Setup frame context ----------------------------- */
    /* 5: Init data dependent of frame type */
    if( h->fenc->i_type == X264_TYPE_IDR )
    {
        /* reset ref pictures */
        i_nal_type    = NAL_SLICE_IDR;
        i_nal_ref_idc = NAL_PRIORITY_HIGHEST;
        h->sh.i_type = SLICE_TYPE_I;
        x264_reference_reset( h );
        h->frames.i_poc_last_open_gop = -1;
    }
    else if( h->fenc->i_type == X264_TYPE_I )
    {
        i_nal_type    = NAL_SLICE;
        i_nal_ref_idc = NAL_PRIORITY_HIGH; /* Not completely true but for now it is (as all I/P are kept as ref) */
        h->sh.i_type = SLICE_TYPE_I;
        x264_reference_hierarchy_reset( h );
        if( h->param.b_open_gop )
            h->frames.i_poc_last_open_gop = h->fenc->b_keyframe ? h->fenc->i_poc : -1;
    }
    else if( h->fenc->i_type == X264_TYPE_P )
    {
        i_nal_type    = NAL_SLICE;
        i_nal_ref_idc = NAL_PRIORITY_HIGH; /* Not completely true but for now it is (as all I/P are kept as ref) */
        h->sh.i_type = SLICE_TYPE_P;
        x264_reference_hierarchy_reset( h );
        h->frames.i_poc_last_open_gop = -1;
    }
    else if( h->fenc->i_type == X264_TYPE_BREF )
    {
        i_nal_type    = NAL_SLICE;
        i_nal_ref_idc = h->param.i_bframe_pyramid == X264_B_PYRAMID_STRICT ? NAL_PRIORITY_LOW : NAL_PRIORITY_HIGH;
        h->sh.i_type = SLICE_TYPE_B;
        x264_reference_hierarchy_reset( h );
    }
    else    /* B frame */
    {
        i_nal_type    = NAL_SLICE;
        i_nal_ref_idc = NAL_PRIORITY_DISPOSABLE;
        h->sh.i_type = SLICE_TYPE_B;
    }

    h->fdec->i_type = h->fenc->i_type;
    h->fdec->i_frame = h->fenc->i_frame;
    h->fenc->b_kept_as_ref =
    h->fdec->b_kept_as_ref = i_nal_ref_idc != NAL_PRIORITY_DISPOSABLE && h->param.i_keyint_max > 1;

    h->fdec->i_pts = h->fenc->i_pts;
    if( h->frames.i_bframe_delay )
    {
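        /* DTS must lag PTS by the B-frame reorder depth: frame n takes the
         * reordered PTS of frame n - delay from a small circular buffer, while
         * the first delay frames instead shift their reordered PTS back by the
         * initial delay so that DTS stays monotonic. */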
        int64_t *prev_reordered_pts = thread_current->frames.i_prev_reordered_pts;
        h->fdec->i_dts = h->i_frame > h->frames.i_bframe_delay
                       ? prev_reordered_pts[ (h->i_frame - h->frames.i_bframe_delay) % h->frames.i_bframe_delay ]
                       : h->fenc->i_reordered_pts - h->frames.i_bframe_delay_time;
        prev_reordered_pts[ h->i_frame % h->frames.i_bframe_delay ] = h->fenc->i_reordered_pts;
    }
    else
        h->fdec->i_dts = h->fenc->i_reordered_pts;
    if( h->fenc->i_type == X264_TYPE_IDR )
        h->i_last_idr_pts = h->fdec->i_pts;

    /* ------------------- Init                ----------------------------- */
    /* build ref list 0/1 */
    x264_reference_build_list( h, h->fdec->i_poc );

    /* ---------------------- Write the bitstream -------------------------- */
    /* Init bitstream context */
    if( h->param.b_sliced_threads )
    {
        for( int i = 0; i < h->param.i_threads; i++ )
        {
            bs_init( &h->thread[i]->out.bs, h->thread[i]->out.p_bitstream, h->thread[i]->out.i_bitstream );
            h->thread[i]->out.i_nal = 0;
        }
    }
    else
    {
        bs_init( &h->out.bs, h->out.p_bitstream, h->out.i_bitstream );
        h->out.i_nal = 0;
    }

    if( h->param.b_aud )
    {
        int pic_type;

        if( h->sh.i_type == SLICE_TYPE_I )
            pic_type = 0;
        else if( h->sh.i_type == SLICE_TYPE_P )
            pic_type = 1;
        else if( h->sh.i_type == SLICE_TYPE_B )
            pic_type = 2;
        else
            pic_type = 7;

        x264_nal_start( h, NAL_AUD, NAL_PRIORITY_DISPOSABLE );
        bs_write( &h->out.bs, 3, pic_type );
        bs_rbsp_trailing( &h->out.bs );
        if( x264_nal_end( h ) )
            return -1;
        overhead += h->out.nal[h->out.i_nal-1].i_payload + NALU_OVERHEAD - (h->param.b_annexb && h->out.i_nal-1);
    }

    h->i_nal_type = i_nal_type;
    h->i_nal_ref_idc = i_nal_ref_idc;

    if( h->param.b_intra_refresh )
    {
        if( IS_X264_TYPE_I( h->fenc->i_type ) )
        {
            h->fdec->i_frames_since_pir = 0;
            h->b_queued_intra_refresh = 0;
            /* PIR is currently only supported with ref == 1, so any intra frame effectively refreshes
             * the whole frame and counts as an intra refresh. */
            h->fdec->f_pir_position = h->mb.i_mb_width;
        }
        else if( h->fenc->i_type == X264_TYPE_P )
        {
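            /* Periodic intra refresh: sweep a vertical column of forced-intra
             * macroblocks across the frame, advancing by roughly
             * (mb_width-1)/keyint_max columns per frame of display time so that
             * a full refresh completes within one keyframe interval. */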
            int pocdiff = (h->fdec->i_poc - h->fref[0][0]->i_poc)/2;
            float increment = X264_MAX( ((float)h->mb.i_mb_width-1) / h->param.i_keyint_max, 1 );
            h->fdec->f_pir_position = h->fref[0][0]->f_pir_position;
            h->fdec->i_frames_since_pir = h->fref[0][0]->i_frames_since_pir + pocdiff;
            if( h->fdec->i_frames_since_pir >= h->param.i_keyint_max ||
                (h->b_queued_intra_refresh && h->fdec->f_pir_position + 0.5 >= h->mb.i_mb_width) )
            {
                h->fdec->f_pir_position = 0;
                h->fdec->i_frames_since_pir = 0;
                h->b_queued_intra_refresh = 0;
                h->fenc->b_keyframe = 1;
            }
            h->fdec->i_pir_start_col = h->fdec->f_pir_position+0.5;
            h->fdec->f_pir_position += increment * pocdiff;
            h->fdec->i_pir_end_col = h->fdec->f_pir_position+0.5;
            /* If our intra refresh has reached the right side of the frame, we're done. */
            if( h->fdec->i_pir_end_col >= h->mb.i_mb_width - 1 )
                h->fdec->f_pir_position = h->mb.i_mb_width;
        }
    }

    if( h->fenc->b_keyframe )
    {
        /* Write SPS and PPS */
        if( h->param.b_repeat_headers )
        {
            /* generate sequence parameters */
            x264_nal_start( h, NAL_SPS, NAL_PRIORITY_HIGHEST );
            x264_sps_write( &h->out.bs, h->sps );
            if( x264_nal_end( h ) )
                return -1;
            overhead += h->out.nal[h->out.i_nal-1].i_payload + NALU_OVERHEAD;

            /* generate picture parameters */
            x264_nal_start( h, NAL_PPS, NAL_PRIORITY_HIGHEST );
            x264_pps_write( &h->out.bs, h->sps, h->pps );
            if( x264_nal_end( h ) )
                return -1;
            overhead += h->out.nal[h->out.i_nal-1].i_payload + NALU_OVERHEAD;
        }

        /* when frame threading is used, buffering period sei is written in x264_encoder_frame_end */
        if( h->i_thread_frames == 1 && h->sps->vui.b_nal_hrd_parameters_present )
        {
            x264_hrd_fullness( h );
            x264_nal_start( h, NAL_SEI, NAL_PRIORITY_DISPOSABLE );
            x264_sei_buffering_period_write( h, &h->out.bs );
            if( x264_nal_end( h ) )
                return -1;
            overhead += h->out.nal[h->out.i_nal-1].i_payload + NALU_OVERHEAD;
        }
    }

    /* write extra sei */
    for( int i = 0; i < h->fenc->extra_sei.num_payloads; i++ )
    {
        x264_nal_start( h, NAL_SEI, NAL_PRIORITY_DISPOSABLE );
        x264_sei_write( &h->out.bs, h->fenc->extra_sei.payloads[i].payload, h->fenc->extra_sei.payloads[i].payload_size,
                        h->fenc->extra_sei.payloads[i].payload_type );
        if( x264_nal_end( h ) )
            return -1;
        overhead += h->out.nal[h->out.i_nal-1].i_payload + NALU_OVERHEAD - (h->param.b_annexb && h->out.i_nal-1);
        if( h->fenc->extra_sei.sei_free && h->fenc->extra_sei.payloads[i].payload )
            h->fenc->extra_sei.sei_free( h->fenc->extra_sei.payloads[i].payload );
    }

    if( h->fenc->extra_sei.sei_free && h->fenc->extra_sei.payloads )
        h->fenc->extra_sei.sei_free( h->fenc->extra_sei.payloads );

    if( h->fenc->b_keyframe )
    {
        if( h->param.b_repeat_headers && h->fenc->i_frame == 0 )
        {
            /* identify ourselves */
            x264_nal_start( h, NAL_SEI, NAL_PRIORITY_DISPOSABLE );
            if( x264_sei_version_write( h, &h->out.bs ) )
                return -1;
            if( x264_nal_end( h ) )
                return -1;
            overhead += h->out.nal[h->out.i_nal-1].i_payload + NALU_OVERHEAD - (h->param.b_annexb && h->out.i_nal-1);
        }

        if( h->fenc->i_type != X264_TYPE_IDR )
        {
            int time_to_recovery = h->param.b_open_gop ? 0 : X264_MIN( h->mb.i_mb_width - 1, h->param.i_keyint_max ) + h->param.i_bframe - 1;
            x264_nal_start( h, NAL_SEI, NAL_PRIORITY_DISPOSABLE );
            x264_sei_recovery_point_write( h, &h->out.bs, time_to_recovery );
            if( x264_nal_end( h ) )
                return -1;
            overhead += h->out.nal[h->out.i_nal-1].i_payload + NALU_OVERHEAD - (h->param.b_annexb && h->out.i_nal-1);
        }

        if( h->param.i_frame_packing >= 0 )
        {
            x264_nal_start( h, NAL_SEI, NAL_PRIORITY_DISPOSABLE );
            x264_sei_frame_packing_write( h, &h->out.bs );
            if( x264_nal_end( h ) )
                return -1;
            overhead += h->out.nal[h->out.i_nal-1].i_payload + NALU_OVERHEAD - (h->param.b_annexb && h->out.i_nal-1);
        }
    }

    /* generate sei pic timing */
    if( h->sps->vui.b_pic_struct_present || h->sps->vui.b_nal_hrd_parameters_present )
    {
        x264_nal_start( h, NAL_SEI, NAL_PRIORITY_DISPOSABLE );
        x264_sei_pic_timing_write( h, &h->out.bs );
        if( x264_nal_end( h ) )
            return -1;
        overhead += h->out.nal[h->out.i_nal-1].i_payload + NALU_OVERHEAD - (h->param.b_annexb && h->out.i_nal-1);
    }

    /* As required by Blu-ray. */
    if( !IS_X264_TYPE_B( h->fenc->i_type ) && h->b_sh_backup )
    {
        h->b_sh_backup = 0;
        x264_nal_start( h, NAL_SEI, NAL_PRIORITY_DISPOSABLE );
        x264_sei_dec_ref_pic_marking_write( h, &h->out.bs );
        if( x264_nal_end( h ) )
            return -1;
        overhead += h->out.nal[h->out.i_nal-1].i_payload + NALU_OVERHEAD - (h->param.b_annexb && h->out.i_nal-1);
    }

    if( h->fenc->b_keyframe && h->param.b_intra_refresh )
        h->i_cpb_delay_pir_offset = h->fenc->i_cpb_delay;

    /* Init the rate control */
    /* FIXME: Include slice header bit cost. */
    x264_ratecontrol_start( h, h->fenc->i_qpplus1, overhead*8 );
    i_global_qp = x264_ratecontrol_qp( h );

    pic_out->i_qpplus1 =
    h->fdec->i_qpplus1 = i_global_qp + 1;

    if( h->param.rc.b_stat_read && h->sh.i_type != SLICE_TYPE_I )
    {
        x264_reference_build_list_optimal( h );
        x264_reference_check_reorder( h );
    }

    if( h->i_ref[0] )
        h->fdec->i_poc_l0ref0 = h->fref[0][0]->i_poc;

    /* ------------------------ Create slice header  ----------------------- */
    x264_slice_init( h, i_nal_type, i_global_qp );

    /*------------------------- Weights -------------------------------------*/
    if( h->sh.i_type == SLICE_TYPE_B )
        x264_macroblock_bipred_init( h );

    x264_weighted_pred_init( h );

    if( i_nal_ref_idc != NAL_PRIORITY_DISPOSABLE )
        h->i_frame_num++;

    /* Write frame */
    h->i_threadslice_start = 0;
    h->i_threadslice_end = h->mb.i_mb_height;
    if( h->i_thread_frames > 1 )
    {
        x264_threadpool_run( h->threadpool, (void*)x264_slices_write, h );
        h->b_thread_active = 1;
    }
    else if( h->param.b_sliced_threads )
    {
        if( x264_threaded_slices_write( h ) )
            return -1;
    }
    else
        if( (intptr_t)x264_slices_write( h ) )
            return -1;

    return x264_encoder_frame_end( thread_oldest, thread_current, pp_nal, pi_nal, pic_out );
}

static int x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
                                   x264_nal_t **pp_nal, int *pi_nal,
                                   x264_picture_t *pic_out )
{
    char psz_message[80];

    if( h->b_thread_active )
    {
        h->b_thread_active = 0;
        if( (intptr_t)x264_threadpool_wait( h->threadpool, h ) )
            return -1;
    }
    if( !h->out.i_nal )
    {
        pic_out->i_type = X264_TYPE_AUTO;
        return 0;
    }

    x264_emms();
    /* generate buffering period sei and insert it into place */
    if( h->i_thread_frames > 1 && h->fenc->b_keyframe && h->sps->vui.b_nal_hrd_parameters_present )
    {
        x264_hrd_fullness( h );
        x264_nal_start( h, NAL_SEI, NAL_PRIORITY_DISPOSABLE );
        x264_sei_buffering_period_write( h, &h->out.bs );
        if( x264_nal_end( h ) )
            return -1;
        /* buffering period sei must follow AUD, SPS and PPS and precede all other SEIs */
        int idx = 0;
        while( h->out.nal[idx].i_type == NAL_AUD ||
               h->out.nal[idx].i_type == NAL_SPS ||
               h->out.nal[idx].i_type == NAL_PPS )
            idx++;
        x264_nal_t nal_tmp = h->out.nal[h->out.i_nal-1];
        memmove( &h->out.nal[idx+1], &h->out.nal[idx], (h->out.i_nal-idx-1)*sizeof(x264_nal_t) );
        h->out.nal[idx] = nal_tmp;
    }

    int frame_size = x264_encoder_encapsulate_nals( h, 0 );
    if( frame_size < 0 )
        return -1;

    /* Set output picture properties */
    pic_out->i_type = h->fenc->i_type;

    pic_out->b_keyframe = h->fenc->b_keyframe;
    pic_out->i_pic_struct = h->fenc->i_pic_struct;

    pic_out->i_pts = h->fdec->i_pts;
    pic_out->i_dts = h->fdec->i_dts;

    if( pic_out->i_pts < pic_out->i_dts )
        x264_log( h, X264_LOG_WARNING, "invalid DTS: PTS is less than DTS\n" );

    pic_out->img.i_csp = h->fdec->i_csp;
#if HIGH_BIT_DEPTH
    pic_out->img.i_csp |= X264_CSP_HIGH_DEPTH;
#endif
    pic_out->img.i_plane = h->fdec->i_plane;
    for( int i = 0; i < pic_out->img.i_plane; i++ )
    {
        pic_out->img.i_stride[i] = h->fdec->i_stride[i] * sizeof(pixel);
        pic_out->img.plane[i] = (uint8_t*)h->fdec->plane[i];
    }

    x264_frame_push_unused( thread_current, h->fenc );

    /* ---------------------- Update encoder state ------------------------- */

    /* update rc */
    int filler = 0;
    if( x264_ratecontrol_end( h, frame_size * 8, &filler ) < 0 )
        return -1;

    pic_out->hrd_timing = h->fenc->hrd_timing;

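    /* Pad the frame with filler NALs until rate control's requested filler
     * is consumed. Each filler NAL costs `overhead` framing bytes on top of
     * its payload (one byte less with Annex B, where a 3-byte start code
     * replaces the 4-byte length field). A worked pass, assuming
     * overhead == 5: filler = 1000 with i_slice_max_size = 600 first writes
     * a 595-byte payload, leaving roughly 400 bytes of filler, which the
     * next iteration writes as a single 395-byte payload. */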
    while( filler > 0 )
    {
        int f, overhead;
        overhead = (FILLER_OVERHEAD - h->param.b_annexb);
        if( h->param.i_slice_max_size && filler > h->param.i_slice_max_size )
        {
            int next_size = filler - h->param.i_slice_max_size;
            int overflow = X264_MAX( overhead - next_size, 0 );
            f = h->param.i_slice_max_size - overhead - overflow;
        }
        else
            f = X264_MAX( 0, filler - overhead );

        x264_nal_start( h, NAL_FILLER, NAL_PRIORITY_DISPOSABLE );
        x264_filler_write( h, &h->out.bs, f );
        if( x264_nal_end( h ) )
            return -1;
        int total_size = x264_encoder_encapsulate_nals( h, h->out.i_nal-1 );
        if( total_size < 0 )
            return -1;
        frame_size += total_size;
        filler -= total_size;
    }

    /* End bitstream, set output */
    *pi_nal = h->out.i_nal;
    *pp_nal = h->out.nal;

    h->out.i_nal = 0;

    x264_noise_reduction_update( h );

    /* ---------------------- Compute/Print statistics --------------------- */
    x264_thread_sync_stat( h, h->thread[0] );

    /* Slice stat */
    h->stat.i_frame_count[h->sh.i_type]++;
    h->stat.i_frame_size[h->sh.i_type] += frame_size;
    h->stat.f_frame_qp[h->sh.i_type] += h->fdec->f_qp_avg_aq;

    for( int i = 0; i < X264_MBTYPE_MAX; i++ )
        h->stat.i_mb_count[h->sh.i_type][i] += h->stat.frame.i_mb_count[i];
    for( int i = 0; i < X264_PARTTYPE_MAX; i++ )
        h->stat.i_mb_partition[h->sh.i_type][i] += h->stat.frame.i_mb_partition[i];
    for( int i = 0; i < 2; i++ )
        h->stat.i_mb_count_8x8dct[i] += h->stat.frame.i_mb_count_8x8dct[i];
    for( int i = 0; i < 6; i++ )
        h->stat.i_mb_cbp[i] += h->stat.frame.i_mb_cbp[i];
    for( int i = 0; i < 4; i++ )
        for( int j = 0; j < 13; j++ )
            h->stat.i_mb_pred_mode[i][j] += h->stat.frame.i_mb_pred_mode[i][j];
    if( h->sh.i_type != SLICE_TYPE_I )
        for( int i_list = 0; i_list < 2; i_list++ )
            for( int i = 0; i < X264_REF_MAX*2; i++ )
                h->stat.i_mb_count_ref[h->sh.i_type][i_list][i] += h->stat.frame.i_mb_count_ref[i_list][i];
    for( int i = 0; i < 3; i++ )
        h->stat.i_mb_field[i] += h->stat.frame.i_mb_field[i];
    if( h->sh.i_type == SLICE_TYPE_P && h->param.analyse.i_weighted_pred >= X264_WEIGHTP_SIMPLE )
    {
        h->stat.i_wpred[0] += !!h->sh.weight[0][0].weightfn;
        h->stat.i_wpred[1] += !!h->sh.weight[0][1].weightfn || !!h->sh.weight[0][2].weightfn;
    }
    if( h->sh.i_type == SLICE_TYPE_B )
    {
        h->stat.i_direct_frames[ h->sh.b_direct_spatial_mv_pred ] ++;
        if( h->mb.b_direct_auto_write )
        {
            //FIXME somewhat arbitrary time constants
            if( h->stat.i_direct_score[0] + h->stat.i_direct_score[1] > h->mb.i_mb_count )
                for( int i = 0; i < 2; i++ )
                    h->stat.i_direct_score[i] = h->stat.i_direct_score[i] * 9/10;
            for( int i = 0; i < 2; i++ )
                h->stat.i_direct_score[i] += h->stat.frame.i_direct_score[i];
        }
    }
    else
        h->stat.i_consecutive_bframes[h->fenc->i_bframes]++;

    psz_message[0] = '\0';
    double dur = h->fenc->f_duration;
    h->stat.f_frame_duration[h->sh.i_type] += dur;
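    /* The quality stats below are weighted by frame duration so that the
     * averages printed at close remain meaningful for VFR input. */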
    if( h->param.analyse.b_psnr )
    {
        int64_t ssd[3] =
        {
            h->stat.frame.i_ssd[0],
            h->stat.frame.i_ssd[1],
            h->stat.frame.i_ssd[2],
        };
        int luma_size = h->param.i_width * h->param.i_height;
        int chroma_size = CHROMA_SIZE( luma_size );
        double psnr_y = x264_psnr( ssd[0], luma_size );
        double psnr_u = x264_psnr( ssd[1], chroma_size );
        double psnr_v = x264_psnr( ssd[2], chroma_size );

        h->stat.f_ssd_global[h->sh.i_type]   += dur * (ssd[0] + ssd[1] + ssd[2]);
        h->stat.f_psnr_average[h->sh.i_type] += dur * x264_psnr( ssd[0] + ssd[1] + ssd[2], luma_size + chroma_size*2 );
        h->stat.f_psnr_mean_y[h->sh.i_type]  += dur * psnr_y;
        h->stat.f_psnr_mean_u[h->sh.i_type]  += dur * psnr_u;
        h->stat.f_psnr_mean_v[h->sh.i_type]  += dur * psnr_v;

        snprintf( psz_message, 80, " PSNR Y:%5.2f U:%5.2f V:%5.2f", psnr_y, psnr_u, psnr_v );
    }

    if( h->param.analyse.b_ssim )
    {
        double ssim_y = h->stat.frame.f_ssim
                      / h->stat.frame.i_ssim_cnt;
        h->stat.f_ssim_mean_y[h->sh.i_type] += ssim_y * dur;
        snprintf( psz_message + strlen(psz_message), 80 - strlen(psz_message),
                  " SSIM Y:%.5f", ssim_y );
    }
    psz_message[79] = '\0';

    x264_log( h, X264_LOG_DEBUG,
                  "frame=%4d QP=%.2f NAL=%d Slice:%c Poc:%-3d I:%-4d P:%-4d SKIP:%-4d size=%d bytes%s\n",
              h->i_frame,
              h->fdec->f_qp_avg_aq,
              h->i_nal_ref_idc,
              h->sh.i_type == SLICE_TYPE_I ? 'I' : (h->sh.i_type == SLICE_TYPE_P ? 'P' : 'B' ),
              h->fdec->i_poc,
              h->stat.frame.i_mb_count_i,
              h->stat.frame.i_mb_count_p,
              h->stat.frame.i_mb_count_skip,
              frame_size,
              psz_message );

    // keep stats all in one place
    x264_thread_sync_stat( h->thread[0], h );
    // for use by the next frame
    x264_thread_sync_stat( thread_current, h );

#ifdef DEBUG_MB_TYPE
{
    static const char mb_chars[] = { 'i', 'i', 'I', 'C', 'P', '8', 'S',
        'D', '<', 'X', 'B', 'X', '>', 'B', 'B', 'B', 'B', '8', 'S' };
    for( int mb_xy = 0; mb_xy < h->mb.i_mb_width * h->mb.i_mb_height; mb_xy++ )
    {
        if( h->mb.type[mb_xy] < X264_MBTYPE_MAX && h->mb.type[mb_xy] >= 0 )
            fprintf( stderr, "%c ", mb_chars[ h->mb.type[mb_xy] ] );
        else
            fprintf( stderr, "? " );

        if( (mb_xy+1) % h->mb.i_mb_width == 0 )
            fprintf( stderr, "\n" );
    }
}
#endif

    /* Remove duplicate references; this must be done near the end, as it
     * breaks the h->fref[0] array by freeing some of its pointers. */
    for( int i = 0; i < h->i_ref[0]; i++ )
        if( h->fref[0][i] && h->fref[0][i]->b_duplicate )
        {
            x264_frame_push_blank_unused( h, h->fref[0][i] );
            h->fref[0][i] = 0;
        }

    if( h->param.psz_dump_yuv )
        x264_frame_dump( h );
    x264_emms();

    return frame_size;
}

static void x264_print_intra( int64_t *i_mb_count, double i_count, int b_print_pcm, char *intra )
{
    intra += sprintf( intra, "I16..4%s: %4.1f%% %4.1f%% %4.1f%%",
        b_print_pcm ? "..PCM" : "",
        i_mb_count[I_16x16]/ i_count,
        i_mb_count[I_8x8]  / i_count,
        i_mb_count[I_4x4]  / i_count );
    if( b_print_pcm )
        sprintf( intra, " %4.1f%%", i_mb_count[I_PCM]  / i_count );
}

/****************************************************************************
 * x264_encoder_close:
 ****************************************************************************/
void    x264_encoder_close  ( x264_t *h )
{
    int64_t i_yuv_size = FRAME_SIZE( h->param.i_width * h->param.i_height );
    int64_t i_mb_count_size[2][7] = {{0}};
    char buf[200];
    int b_print_pcm = h->stat.i_mb_count[SLICE_TYPE_I][I_PCM]
                   || h->stat.i_mb_count[SLICE_TYPE_P][I_PCM]
                   || h->stat.i_mb_count[SLICE_TYPE_B][I_PCM];

    x264_lookahead_delete( h );

    if( h->param.i_threads > 1 )
        x264_threadpool_delete( h->threadpool );
    if( h->i_thread_frames > 1 )
    {
        for( int i = 0; i < h->i_thread_frames; i++ )
            if( h->thread[i]->b_thread_active )
            {
                assert( h->thread[i]->fenc->i_reference_count == 1 );
                x264_frame_delete( h->thread[i]->fenc );
            }

        x264_t *thread_prev = h->thread[h->i_thread_phase];
        x264_thread_sync_ratecontrol( h, thread_prev, h );
        x264_thread_sync_ratecontrol( thread_prev, thread_prev, h );
        h->i_frame = thread_prev->i_frame + 1 - h->i_thread_frames;
    }
    h->i_frame++;

    /* Slices used and PSNR */
    for( int i = 0; i < 3; i++ )
    {
        static const uint8_t slice_order[] = { SLICE_TYPE_I, SLICE_TYPE_P, SLICE_TYPE_B };
        int i_slice = slice_order[i];

        if( h->stat.i_frame_count[i_slice] > 0 )
        {
            int i_count = h->stat.i_frame_count[i_slice];
            double dur =  h->stat.f_frame_duration[i_slice];
            if( h->param.analyse.b_psnr )
            {
                x264_log( h, X264_LOG_INFO,
                          "frame %c:%-5d Avg QP:%5.2f  size:%6.0f  PSNR Mean Y:%5.2f U:%5.2f V:%5.2f Avg:%5.2f Global:%5.2f\n",
                          slice_type_to_char[i_slice],
                          i_count,
                          h->stat.f_frame_qp[i_slice] / i_count,
                          (double)h->stat.i_frame_size[i_slice] / i_count,
                          h->stat.f_psnr_mean_y[i_slice] / dur, h->stat.f_psnr_mean_u[i_slice] / dur, h->stat.f_psnr_mean_v[i_slice] / dur,
                          h->stat.f_psnr_average[i_slice] / dur,
                          x264_psnr( h->stat.f_ssd_global[i_slice], dur * i_yuv_size ) );
            }
            else
            {
                x264_log( h, X264_LOG_INFO,
                          "frame %c:%-5d Avg QP:%5.2f  size:%6.0f\n",
                          slice_type_to_char[i_slice],
                          i_count,
                          h->stat.f_frame_qp[i_slice] / i_count,
                          (double)h->stat.i_frame_size[i_slice] / i_count );
            }
        }
    }
    if( h->param.i_bframe && h->stat.i_frame_count[SLICE_TYPE_B] )
    {
        char *p = buf;
        int den = 0;
        // weight by number of frames (including the I/P-frames) that are in a sequence of N B-frames
        for( int i = 0; i <= h->param.i_bframe; i++ )
            den += (i+1) * h->stat.i_consecutive_bframes[i];
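        /* e.g. with i_bframe = 2 and i_consecutive_bframes = { 4, 3, 1 }:
         * den = 1*4 + 2*3 + 3*1 = 13, so 30.8%, 46.2% and 23.1% of frames
         * live in runs of 0, 1 and 2 consecutive B-frames respectively. */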
        for( int i = 0; i <= h->param.i_bframe; i++ )
            p += sprintf( p, " %4.1f%%", 100. * (i+1) * h->stat.i_consecutive_bframes[i] / den );
        x264_log( h, X264_LOG_INFO, "consecutive B-frames:%s\n", buf );
    }

    for( int i_type = 0; i_type < 2; i_type++ )
        for( int i = 0; i < X264_PARTTYPE_MAX; i++ )
        {
            if( i == D_DIRECT_8x8 ) continue; /* direct is counted as its own type */
            i_mb_count_size[i_type][x264_mb_partition_pixel_table[i]] += h->stat.i_mb_partition[i_type][i];
        }

    /* MB types used */
    if( h->stat.i_frame_count[SLICE_TYPE_I] > 0 )
    {
        int64_t *i_mb_count = h->stat.i_mb_count[SLICE_TYPE_I];
        double i_count = h->stat.i_frame_count[SLICE_TYPE_I] * h->mb.i_mb_count / 100.0;
        x264_print_intra( i_mb_count, i_count, b_print_pcm, buf );
        x264_log( h, X264_LOG_INFO, "mb I  %s\n", buf );
    }
    if( h->stat.i_frame_count[SLICE_TYPE_P] > 0 )
    {
        int64_t *i_mb_count = h->stat.i_mb_count[SLICE_TYPE_P];
        double i_count = h->stat.i_frame_count[SLICE_TYPE_P] * h->mb.i_mb_count / 100.0;
        int64_t *i_mb_size = i_mb_count_size[SLICE_TYPE_P];
        x264_print_intra( i_mb_count, i_count, b_print_pcm, buf );
        x264_log( h, X264_LOG_INFO,
                  "mb P  %s  P16..4: %4.1f%% %4.1f%% %4.1f%% %4.1f%% %4.1f%%    skip:%4.1f%%\n",
                  buf,
                  i_mb_size[PIXEL_16x16] / (i_count*4),
                  (i_mb_size[PIXEL_16x8] + i_mb_size[PIXEL_8x16]) / (i_count*4),
                  i_mb_size[PIXEL_8x8] / (i_count*4),
                  (i_mb_size[PIXEL_8x4] + i_mb_size[PIXEL_4x8]) / (i_count*4),
                  i_mb_size[PIXEL_4x4] / (i_count*4),
                  i_mb_count[P_SKIP] / i_count );
    }
    if( h->stat.i_frame_count[SLICE_TYPE_B] > 0 )
    {
        int64_t *i_mb_count = h->stat.i_mb_count[SLICE_TYPE_B];
        double i_count = h->stat.i_frame_count[SLICE_TYPE_B] * h->mb.i_mb_count / 100.0;
        double i_mb_list_count;
        int64_t *i_mb_size = i_mb_count_size[SLICE_TYPE_B];
        int64_t list_count[3] = {0}; /* 0 == L0, 1 == L1, 2 == BI */
        x264_print_intra( i_mb_count, i_count, b_print_pcm, buf );
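        /* For whole-MB partitions, l0/l1 below are 1 when the partition
         * references list 0/list 1; the index l1 + l0*l1 maps (1,0) -> 0
         * (L0), (0,1) -> 1 (L1) and (1,1) -> 2 (BI). */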
        for( int i = 0; i < X264_PARTTYPE_MAX; i++ )
            for( int j = 0; j < 2; j++ )
            {
                int l0 = x264_mb_type_list_table[i][0][j];
                int l1 = x264_mb_type_list_table[i][1][j];
                if( l0 || l1 )
                    list_count[l1+l0*l1] += h->stat.i_mb_count[SLICE_TYPE_B][i] * 2;
            }
        list_count[0] += h->stat.i_mb_partition[SLICE_TYPE_B][D_L0_8x8];
        list_count[1] += h->stat.i_mb_partition[SLICE_TYPE_B][D_L1_8x8];
        list_count[2] += h->stat.i_mb_partition[SLICE_TYPE_B][D_BI_8x8];
        i_mb_count[B_DIRECT] += (h->stat.i_mb_partition[SLICE_TYPE_B][D_DIRECT_8x8]+2)/4;
        i_mb_list_count = (list_count[0] + list_count[1] + list_count[2]) / 100.0;
        sprintf( buf + strlen(buf), "  B16..8: %4.1f%% %4.1f%% %4.1f%%  direct:%4.1f%%  skip:%4.1f%%",
                 i_mb_size[PIXEL_16x16] / (i_count*4),
                 (i_mb_size[PIXEL_16x8] + i_mb_size[PIXEL_8x16]) / (i_count*4),
                 i_mb_size[PIXEL_8x8] / (i_count*4),
                 i_mb_count[B_DIRECT] / i_count,
                 i_mb_count[B_SKIP]   / i_count );
        if( i_mb_list_count != 0 )
            sprintf( buf + strlen(buf), "  L0:%4.1f%% L1:%4.1f%% BI:%4.1f%%",
                     list_count[0] / i_mb_list_count,
                     list_count[1] / i_mb_list_count,
                     list_count[2] / i_mb_list_count );
        x264_log( h, X264_LOG_INFO, "mb B  %s\n", buf );
    }

    x264_ratecontrol_summary( h );

    if( h->stat.i_frame_count[SLICE_TYPE_I] + h->stat.i_frame_count[SLICE_TYPE_P] + h->stat.i_frame_count[SLICE_TYPE_B] > 0 )
    {
#define SUM3(p) (p[SLICE_TYPE_I] + p[SLICE_TYPE_P] + p[SLICE_TYPE_B])
#define SUM3b(p,o) (p[SLICE_TYPE_I][o] + p[SLICE_TYPE_P][o] + p[SLICE_TYPE_B][o])
        int64_t i_i8x8 = SUM3b( h->stat.i_mb_count, I_8x8 );
        int64_t i_intra = i_i8x8 + SUM3b( h->stat.i_mb_count, I_4x4 )
                                 + SUM3b( h->stat.i_mb_count, I_16x16 );
        int64_t i_all_intra = i_intra + SUM3b( h->stat.i_mb_count, I_PCM);
        int64_t i_skip = SUM3b( h->stat.i_mb_count, P_SKIP )
                       + SUM3b( h->stat.i_mb_count, B_SKIP );
        const int i_count = h->stat.i_frame_count[SLICE_TYPE_I] +
                            h->stat.i_frame_count[SLICE_TYPE_P] +
                            h->stat.i_frame_count[SLICE_TYPE_B];
        int64_t i_mb_count = (int64_t)i_count * h->mb.i_mb_count;
        int64_t i_inter = i_mb_count - i_skip - i_intra;
        const double duration = h->stat.f_frame_duration[SLICE_TYPE_I] +
                                h->stat.f_frame_duration[SLICE_TYPE_P] +
                                h->stat.f_frame_duration[SLICE_TYPE_B];
        float f_bitrate = SUM3(h->stat.i_frame_size) / duration / 125;

        if( PARAM_INTERLACED )
        {
            char *fieldstats = buf;
            fieldstats[0] = 0;
            if( i_inter )
                fieldstats += sprintf( fieldstats, " inter:%.1f%%", h->stat.i_mb_field[1] * 100.0 / i_inter );
            if( i_skip )
                fieldstats += sprintf( fieldstats, " skip:%.1f%%", h->stat.i_mb_field[2] * 100.0 / i_skip );
            x264_log( h, X264_LOG_INFO, "field mbs: intra: %.1f%%%s\n",
                      h->stat.i_mb_field[0] * 100.0 / i_intra, buf );
        }

        if( h->pps->b_transform_8x8_mode )
        {
            buf[0] = 0;
            if( h->stat.i_mb_count_8x8dct[0] )
                sprintf( buf, " inter:%.1f%%", 100. * h->stat.i_mb_count_8x8dct[1] / h->stat.i_mb_count_8x8dct[0] );
            x264_log( h, X264_LOG_INFO, "8x8 transform intra:%.1f%%%s\n", 100. * i_i8x8 / i_intra, buf );
        }

        if( (h->param.analyse.i_direct_mv_pred == X264_DIRECT_PRED_AUTO ||
            (h->stat.i_direct_frames[0] && h->stat.i_direct_frames[1]))
            && h->stat.i_frame_count[SLICE_TYPE_B] )
        {
            x264_log( h, X264_LOG_INFO, "direct mvs  spatial:%.1f%% temporal:%.1f%%\n",
                      h->stat.i_direct_frames[1] * 100. / h->stat.i_frame_count[SLICE_TYPE_B],
                      h->stat.i_direct_frames[0] * 100. / h->stat.i_frame_count[SLICE_TYPE_B] );
        }

        buf[0] = 0;
        int csize = CHROMA444 ? 4 : 1;
        if( i_mb_count != i_all_intra )
            sprintf( buf, " inter: %.1f%% %.1f%% %.1f%%",
                     h->stat.i_mb_cbp[1] * 100.0 / ((i_mb_count - i_all_intra)*4),
                     h->stat.i_mb_cbp[3] * 100.0 / ((i_mb_count - i_all_intra)*csize),
                     h->stat.i_mb_cbp[5] * 100.0 / ((i_mb_count - i_all_intra)*csize) );
        x264_log( h, X264_LOG_INFO, "coded y,%s,%s intra: %.1f%% %.1f%% %.1f%%%s\n",
                  CHROMA444?"u":"uvDC", CHROMA444?"v":"uvAC",
                  h->stat.i_mb_cbp[0] * 100.0 / (i_all_intra*4),
                  h->stat.i_mb_cbp[2] * 100.0 / (i_all_intra*csize),
                  h->stat.i_mb_cbp[4] * 100.0 / (i_all_intra*csize), buf );

        int64_t fixed_pred_modes[4][9] = {{0}};
        int64_t sum_pred_modes[4] = {0};
        for( int i = 0; i <= I_PRED_16x16_DC_128; i++ )
        {
            fixed_pred_modes[0][x264_mb_pred_mode16x16_fix[i]] += h->stat.i_mb_pred_mode[0][i];
            sum_pred_modes[0] += h->stat.i_mb_pred_mode[0][i];
        }
        if( sum_pred_modes[0] )
            x264_log( h, X264_LOG_INFO, "i16 v,h,dc,p: %2.0f%% %2.0f%% %2.0f%% %2.0f%%\n",
                      fixed_pred_modes[0][0] * 100.0 / sum_pred_modes[0],
                      fixed_pred_modes[0][1] * 100.0 / sum_pred_modes[0],
                      fixed_pred_modes[0][2] * 100.0 / sum_pred_modes[0],
                      fixed_pred_modes[0][3] * 100.0 / sum_pred_modes[0] );
        for( int i = 1; i <= 2; i++ )
        {
            for( int j = 0; j <= I_PRED_8x8_DC_128; j++ )
            {
                fixed_pred_modes[i][x264_mb_pred_mode4x4_fix(j)] += h->stat.i_mb_pred_mode[i][j];
                sum_pred_modes[i] += h->stat.i_mb_pred_mode[i][j];
            }
            if( sum_pred_modes[i] )
                x264_log( h, X264_LOG_INFO, "i%d v,h,dc,ddl,ddr,vr,hd,vl,hu: %2.0f%% %2.0f%% %2.0f%% %2.0f%% %2.0f%% %2.0f%% %2.0f%% %2.0f%% %2.0f%%\n", (3-i)*4,
                          fixed_pred_modes[i][0] * 100.0 / sum_pred_modes[i],
                          fixed_pred_modes[i][1] * 100.0 / sum_pred_modes[i],
                          fixed_pred_modes[i][2] * 100.0 / sum_pred_modes[i],
                          fixed_pred_modes[i][3] * 100.0 / sum_pred_modes[i],
                          fixed_pred_modes[i][4] * 100.0 / sum_pred_modes[i],
                          fixed_pred_modes[i][5] * 100.0 / sum_pred_modes[i],
                          fixed_pred_modes[i][6] * 100.0 / sum_pred_modes[i],
                          fixed_pred_modes[i][7] * 100.0 / sum_pred_modes[i],
                          fixed_pred_modes[i][8] * 100.0 / sum_pred_modes[i] );
        }
        for( int i = 0; i <= I_PRED_CHROMA_DC_128; i++ )
        {
            fixed_pred_modes[3][x264_mb_chroma_pred_mode_fix[i]] += h->stat.i_mb_pred_mode[3][i];
            sum_pred_modes[3] += h->stat.i_mb_pred_mode[3][i];
        }
        if( sum_pred_modes[3] && !CHROMA444 )
            x264_log( h, X264_LOG_INFO, "i8c dc,h,v,p: %2.0f%% %2.0f%% %2.0f%% %2.0f%%\n",
                      fixed_pred_modes[3][0] * 100.0 / sum_pred_modes[3],
                      fixed_pred_modes[3][1] * 100.0 / sum_pred_modes[3],
                      fixed_pred_modes[3][2] * 100.0 / sum_pred_modes[3],
                      fixed_pred_modes[3][3] * 100.0 / sum_pred_modes[3] );

        if( h->param.analyse.i_weighted_pred >= X264_WEIGHTP_SIMPLE && h->stat.i_frame_count[SLICE_TYPE_P] > 0 )
            x264_log( h, X264_LOG_INFO, "Weighted P-Frames: Y:%.1f%% UV:%.1f%%\n",
                      h->stat.i_wpred[0] * 100.0 / h->stat.i_frame_count[SLICE_TYPE_P],
                      h->stat.i_wpred[1] * 100.0 / h->stat.i_frame_count[SLICE_TYPE_P] );

        for( int i_list = 0; i_list < 2; i_list++ )
            for( int i_slice = 0; i_slice < 2; i_slice++ )
            {
                char *p = buf;
                int64_t i_den = 0;
                int i_max = 0;
                for( int i = 0; i < X264_REF_MAX*2; i++ )
                    if( h->stat.i_mb_count_ref[i_slice][i_list][i] )
                    {
                        i_den += h->stat.i_mb_count_ref[i_slice][i_list][i];
                        i_max = i;
                    }
                if( i_max == 0 )
                    continue;
                for( int i = 0; i <= i_max; i++ )
                    p += sprintf( p, " %4.1f%%", 100. * h->stat.i_mb_count_ref[i_slice][i_list][i] / i_den );
                x264_log( h, X264_LOG_INFO, "ref %c L%d:%s\n", "PB"[i_slice], i_list, buf );
            }

        if( h->param.analyse.b_ssim )
        {
            float ssim = SUM3( h->stat.f_ssim_mean_y ) / duration;
            x264_log( h, X264_LOG_INFO, "SSIM Mean Y:%.7f (%6.3fdb)\n", ssim, x264_ssim( ssim ) );
        }
        if( h->param.analyse.b_psnr )
        {
            x264_log( h, X264_LOG_INFO,
                      "PSNR Mean Y:%6.3f U:%6.3f V:%6.3f Avg:%6.3f Global:%6.3f kb/s:%.2f\n",
                      SUM3( h->stat.f_psnr_mean_y ) / duration,
                      SUM3( h->stat.f_psnr_mean_u ) / duration,
                      SUM3( h->stat.f_psnr_mean_v ) / duration,
                      SUM3( h->stat.f_psnr_average ) / duration,
                      x264_psnr( SUM3( h->stat.f_ssd_global ), duration * i_yuv_size ),
                      f_bitrate );
        }
        else
            x264_log( h, X264_LOG_INFO, "kb/s:%.2f\n", f_bitrate );
    }

    /* rc */
    x264_ratecontrol_delete( h );

    /* param */
    if( h->param.rc.psz_stat_out )
        free( h->param.rc.psz_stat_out );
    if( h->param.rc.psz_stat_in )
        free( h->param.rc.psz_stat_in );

    x264_cqm_delete( h );
    x264_free( h->nal_buffer );
    x264_analyse_free_costs( h );

    if( h->i_thread_frames > 1)
        h = h->thread[h->i_thread_phase];

    /* frames */
    x264_frame_delete_list( h->frames.unused[0] );
    x264_frame_delete_list( h->frames.unused[1] );
    x264_frame_delete_list( h->frames.current );
    x264_frame_delete_list( h->frames.blank_unused );

    h = h->thread[0];

    for( int i = 0; i < h->i_thread_frames; i++ )
        if( h->thread[i]->b_thread_active )
            for( int j = 0; j < h->thread[i]->i_ref[0]; j++ )
                if( h->thread[i]->fref[0][j] && h->thread[i]->fref[0][j]->b_duplicate )
                    x264_frame_delete( h->thread[i]->fref[0][j] );

    for( int i = h->param.i_threads - 1; i >= 0; i-- )
    {
        x264_frame_t **frame;

        if( !h->param.b_sliced_threads || i == 0 )
        {
            for( frame = h->thread[i]->frames.reference; *frame; frame++ )
            {
                assert( (*frame)->i_reference_count > 0 );
                (*frame)->i_reference_count--;
                if( (*frame)->i_reference_count == 0 )
                    x264_frame_delete( *frame );
            }
            frame = &h->thread[i]->fdec;
            if( *frame )
            {
                assert( (*frame)->i_reference_count > 0 );
                (*frame)->i_reference_count--;
                if( (*frame)->i_reference_count == 0 )
                    x264_frame_delete( *frame );
            }
            x264_macroblock_cache_free( h->thread[i] );
        }
        x264_macroblock_thread_free( h->thread[i], 0 );
        x264_free( h->thread[i]->out.p_bitstream );
        x264_free( h->thread[i]->out.nal );
        x264_free( h->thread[i] );
    }
}

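/* Report how many frames are still buffered inside the encoder: frames in
 * flight on other frame threads, frames queued for encoding, and frames
 * sitting in the lookahead buffers. At end of stream, callers drain these
 * by calling x264_encoder_encode() with a NULL input picture until this
 * returns zero. */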
int x264_encoder_delayed_frames( x264_t *h )
{
    int delayed_frames = 0;
    if( h->i_thread_frames > 1 )
    {
        for( int i = 0; i < h->i_thread_frames; i++ )
            delayed_frames += h->thread[i]->b_thread_active;
        h = h->thread[h->i_thread_phase];
    }
    for( int i = 0; h->frames.current[i]; i++ )
        delayed_frames++;
    x264_pthread_mutex_lock( &h->lookahead->ofbuf.mutex );
    x264_pthread_mutex_lock( &h->lookahead->ifbuf.mutex );
    x264_pthread_mutex_lock( &h->lookahead->next.mutex );
    delayed_frames += h->lookahead->ifbuf.i_size + h->lookahead->next.i_size + h->lookahead->ofbuf.i_size;
    x264_pthread_mutex_unlock( &h->lookahead->next.mutex );
    x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex );
    x264_pthread_mutex_unlock( &h->lookahead->ofbuf.mutex );
    return delayed_frames;
}

int x264_encoder_maximum_delayed_frames( x264_t *h )
{
    return h->frames.i_delay;
}
x264-snapshot-20120103-2245-stable/encoder/cabac.c
/*****************************************************************************
 * cabac.c: cabac bitstream writing
 *****************************************************************************
 * Copyright (C) 2003-2011 x264 project
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
 *          Jason Garrett-Glaser <darkshikari@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "common/common.h"
#include "macroblock.h"

#ifndef RDO_SKIP_BS
#define RDO_SKIP_BS 0
#endif

static inline void x264_cabac_mb_type_intra( x264_t *h, x264_cabac_t *cb, int i_mb_type,
                    int ctx0, int ctx1, int ctx2, int ctx3, int ctx4, int ctx5 )
{
    if( i_mb_type == I_4x4 || i_mb_type == I_8x8 )
    {
        x264_cabac_encode_decision_noup( cb, ctx0, 0 );
    }
#if !RDO_SKIP_BS
    else if( i_mb_type == I_PCM )
    {
        x264_cabac_encode_decision_noup( cb, ctx0, 1 );
        x264_cabac_encode_flush( h, cb );
    }
#endif
    else
    {
        int i_pred = x264_mb_pred_mode16x16_fix[h->mb.i_intra16x16_pred_mode];

        x264_cabac_encode_decision_noup( cb, ctx0, 1 );
        x264_cabac_encode_terminal( cb );

        x264_cabac_encode_decision_noup( cb, ctx1, !!h->mb.i_cbp_luma );
        if( h->mb.i_cbp_chroma == 0 )
            x264_cabac_encode_decision_noup( cb, ctx2, 0 );
        else
        {
            x264_cabac_encode_decision( cb, ctx2, 1 );
            x264_cabac_encode_decision_noup( cb, ctx3, h->mb.i_cbp_chroma>>1 );
        }
        x264_cabac_encode_decision( cb, ctx4, i_pred>>1 );
        x264_cabac_encode_decision_noup( cb, ctx5, i_pred&1 );
    }
}

#if !RDO_SKIP_BS
static void x264_cabac_field_decoding_flag( x264_t *h, x264_cabac_t *cb )
{
    int ctx = 0;
    ctx += h->mb.field_decoding_flag & !!h->mb.i_mb_x;
    ctx += (h->mb.i_mb_top_mbpair_xy >= 0
            && h->mb.slice_table[h->mb.i_mb_top_mbpair_xy] == h->sh.i_first_mb
            && h->mb.field[h->mb.i_mb_top_mbpair_xy]);

    x264_cabac_encode_decision_noup( cb, 70 + ctx, MB_INTERLACED );
    h->mb.field_decoding_flag = MB_INTERLACED;
}
#endif

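/* Code one intra 4x4/8x8 prediction mode: a single bin signals whether the
 * predicted (most probable) mode is used; otherwise three fixed-length bins
 * code the remaining mode, with the predicted mode removed from the code
 * space (hence the i_mode-- below). */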
static void x264_cabac_intra4x4_pred_mode( x264_cabac_t *cb, int i_pred, int i_mode )
{
    if( i_pred == i_mode )
        x264_cabac_encode_decision( cb, 68, 1 );
    else
    {
        x264_cabac_encode_decision( cb, 68, 0 );
        if( i_mode > i_pred  )
            i_mode--;
        x264_cabac_encode_decision( cb, 69, (i_mode     )&0x01 );
        x264_cabac_encode_decision( cb, 69, (i_mode >> 1)&0x01 );
        x264_cabac_encode_decision( cb, 69, (i_mode >> 2)      );
    }
}

static void x264_cabac_intra_chroma_pred_mode( x264_t *h, x264_cabac_t *cb )
{
    int i_mode = x264_mb_chroma_pred_mode_fix[h->mb.i_chroma_pred_mode];
    int ctx = 0;

    /* No need to test for I4x4 or I_16x16 as cache_save handles that */
    if( (h->mb.i_neighbour & MB_LEFT) && h->mb.chroma_pred_mode[h->mb.i_mb_left_xy[0]] != 0 )
        ctx++;
    if( (h->mb.i_neighbour & MB_TOP) && h->mb.chroma_pred_mode[h->mb.i_mb_top_xy] != 0 )
        ctx++;

    x264_cabac_encode_decision_noup( cb, 64 + ctx, i_mode > 0 );
    if( i_mode > 0 )
    {
        x264_cabac_encode_decision( cb, 64 + 3, i_mode > 1 );
        if( i_mode > 1 )
            x264_cabac_encode_decision_noup( cb, 64 + 3, i_mode > 2 );
    }
}

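/* Luma CBP: four bins, one per 8x8 block, each conditioned on whether the
 * corresponding neighbouring 8x8 blocks (left and top) were coded. The
 * context is written as 76 minus the two neighbour coded flags, i.e. it
 * spans ctx 73-76. */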
static void x264_cabac_cbp_luma( x264_t *h, x264_cabac_t *cb )
{
    int cbp = h->mb.i_cbp_luma;
    int cbp_l = h->mb.cache.i_cbp_left;
    int cbp_t = h->mb.cache.i_cbp_top;
    x264_cabac_encode_decision     ( cb, 76 - ((cbp_l >> 1) & 1) - ((cbp_t >> 1) & 2), (cbp >> 0) & 1 );
    x264_cabac_encode_decision     ( cb, 76 - ((cbp   >> 0) & 1) - ((cbp_t >> 2) & 2), (cbp >> 1) & 1 );
    x264_cabac_encode_decision     ( cb, 76 - ((cbp_l >> 3) & 1) - ((cbp   << 1) & 2), (cbp >> 2) & 1 );
    x264_cabac_encode_decision_noup( cb, 76 - ((cbp   >> 2) & 1) - ((cbp   >> 0) & 2), (cbp >> 3) & 1 );
}

static void x264_cabac_cbp_chroma( x264_t *h, x264_cabac_t *cb )
{
    int cbp_a = h->mb.cache.i_cbp_left & 0x30;
    int cbp_b = h->mb.cache.i_cbp_top  & 0x30;
    int ctx = 0;

    if( cbp_a && h->mb.cache.i_cbp_left != -1 ) ctx++;
    if( cbp_b && h->mb.cache.i_cbp_top  != -1 ) ctx+=2;
    if( h->mb.i_cbp_chroma == 0 )
        x264_cabac_encode_decision_noup( cb, 77 + ctx, 0 );
    else
    {
        x264_cabac_encode_decision_noup( cb, 77 + ctx, 1 );

        ctx = 4;
        if( cbp_a == 0x20 ) ctx++;
        if( cbp_b == 0x20 ) ctx += 2;
        x264_cabac_encode_decision_noup( cb, 77 + ctx, h->mb.i_cbp_chroma >> 1 );
    }
}

static void x264_cabac_qp_delta( x264_t *h, x264_cabac_t *cb )
{
    int i_dqp = h->mb.i_qp - h->mb.i_last_qp;
    int ctx;

    /* Avoid writing a delta quant if we have an empty i16x16 block, e.g. in a completely flat background area */
    if( h->mb.i_type == I_16x16 && !h->mb.cbp[h->mb.i_mb_xy] )
    {
#if !RDO_SKIP_BS
        h->mb.i_qp = h->mb.i_last_qp;
#endif
        i_dqp = 0;
    }

    /* Since, per the above, empty-CBP I16x16 blocks never have delta quants,
     * we don't have to check for them. */
    ctx = h->mb.i_last_dqp && h->mb.cbp[h->mb.i_mb_prev_xy];

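    /* mb_qp_delta uses a unary binarization: dqp <= 0 maps to -2*dqp and
     * dqp > 0 maps to 2*dqp - 1, so e.g. dqp = +3 codes five 1-bins and
     * dqp = -3 codes six, before the terminating 0 below. */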
    if( i_dqp != 0 )
    {
        int val = i_dqp <= 0 ? (-2*i_dqp) : (2*i_dqp - 1);
        /* dqp is interpreted modulo (QP_MAX_SPEC+1) */
        if( val >= QP_MAX_SPEC && val != QP_MAX_SPEC+1 )
            val = 2*QP_MAX_SPEC+1 - val;
        do
        {
            x264_cabac_encode_decision( cb, 60 + ctx, 1 );
            ctx = 2+(ctx>>1);
        } while( --val );
    }
    x264_cabac_encode_decision_noup( cb, 60 + ctx, 0 );
}

#if !RDO_SKIP_BS
void x264_cabac_mb_skip( x264_t *h, int b_skip )
{
    int ctx = h->mb.cache.i_neighbour_skip + 11;
    if( h->sh.i_type != SLICE_TYPE_P )
        ctx += 13;
    x264_cabac_encode_decision( &h->cabac, ctx, b_skip );
}
#endif

static inline void x264_cabac_subpartition_p( x264_cabac_t *cb, int i_sub )
{
    if( i_sub == D_L0_8x8 )
    {
        x264_cabac_encode_decision( cb, 21, 1 );
        return;
    }
    x264_cabac_encode_decision( cb, 21, 0 );
    if( i_sub == D_L0_8x4 )
        x264_cabac_encode_decision( cb, 22, 0 );
    else
    {
        x264_cabac_encode_decision( cb, 22, 1 );
        x264_cabac_encode_decision( cb, 23, i_sub == D_L0_4x8 );
    }
}

static ALWAYS_INLINE void x264_cabac_subpartition_b( x264_cabac_t *cb, int i_sub )
{
    if( i_sub == D_DIRECT_8x8 )
    {
        x264_cabac_encode_decision( cb, 36, 0 );
        return;
    }
    x264_cabac_encode_decision( cb, 36, 1 );
    if( i_sub == D_BI_8x8 )
    {
        x264_cabac_encode_decision( cb, 37, 1 );
        x264_cabac_encode_decision( cb, 38, 0 );
        x264_cabac_encode_decision( cb, 39, 0 );
        x264_cabac_encode_decision( cb, 39, 0 );
        return;
    }
    x264_cabac_encode_decision( cb, 37, 0 );
    x264_cabac_encode_decision( cb, 39, i_sub == D_L1_8x8 );
}

static ALWAYS_INLINE void x264_cabac_transform_size( x264_t *h, x264_cabac_t *cb )
{
    int ctx = 399 + h->mb.cache.i_neighbour_transform_size;
    x264_cabac_encode_decision_noup( cb, ctx, h->mb.b_transform_8x8 );
}

static ALWAYS_INLINE void x264_cabac_ref_internal( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int bframe )
{
    const int i8 = x264_scan8[idx];
    const int i_refa = h->mb.cache.ref[i_list][i8 - 1];
    const int i_refb = h->mb.cache.ref[i_list][i8 - 8];
    int ctx = 0;

    if( i_refa > 0 && (!bframe || !h->mb.cache.skip[i8 - 1]) )
        ctx++;
    if( i_refb > 0 && (!bframe || !h->mb.cache.skip[i8 - 8]) )
        ctx += 2;

    for( int i_ref = h->mb.cache.ref[i_list][i8]; i_ref > 0; i_ref-- )
    {
        x264_cabac_encode_decision( cb, 54 + ctx, 1 );
        ctx = (ctx>>2)+4;
    }
    x264_cabac_encode_decision( cb, 54 + ctx, 0 );
}

static NOINLINE void x264_cabac_ref_p( x264_t *h, x264_cabac_t *cb, int idx )
{
    x264_cabac_ref_internal( h, cb, 0, idx, 0 );
}
static NOINLINE void x264_cabac_ref_b( x264_t *h, x264_cabac_t *cb, int i_list, int idx )
{
    x264_cabac_ref_internal( h, cb, i_list, idx, 1 );
}

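/* Code one MVD component with H.264's UEG3 binarization: a context-coded
 * truncated unary prefix of up to nine 1s (the first bin doubling as the
 * nonzero flag), an Exp-Golomb k=3 bypass suffix for |mvd| >= 9, and a
 * bypass-coded sign. E.g. mvd = -12 codes nine 1s, then ue(3) of
 * 12 - 9 = 3, then the sign bin. */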
static ALWAYS_INLINE int x264_cabac_mvd_cpn( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int l, int mvd, int ctx )
{
    int ctxbase = l ? 47 : 40;

    if( mvd == 0 )
    {
        x264_cabac_encode_decision( cb, ctxbase + ctx, 0 );
        return 0;
    }

    int i_abs = abs( mvd );
    x264_cabac_encode_decision( cb, ctxbase + ctx, 1 );
#if RDO_SKIP_BS
    if( i_abs <= 3 )
    {
        for( int i = 1; i < i_abs; i++ )
            x264_cabac_encode_decision( cb, ctxbase + i + 2, 1 );
        x264_cabac_encode_decision( cb, ctxbase + i_abs + 2, 0 );
        x264_cabac_encode_bypass( cb, mvd >> 31 );
    }
    else
    {
        x264_cabac_encode_decision( cb, ctxbase + 3, 1 );
        x264_cabac_encode_decision( cb, ctxbase + 4, 1 );
        x264_cabac_encode_decision( cb, ctxbase + 5, 1 );
        if( i_abs < 9 )
        {
            cb->f8_bits_encoded += cabac_size_unary[i_abs - 3][cb->state[ctxbase+6]];
            cb->state[ctxbase+6] = cabac_transition_unary[i_abs - 3][cb->state[ctxbase+6]];
        }
        else
        {
            cb->f8_bits_encoded += cabac_size_5ones[cb->state[ctxbase+6]];
            cb->state[ctxbase+6] = cabac_transition_5ones[cb->state[ctxbase+6]];
            x264_cabac_encode_ue_bypass( cb, 3, i_abs - 9 );
        }
    }
#else
    static const uint8_t ctxes[8] = { 3,4,5,6,6,6,6,6 };

    if( i_abs < 9 )
    {
        for( int i = 1; i < i_abs; i++ )
            x264_cabac_encode_decision( cb, ctxbase + ctxes[i-1], 1 );
        x264_cabac_encode_decision( cb, ctxbase + ctxes[i_abs-1], 0 );
    }
    else
    {
        for( int i = 1; i < 9; i++ )
            x264_cabac_encode_decision( cb, ctxbase + ctxes[i-1], 1 );
        x264_cabac_encode_ue_bypass( cb, 3, i_abs - 9 );
    }
    x264_cabac_encode_bypass( cb, mvd >> 31 );
#endif
    /* Since we don't need to keep track of MVDs larger than 66, just cap the value.
     * This lets us store MVDs as 8-bit values instead of 16-bit. */
    return X264_MIN( i_abs, 66 );
}

static NOINLINE uint16_t x264_cabac_mvd( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int width )
{
    ALIGNED_4( int16_t mvp[2] );
    int mdx, mdy;

    /* Calculate mvd */
    x264_mb_predict_mv( h, i_list, idx, width, mvp );
    mdx = h->mb.cache.mv[i_list][x264_scan8[idx]][0] - mvp[0];
    mdy = h->mb.cache.mv[i_list][x264_scan8[idx]][1] - mvp[1];
    uint16_t amvd = x264_cabac_mvd_sum(h->mb.cache.mvd[i_list][x264_scan8[idx] - 1],
                                       h->mb.cache.mvd[i_list][x264_scan8[idx] - 8]);

    /* encode */
    mdx = x264_cabac_mvd_cpn( h, cb, i_list, idx, 0, mdx, amvd&0xFF );
    mdy = x264_cabac_mvd_cpn( h, cb, i_list, idx, 1, mdy, amvd>>8 );

    return pack8to16(mdx,mdy);
}

#define x264_cabac_mvd(h,cb,i_list,idx,width,height)\
do\
{\
    uint16_t mvd = x264_cabac_mvd(h,cb,i_list,idx,width);\
    x264_macroblock_cache_mvd( h, block_idx_x[idx], block_idx_y[idx], width, height, i_list, mvd );\
} while(0)

static inline void x264_cabac_8x8_mvd( x264_t *h, x264_cabac_t *cb, int i )
{
    switch( h->mb.i_sub_partition[i] )
    {
        case D_L0_8x8:
            x264_cabac_mvd( h, cb, 0, 4*i, 2, 2 );
            break;
        case D_L0_8x4:
            x264_cabac_mvd( h, cb, 0, 4*i+0, 2, 1 );
            x264_cabac_mvd( h, cb, 0, 4*i+2, 2, 1 );
            break;
        case D_L0_4x8:
            x264_cabac_mvd( h, cb, 0, 4*i+0, 1, 2 );
            x264_cabac_mvd( h, cb, 0, 4*i+1, 1, 2 );
            break;
        case D_L0_4x4:
            x264_cabac_mvd( h, cb, 0, 4*i+0, 1, 1 );
            x264_cabac_mvd( h, cb, 0, 4*i+1, 1, 1 );
            x264_cabac_mvd( h, cb, 0, 4*i+2, 1, 1 );
            x264_cabac_mvd( h, cb, 0, 4*i+3, 1, 1 );
            break;
        default:
            assert(0);
    }
}

static ALWAYS_INLINE void x264_cabac_mb_header_i( x264_t *h, x264_cabac_t *cb, int i_mb_type, int slice_type, int chroma )
{
    if( slice_type == SLICE_TYPE_I )
    {
        int ctx = 0;
        if( (h->mb.i_neighbour & MB_LEFT) && h->mb.i_mb_type_left[0] != I_4x4 )
            ctx++;
        if( (h->mb.i_neighbour & MB_TOP) && h->mb.i_mb_type_top != I_4x4 )
            ctx++;

        x264_cabac_mb_type_intra( h, cb, i_mb_type, 3+ctx, 3+3, 3+4, 3+5, 3+6, 3+7 );
    }
    else if( slice_type == SLICE_TYPE_P )
    {
        /* prefix */
        x264_cabac_encode_decision_noup( cb, 14, 1 );

        /* suffix */
        x264_cabac_mb_type_intra( h, cb, i_mb_type, 17+0, 17+1, 17+2, 17+2, 17+3, 17+3 );
    }
    else if( slice_type == SLICE_TYPE_B )
    {
        /* prefix */
        x264_cabac_encode_decision_noup( cb, 27+3,   1 );
        x264_cabac_encode_decision_noup( cb, 27+4,   1 );
        x264_cabac_encode_decision( cb, 27+5,   1 );
        x264_cabac_encode_decision( cb, 27+5,   0 );
        x264_cabac_encode_decision( cb, 27+5,   1 );

        /* suffix */
        x264_cabac_mb_type_intra( h, cb, i_mb_type, 32+0, 32+1, 32+2, 32+2, 32+3, 32+3 );
    }

    if( i_mb_type == I_PCM )
        return;

    if( i_mb_type != I_16x16 )
    {
        if( h->pps->b_transform_8x8_mode )
            x264_cabac_transform_size( h, cb );

        int di = h->mb.b_transform_8x8 ? 4 : 1;
        for( int i = 0; i < 16; i += di )
        {
            const int i_pred = x264_mb_predict_intra4x4_mode( h, i );
            const int i_mode = x264_mb_pred_mode4x4_fix( h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] );
            x264_cabac_intra4x4_pred_mode( cb, i_pred, i_mode );
        }
    }

    if( chroma )
        x264_cabac_intra_chroma_pred_mode( h, cb );
}

static ALWAYS_INLINE void x264_cabac_mb_header_p( x264_t *h, x264_cabac_t *cb, int i_mb_type, int chroma )
{
    if( i_mb_type == P_L0 )
    {
        x264_cabac_encode_decision_noup( cb, 14, 0 );
        if( h->mb.i_partition == D_16x16 )
        {
            x264_cabac_encode_decision_noup( cb, 15, 0 );
            x264_cabac_encode_decision_noup( cb, 16, 0 );
            if( h->mb.pic.i_fref[0] > 1 )
                x264_cabac_ref_p( h, cb, 0 );
            x264_cabac_mvd( h, cb, 0, 0, 4, 4 );
        }
        else if( h->mb.i_partition == D_16x8 )
        {
            x264_cabac_encode_decision_noup( cb, 15, 1 );
            x264_cabac_encode_decision_noup( cb, 17, 1 );
            if( h->mb.pic.i_fref[0] > 1 )
            {
                x264_cabac_ref_p( h, cb, 0 );
                x264_cabac_ref_p( h, cb, 8 );
            }
            x264_cabac_mvd( h, cb, 0, 0, 4, 2 );
            x264_cabac_mvd( h, cb, 0, 8, 4, 2 );
        }
        else //if( h->mb.i_partition == D_8x16 )
        {
            x264_cabac_encode_decision_noup( cb, 15, 1 );
            x264_cabac_encode_decision_noup( cb, 17, 0 );
            if( h->mb.pic.i_fref[0] > 1 )
            {
                x264_cabac_ref_p( h, cb, 0 );
                x264_cabac_ref_p( h, cb, 4 );
            }
            x264_cabac_mvd( h, cb, 0, 0, 2, 4 );
            x264_cabac_mvd( h, cb, 0, 4, 2, 4 );
        }
    }
    else if( i_mb_type == P_8x8 )
    {
        x264_cabac_encode_decision_noup( cb, 14, 0 );
        x264_cabac_encode_decision_noup( cb, 15, 0 );
        x264_cabac_encode_decision_noup( cb, 16, 1 );

        /* sub mb type */
        for( int i = 0; i < 4; i++ )
            x264_cabac_subpartition_p( cb, h->mb.i_sub_partition[i] );

        /* ref 0 */
        if( h->mb.pic.i_fref[0] > 1 )
        {
            x264_cabac_ref_p( h, cb,  0 );
            x264_cabac_ref_p( h, cb,  4 );
            x264_cabac_ref_p( h, cb,  8 );
            x264_cabac_ref_p( h, cb, 12 );
        }

        for( int i = 0; i < 4; i++ )
            x264_cabac_8x8_mvd( h, cb, i );
    }
    else /* intra */
        x264_cabac_mb_header_i( h, cb, i_mb_type, SLICE_TYPE_P, chroma );
}

static ALWAYS_INLINE void x264_cabac_mb_header_b( x264_t *h, x264_cabac_t *cb, int i_mb_type, int chroma )
{
    int ctx = 0;
    if( (h->mb.i_neighbour & MB_LEFT) && h->mb.i_mb_type_left[0] != B_SKIP && h->mb.i_mb_type_left[0] != B_DIRECT )
        ctx++;
    if( (h->mb.i_neighbour & MB_TOP) && h->mb.i_mb_type_top != B_SKIP && h->mb.i_mb_type_top != B_DIRECT )
        ctx++;

    if( i_mb_type == B_DIRECT )
    {
        x264_cabac_encode_decision_noup( cb, 27+ctx, 0 );
        return;
    }
    x264_cabac_encode_decision_noup( cb, 27+ctx, 1 );

    if( i_mb_type == B_8x8 )
    {
        x264_cabac_encode_decision_noup( cb, 27+3,   1 );
        x264_cabac_encode_decision_noup( cb, 27+4,   1 );
        x264_cabac_encode_decision( cb, 27+5,   1 );
        x264_cabac_encode_decision( cb, 27+5,   1 );
        x264_cabac_encode_decision_noup( cb, 27+5,   1 );

        /* sub mb type */
        for( int i = 0; i < 4; i++ )
            x264_cabac_subpartition_b( cb, h->mb.i_sub_partition[i] );

        /* ref */
        if( h->mb.pic.i_fref[0] > 1 )
            for( int i = 0; i < 4; i++ )
                if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i] ] )
                    x264_cabac_ref_b( h, cb, 0, 4*i );

        if( h->mb.pic.i_fref[1] > 1 )
            for( int i = 0; i < 4; i++ )
                if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i] ] )
                    x264_cabac_ref_b( h, cb, 1, 4*i );

        for( int i = 0; i < 4; i++ )
            if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i] ] )
                x264_cabac_mvd( h, cb, 0, 4*i, 2, 2 );

        for( int i = 0; i < 4; i++ )
            if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i] ] )
                x264_cabac_mvd( h, cb, 1, 4*i, 2, 2 );
    }
    else if( i_mb_type >= B_L0_L0 && i_mb_type <= B_BI_BI )
    {
        /* All B modes */
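        /* Each entry packs the mb_type binarization LSB-first behind a
         * leading 1 sentinel: the low two bits are always coded, then
         * coding stops once only the sentinel remains (bits == 1). */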
        static const uint8_t i_mb_bits[9*3] =
        {
            0x31, 0x29, 0x4, /* L0 L0 */
            0x35, 0x2d, 0,   /* L0 L1 */
            0x43, 0x63, 0,   /* L0 BI */
            0x3d, 0x2f, 0,   /* L1 L0 */
            0x39, 0x25, 0x6, /* L1 L1 */
            0x53, 0x73, 0,   /* L1 BI */
            0x4b, 0x6b, 0,   /* BI L0 */
            0x5b, 0x7b, 0,   /* BI L1 */
            0x47, 0x67, 0x21 /* BI BI */
        };

        const int idx = (i_mb_type - B_L0_L0) * 3 + (h->mb.i_partition - D_16x8);
        int bits = i_mb_bits[idx];

        x264_cabac_encode_decision_noup( cb, 27+3, bits&1 );
        x264_cabac_encode_decision( cb, 27+5-(bits&1), (bits>>1)&1 ); bits >>= 2;
        if( bits != 1 )
        {
            x264_cabac_encode_decision( cb, 27+5, bits&1 ); bits >>= 1;
            x264_cabac_encode_decision( cb, 27+5, bits&1 ); bits >>= 1;
            x264_cabac_encode_decision( cb, 27+5, bits&1 ); bits >>= 1;
            if( bits != 1 )
                x264_cabac_encode_decision_noup( cb, 27+5, bits&1 );
        }

        const uint8_t (*b_list)[2] = x264_mb_type_list_table[i_mb_type];
        if( h->mb.pic.i_fref[0] > 1 )
        {
            if( b_list[0][0] )
                x264_cabac_ref_b( h, cb, 0, 0 );
            if( b_list[0][1] && h->mb.i_partition != D_16x16 )
                x264_cabac_ref_b( h, cb, 0, 8 >> (h->mb.i_partition == D_8x16) );
        }
        if( h->mb.pic.i_fref[1] > 1 )
        {
            if( b_list[1][0] )
                x264_cabac_ref_b( h, cb, 1, 0 );
            if( b_list[1][1] && h->mb.i_partition != D_16x16 )
                x264_cabac_ref_b( h, cb, 1, 8 >> (h->mb.i_partition == D_8x16) );
        }
        for( int i_list = 0; i_list < 2; i_list++ )
        {
            if( h->mb.i_partition == D_16x16 )
            {
                if( b_list[i_list][0] ) x264_cabac_mvd( h, cb, i_list, 0, 4, 4 );
            }
            else if( h->mb.i_partition == D_16x8 )
            {
                if( b_list[i_list][0] ) x264_cabac_mvd( h, cb, i_list, 0, 4, 2 );
                if( b_list[i_list][1] ) x264_cabac_mvd( h, cb, i_list, 8, 4, 2 );
            }
            else //if( h->mb.i_partition == D_8x16 )
            {
                if( b_list[i_list][0] ) x264_cabac_mvd( h, cb, i_list, 0, 2, 4 );
                if( b_list[i_list][1] ) x264_cabac_mvd( h, cb, i_list, 4, 2, 4 );
            }
        }
    }
    else /* intra */
        x264_cabac_mb_header_i( h, cb, i_mb_type, SLICE_TYPE_B, chroma );
}

static int ALWAYS_INLINE x264_cabac_cbf_ctxidxinc( x264_t *h, int i_cat, int i_idx, int b_intra, int b_dc )
{
    static const uint16_t base_ctx[14] = {85,89,93,97,101,1012,460,464,468,1016,472,476,480,1020};

    if( b_dc )
    {
        i_idx -= LUMA_DC;
        if( i_cat == DCT_CHROMA_DC )
        {
            int i_nza = h->mb.cache.i_cbp_left != -1 ? (h->mb.cache.i_cbp_left >> (8 + i_idx)) & 1 : b_intra;
            int i_nzb = h->mb.cache.i_cbp_top  != -1 ? (h->mb.cache.i_cbp_top  >> (8 + i_idx)) & 1 : b_intra;
            return base_ctx[i_cat] + 2*i_nzb + i_nza;
        }
        else
        {
            int i_nza = (h->mb.cache.i_cbp_left >> (8 + i_idx)) & 1;
            int i_nzb = (h->mb.cache.i_cbp_top  >> (8 + i_idx)) & 1;
            return base_ctx[i_cat] + 2*i_nzb + i_nza;
        }
    }
    else
    {
        int i_nza = h->mb.cache.non_zero_count[x264_scan8[i_idx] - 1];
        int i_nzb = h->mb.cache.non_zero_count[x264_scan8[i_idx] - 8];
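        /* The nnz cache marks unavailable neighbours with 0x80. A missing
         * neighbour must count as coded for intra blocks but as uncoded for
         * inter, so the mask 0x7f + (b_intra << 7) keeps the 0x80 flag only
         * in the intra case. */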
        if( x264_constant_p(b_intra) && !b_intra )
            return base_ctx[i_cat] + ((2*i_nzb + i_nza)&0x7f);
        else
        {
            i_nza &= 0x7f + (b_intra << 7);
            i_nzb &= 0x7f + (b_intra << 7);
            return base_ctx[i_cat] + 2*!!i_nzb + !!i_nza;
        }
    }
}

static const uint16_t significant_coeff_flag_offset[2][14] =
{
    { 105+0, 105+15, 105+29, 105+44, 105+47, 402, 484+0, 484+15, 484+29, 660, 528+0, 528+15, 528+29, 718 },
    { 277+0, 277+15, 277+29, 277+44, 277+47, 436, 776+0, 776+15, 776+29, 675, 820+0, 820+15, 820+29, 733 }
};
static const uint16_t last_coeff_flag_offset[2][14] =
{
    { 166+0, 166+15, 166+29, 166+44, 166+47, 417, 572+0, 572+15, 572+29, 690, 616+0, 616+15, 616+29, 748 },
    { 338+0, 338+15, 338+29, 338+44, 338+47, 451, 864+0, 864+15, 864+29, 699, 908+0, 908+15, 908+29, 757 }
};
static const uint16_t coeff_abs_level_m1_offset[14] =
{
    227+0, 227+10, 227+20, 227+30, 227+39, 426, 952+0, 952+10, 952+20, 708, 982+0, 982+10, 982+20, 766
};
static const uint8_t significant_coeff_flag_offset_8x8[2][63] =
{{
    0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
    4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
    7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
   12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12
},{
    0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
    6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
    9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
    9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14
}};
static const uint8_t last_coeff_flag_offset_8x8[63] =
{
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
    5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
};
static const uint8_t coeff_flag_offset_chroma_422_dc[7] = { 0, 0, 1, 1, 2, 2, 2 }; /* MIN( i/2, 2 ) */

// node ctx: 0..3: abslevel1 (with abslevelgt1 == 0).
//           4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter).
/* map node ctx => cabac ctx for level=1 */
static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
/* map node ctx => cabac ctx for level>1 */
static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 };
/* 4:2:2 chroma DC uses a slightly different state machine for some reason.
 * Note that 4:2:0 chroma DC doesn't use the last state, so it produces
 * identical output with either array. */
static const uint8_t coeff_abs_levelgt1_ctx_chroma_dc[8] = { 5, 5, 5, 5, 6, 7, 8, 8 };
static const uint8_t coeff_abs_level_transition[2][8] = {
/* update node ctx after coding a level=1 */
    { 1, 2, 3, 3, 4, 5, 6, 7 },
/* update node ctx after coding a level>1 */
    { 4, 4, 4, 4, 5, 6, 7, 7 }
};
static const uint8_t count_cat_m1[14] = {15, 14, 15, 3, 14, 63, 15, 14, 15, 63, 15, 14, 15, 63};

#if !RDO_SKIP_BS
static ALWAYS_INLINE void x264_cabac_block_residual_internal( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l, int chroma422dc )
{
    int ctx_sig = significant_coeff_flag_offset[MB_INTERLACED][ctx_block_cat];
    int ctx_last = last_coeff_flag_offset[MB_INTERLACED][ctx_block_cat];
    int ctx_level = coeff_abs_level_m1_offset[ctx_block_cat];
    int coeff_idx = -1, node_ctx = 0;
    int last = h->quantf.coeff_last[ctx_block_cat]( l );
    const uint8_t *levelgt1_ctx = chroma422dc ? coeff_abs_levelgt1_ctx_chroma_dc : coeff_abs_levelgt1_ctx;
    dctcoef coeffs[64];

#define WRITE_SIGMAP( sig_off, last_off )\
{\
    int i = 0;\
    while( 1 )\
    {\
        if( l[i] )\
        {\
            coeffs[++coeff_idx] = l[i];\
            x264_cabac_encode_decision( cb, ctx_sig + sig_off, 1 );\
            if( i == last )\
            {\
                x264_cabac_encode_decision( cb, ctx_last + last_off, 1 );\
                break;\
            }\
            else\
                x264_cabac_encode_decision( cb, ctx_last + last_off, 0 );\
        }\
        else\
            x264_cabac_encode_decision( cb, ctx_sig + sig_off, 0 );\
        if( ++i == count_m1 )\
        {\
            coeffs[++coeff_idx] = l[i];\
            break;\
        }\
    }\
}

    if( chroma422dc )
    {
        int count_m1 = 7;
        WRITE_SIGMAP( coeff_flag_offset_chroma_422_dc[i], coeff_flag_offset_chroma_422_dc[i] )
    }
    else
    {
        int count_m1 = count_cat_m1[ctx_block_cat];
        if( count_m1 == 63 )
        {
            const uint8_t *sig_offset = significant_coeff_flag_offset_8x8[MB_INTERLACED];
            WRITE_SIGMAP( sig_offset[i], last_coeff_flag_offset_8x8[i] )
        }
        else
            WRITE_SIGMAP( i, i )
    }

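    /* The significance map above collected the nonzero levels in forward
     * scan order; the levels themselves are now coded from the last
     * coefficient back to the first, as the standard requires. */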
    do
    {
        /* write coeff_abs - 1 */
        int coeff = coeffs[coeff_idx];
        int abs_coeff = abs(coeff);
        int coeff_sign = coeff >> 31;
        int ctx = coeff_abs_level1_ctx[node_ctx] + ctx_level;

        if( abs_coeff > 1 )
        {
            x264_cabac_encode_decision( cb, ctx, 1 );
            ctx = levelgt1_ctx[node_ctx] + ctx_level;
            for( int i = X264_MIN( abs_coeff, 15 ) - 2; i > 0; i-- )
                x264_cabac_encode_decision( cb, ctx, 1 );
            if( abs_coeff < 15 )
                x264_cabac_encode_decision( cb, ctx, 0 );
            else
                x264_cabac_encode_ue_bypass( cb, 0, abs_coeff - 15 );

            node_ctx = coeff_abs_level_transition[1][node_ctx];
        }
        else
        {
            x264_cabac_encode_decision( cb, ctx, 0 );
            node_ctx = coeff_abs_level_transition[0][node_ctx];
        }

        x264_cabac_encode_bypass( cb, coeff_sign );
    } while( --coeff_idx >= 0 );
}
static void x264_cabac_block_residual( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l )
{
    x264_cabac_block_residual_internal( h, cb, ctx_block_cat, l, 0 );
}
static void x264_cabac_block_residual_422_dc( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l )
{
    /* Template a version specifically for chroma 4:2:2 DC in order to avoid
     * slowing down everything else due to the added complexity. */
    x264_cabac_block_residual_internal( h, cb, DCT_CHROMA_DC, l, 1 );
}
#define x264_cabac_block_residual_8x8( h, cb, cat, l ) x264_cabac_block_residual( h, cb, cat, l )
#else

/* Faster RDO by merging sigmap and level coding. Note that for 8x8dct and
 * chroma 4:2:2 DC this is slightly incorrect because the sigmap is not
 * reversible (contexts are repeated). However, the quality penalty is nearly
 * zero (~0.001 dB) and the ~30% speed boost is worth it. */
static void ALWAYS_INLINE x264_cabac_block_residual_internal( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l, int b_8x8, int chroma422dc )
{
    const uint8_t *sig_offset = significant_coeff_flag_offset_8x8[MB_INTERLACED];
    int ctx_sig = significant_coeff_flag_offset[MB_INTERLACED][ctx_block_cat];
    int ctx_last = last_coeff_flag_offset[MB_INTERLACED][ctx_block_cat];
    int ctx_level = coeff_abs_level_m1_offset[ctx_block_cat];
    int last = h->quantf.coeff_last[ctx_block_cat]( l );
    int coeff_abs = abs(l[last]);
    int ctx = coeff_abs_level1_ctx[0] + ctx_level;
    int node_ctx;
    const uint8_t *levelgt1_ctx = chroma422dc ? coeff_abs_levelgt1_ctx_chroma_dc : coeff_abs_levelgt1_ctx;

    if( last != (b_8x8 ? 63 : chroma422dc ? 7 : count_cat_m1[ctx_block_cat]) )
    {
        x264_cabac_encode_decision( cb, ctx_sig + (b_8x8 ? sig_offset[last] :
                                    chroma422dc ? coeff_flag_offset_chroma_422_dc[last] : last), 1 );
        x264_cabac_encode_decision( cb, ctx_last + (b_8x8 ? last_coeff_flag_offset_8x8[last] :
                                    chroma422dc ? coeff_flag_offset_chroma_422_dc[last] : last), 1 );
    }

    if( coeff_abs > 1 )
    {
        x264_cabac_encode_decision( cb, ctx, 1 );
        ctx = levelgt1_ctx[0] + ctx_level;
        if( coeff_abs < 15 )
        {
            cb->f8_bits_encoded += cabac_size_unary[coeff_abs-1][cb->state[ctx]];
            cb->state[ctx] = cabac_transition_unary[coeff_abs-1][cb->state[ctx]];
        }
        else
        {
            cb->f8_bits_encoded += cabac_size_unary[14][cb->state[ctx]];
            cb->state[ctx] = cabac_transition_unary[14][cb->state[ctx]];
            x264_cabac_encode_ue_bypass( cb, 0, coeff_abs - 15 );
        }
        node_ctx = coeff_abs_level_transition[1][0];
    }
    else
    {
        x264_cabac_encode_decision( cb, ctx, 0 );
        node_ctx = coeff_abs_level_transition[0][0];
        x264_cabac_encode_bypass( cb, 0 ); // sign
    }

    for( int i = last-1 ; i >= 0; i-- )
    {
        if( l[i] )
        {
            coeff_abs = abs(l[i]);
            x264_cabac_encode_decision( cb, ctx_sig + (b_8x8 ? sig_offset[i] :
                                        chroma422dc ? coeff_flag_offset_chroma_422_dc[i] : i), 1 );
            x264_cabac_encode_decision( cb, ctx_last + (b_8x8 ? last_coeff_flag_offset_8x8[i] :
                                        chroma422dc ? coeff_flag_offset_chroma_422_dc[i] : i), 0 );
            ctx = coeff_abs_level1_ctx[node_ctx] + ctx_level;

            if( coeff_abs > 1 )
            {
                x264_cabac_encode_decision( cb, ctx, 1 );
                ctx = levelgt1_ctx[node_ctx] + ctx_level;
                if( coeff_abs < 15 )
                {
                    cb->f8_bits_encoded += cabac_size_unary[coeff_abs-1][cb->state[ctx]];
                    cb->state[ctx] = cabac_transition_unary[coeff_abs-1][cb->state[ctx]];
                }
                else
                {
                    cb->f8_bits_encoded += cabac_size_unary[14][cb->state[ctx]];
                    cb->state[ctx] = cabac_transition_unary[14][cb->state[ctx]];
                    x264_cabac_encode_ue_bypass( cb, 0, coeff_abs - 15 );
                }
                node_ctx = coeff_abs_level_transition[1][node_ctx];
            }
            else
            {
                x264_cabac_encode_decision( cb, ctx, 0 );
                node_ctx = coeff_abs_level_transition[0][node_ctx];
                x264_cabac_encode_bypass( cb, 0 );
            }
        }
        else
            x264_cabac_encode_decision( cb, ctx_sig + (b_8x8 ? sig_offset[i] :
                                        chroma422dc ? coeff_flag_offset_chroma_422_dc[i] : i), 0 );
    }
}
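
/* b_8x8 and chroma422dc are compile-time constants at every call site below,
 * so the ALWAYS_INLINE routine above specializes into three fixed variants
 * with the unused branches folded away. */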

static void x264_cabac_block_residual_8x8( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l )
{
    x264_cabac_block_residual_internal( h, cb, ctx_block_cat, l, 1, 0 );
}
static void x264_cabac_block_residual_422_dc( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l )
{
    x264_cabac_block_residual_internal( h, cb, DCT_CHROMA_DC, l, 0, 1 );
}
static void x264_cabac_block_residual( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l )
{
    x264_cabac_block_residual_internal( h, cb, ctx_block_cat, l, 0, 0 );
}
#endif

#define x264_cabac_block_residual_cbf_internal( h, cb, ctx_block_cat, i_idx, l, b_intra, b_dc, name )\
do\
{\
    int ctxidxinc = x264_cabac_cbf_ctxidxinc( h, ctx_block_cat, i_idx, b_intra, b_dc );\
    if( h->mb.cache.non_zero_count[x264_scan8[i_idx]] )\
    {\
        x264_cabac_encode_decision( cb, ctxidxinc, 1 );\
        x264_cabac_block_residual##name( h, cb, ctx_block_cat, l );\
    }\
    else\
        x264_cabac_encode_decision( cb, ctxidxinc, 0 );\
} while(0)

#define x264_cabac_block_residual_dc_cbf( h, cb, ctx_block_cat, i_idx, l, b_intra )\
    x264_cabac_block_residual_cbf_internal( h, cb, ctx_block_cat, i_idx, l, b_intra, 1, )

#define x264_cabac_block_residual_cbf( h, cb, ctx_block_cat, i_idx, l, b_intra )\
    x264_cabac_block_residual_cbf_internal( h, cb, ctx_block_cat, i_idx, l, b_intra, 0, )

#define x264_cabac_block_residual_8x8_cbf( h, cb, ctx_block_cat, i_idx, l, b_intra )\
    x264_cabac_block_residual_cbf_internal( h, cb, ctx_block_cat, i_idx, l, b_intra, 0, _8x8 )

#define x264_cabac_block_residual_422_dc_cbf( h, cb, ch, b_intra )\
    x264_cabac_block_residual_cbf_internal( h, cb, DCT_CHROMA_DC, CHROMA_DC+(ch), h->dct.chroma_dc[ch], b_intra, 1, _422_dc )
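
/* Each residual is gated by its coded_block_flag: x264_cabac_cbf_ctxidxinc
 * derives the flag's context from the cached non-zero counts of the
 * neighboring blocks, and the block data itself is only coded when the
 * cached nnz for this block is nonzero. */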

static ALWAYS_INLINE void x264_macroblock_write_cabac_internal( x264_t *h, x264_cabac_t *cb, int plane_count, int chroma )
{
    const int i_mb_type = h->mb.i_type;

#if !RDO_SKIP_BS
    const int i_mb_pos_start = x264_cabac_pos( cb );
    int       i_mb_pos_tex;

    if( SLICE_MBAFF &&
        (!(h->mb.i_mb_y & 1) || IS_SKIP(h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride])) )
    {
        x264_cabac_field_decoding_flag( h, cb );
    }
#endif

    if( h->sh.i_type == SLICE_TYPE_P )
        x264_cabac_mb_header_p( h, cb, i_mb_type, chroma );
    else if( h->sh.i_type == SLICE_TYPE_B )
        x264_cabac_mb_header_b( h, cb, i_mb_type, chroma );
    else //if( h->sh.i_type == SLICE_TYPE_I )
        x264_cabac_mb_header_i( h, cb, i_mb_type, SLICE_TYPE_I, chroma );

#if !RDO_SKIP_BS
    i_mb_pos_tex = x264_cabac_pos( cb );
    h->stat.frame.i_mv_bits += i_mb_pos_tex - i_mb_pos_start;

    if( i_mb_type == I_PCM )
    {
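        /* I_PCM bypasses the arithmetic coder: the raw samples are written as
         * fixed-width bits straight into the output buffer with the bs_t
         * writer, and the CABAC engine core is restarted afterwards. */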
        bs_t s;
        bs_init( &s, cb->p, cb->p_end - cb->p );

        for( int p = 0; p < plane_count; p++ )
            for( int i = 0; i < 256; i++ )
                bs_write( &s, BIT_DEPTH, h->mb.pic.p_fenc[p][i] );
        if( chroma )
            for( int ch = 1; ch < 3; ch++ )
                for( int i = 0; i < 16>>CHROMA_V_SHIFT; i++ )
                    for( int j = 0; j < 8; j++ )
                        bs_write( &s, BIT_DEPTH, h->mb.pic.p_fenc[ch][i*FENC_STRIDE+j] );

        bs_flush( &s );
        cb->p = s.p;
        x264_cabac_encode_init_core( cb );

        h->stat.frame.i_tex_bits += x264_cabac_pos( cb ) - i_mb_pos_tex;
        return;
    }
#endif

    if( i_mb_type != I_16x16 )
    {
        x264_cabac_cbp_luma( h, cb );
        if( chroma )
            x264_cabac_cbp_chroma( h, cb );
    }

    if( x264_mb_transform_8x8_allowed( h ) && h->mb.i_cbp_luma )
    {
        x264_cabac_transform_size( h, cb );
    }

    if( h->mb.i_cbp_luma || (chroma && h->mb.i_cbp_chroma) || i_mb_type == I_16x16 )
    {
        const int b_intra = IS_INTRA( i_mb_type );
        x264_cabac_qp_delta( h, cb );

        /* write residual */
        if( i_mb_type == I_16x16 )
        {
            /* DC Luma */
            for( int p = 0; p < plane_count; p++ )
            {
                x264_cabac_block_residual_dc_cbf( h, cb, ctx_cat_plane[DCT_LUMA_DC][p], LUMA_DC+p, h->dct.luma16x16_dc[p], 1 );

                /* AC Luma */
                if( h->mb.i_cbp_luma )
                    for( int i = p*16; i < p*16+16; i++ )
                        x264_cabac_block_residual_cbf( h, cb, ctx_cat_plane[DCT_LUMA_AC][p], i, h->dct.luma4x4[i]+1, 1 );
            }
        }
        else if( h->mb.b_transform_8x8 )
        {
            if( plane_count == 3 )
            {
                ALIGNED_4( uint8_t nnzbak[3][8] );

/* Stupid nnz munging in the case that neighbors don't have
 * 8x8 transform enabled. */
#define BACKUP( dst, src, res )\
    dst = src;\
    src = res;

#define RESTORE( dst, src, res )\
    src = dst;

#define MUNGE_8x8_NNZ( MUNGE )\
if( (h->mb.i_neighbour & MB_LEFT) && !h->mb.mb_transform_size[h->mb.i_mb_left_xy[0]] )\
{\
    MUNGE( nnzbak[0][0], h->mb.cache.non_zero_count[x264_scan8[16*0+ 0] - 1], 0x80 )\
    MUNGE( nnzbak[0][1], h->mb.cache.non_zero_count[x264_scan8[16*0+ 2] - 1], 0x80 )\
    MUNGE( nnzbak[1][0], h->mb.cache.non_zero_count[x264_scan8[16*1+ 0] - 1], 0x80 )\
    MUNGE( nnzbak[1][1], h->mb.cache.non_zero_count[x264_scan8[16*1+ 2] - 1], 0x80 )\
    MUNGE( nnzbak[2][0], h->mb.cache.non_zero_count[x264_scan8[16*2+ 0] - 1], 0x80 )\
    MUNGE( nnzbak[2][1], h->mb.cache.non_zero_count[x264_scan8[16*2+ 2] - 1], 0x80 )\
}\
if( (h->mb.i_neighbour & MB_LEFT) && !h->mb.mb_transform_size[h->mb.i_mb_left_xy[1]] )\
{\
    MUNGE( nnzbak[0][2], h->mb.cache.non_zero_count[x264_scan8[16*0+ 8] - 1], 0x80 )\
    MUNGE( nnzbak[0][3], h->mb.cache.non_zero_count[x264_scan8[16*0+10] - 1], 0x80 )\
    MUNGE( nnzbak[1][2], h->mb.cache.non_zero_count[x264_scan8[16*1+ 8] - 1], 0x80 )\
    MUNGE( nnzbak[1][3], h->mb.cache.non_zero_count[x264_scan8[16*1+10] - 1], 0x80 )\
    MUNGE( nnzbak[2][2], h->mb.cache.non_zero_count[x264_scan8[16*2+ 8] - 1], 0x80 )\
    MUNGE( nnzbak[2][3], h->mb.cache.non_zero_count[x264_scan8[16*2+10] - 1], 0x80 )\
}\
if( (h->mb.i_neighbour & MB_TOP) && !h->mb.mb_transform_size[h->mb.i_mb_top_xy] )\
{\
    MUNGE( M32( &nnzbak[0][4] ), M32( &h->mb.cache.non_zero_count[x264_scan8[16*0] - 8] ), 0x80808080U )\
    MUNGE( M32( &nnzbak[1][4] ), M32( &h->mb.cache.non_zero_count[x264_scan8[16*1] - 8] ), 0x80808080U )\
    MUNGE( M32( &nnzbak[2][4] ), M32( &h->mb.cache.non_zero_count[x264_scan8[16*2] - 8] ), 0x80808080U )\
}
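
/* BACKUP saves each neighboring nnz cache entry and overwrites it with the
 * 0x80 marker whenever that neighbor was coded with 4x4 transforms, so the
 * 8x8 coded_block_flag contexts don't read meaningless 4x4 nnz values;
 * RESTORE puts the saved entries back once the residuals are coded. */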

                MUNGE_8x8_NNZ( BACKUP )

                for( int p = 0; p < 3; p++ )
                    for( int i = 0; i < 4; i++ )
                        if( h->mb.i_cbp_luma & ( 1 << i ) )
                            x264_cabac_block_residual_8x8_cbf( h, cb, ctx_cat_plane[DCT_LUMA_8x8][p], i*4+p*16, h->dct.luma8x8[i+p*4], b_intra );

                MUNGE_8x8_NNZ( RESTORE )
            }
            else
            {
                for( int i = 0; i < 4; i++ )
                    if( h->mb.i_cbp_luma & ( 1 << i ) )
                        x264_cabac_block_residual_8x8( h, cb, DCT_LUMA_8x8, h->dct.luma8x8[i] );
            }
        }
        else
        {
            for( int p = 0; p < plane_count; p++ )
                for( int i = 0; i < 16; i++ )
                    if( h->mb.i_cbp_luma & ( 1 << ( i >> 2 ) ) )
                        x264_cabac_block_residual_cbf( h, cb, ctx_cat_plane[DCT_LUMA_4x4][p], i+p*16, h->dct.luma4x4[i+p*16], b_intra );
        }

        if( chroma && h->mb.i_cbp_chroma ) /* Chroma DC residual present */
        {
            if( CHROMA_FORMAT == CHROMA_422 )
            {
                x264_cabac_block_residual_422_dc_cbf( h, cb, 0, b_intra );
                x264_cabac_block_residual_422_dc_cbf( h, cb, 1, b_intra );
            }
            else
            {
                x264_cabac_block_residual_dc_cbf( h, cb, DCT_CHROMA_DC, CHROMA_DC+0, h->dct.chroma_dc[0], b_intra );
                x264_cabac_block_residual_dc_cbf( h, cb, DCT_CHROMA_DC, CHROMA_DC+1, h->dct.chroma_dc[1], b_intra );
            }

            if( h->mb.i_cbp_chroma == 2 ) /* Chroma AC residual present */
            {
                int step = 8 << CHROMA_V_SHIFT;
                for( int i = 16; i < 3*16; i += step )
                    for( int j = i; j < i+4; j++ )
                        x264_cabac_block_residual_cbf( h, cb, DCT_CHROMA_AC, j, h->dct.luma4x4[j]+1, b_intra );
            }
        }
    }

#if !RDO_SKIP_BS
    h->stat.frame.i_tex_bits += x264_cabac_pos( cb ) - i_mb_pos_tex;
#endif
}

void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
{
    if( CHROMA444 )
        x264_macroblock_write_cabac_internal( h, cb, 3, 0 );
    else
        x264_macroblock_write_cabac_internal( h, cb, 1, 1 );
}

#if RDO_SKIP_BS
/*****************************************************************************
 * RD only; doesn't generate a valid bitstream
 * doesn't write cbp or chroma dc (I don't know how much this matters)
 * doesn't write ref (never varies between calls, so no point in doing so)
 * only writes subpartition for p8x8, needed for sub-8x8 mode decision RDO
 * works on all partition sizes except 16x16
 *****************************************************************************/
static void x264_partition_size_cabac( x264_t *h, x264_cabac_t *cb, int i8, int i_pixel )
{
    const int i_mb_type = h->mb.i_type;
    int b_8x16 = h->mb.i_partition == D_8x16;
    int plane_count = CHROMA444 ? 3 : 1;

    if( i_mb_type == P_8x8 )
    {
        x264_cabac_8x8_mvd( h, cb, i8 );
        x264_cabac_subpartition_p( cb, h->mb.i_sub_partition[i8] );
    }
    else if( i_mb_type == P_L0 )
        x264_cabac_mvd( h, cb, 0, 4*i8, 4>>b_8x16, 2<<b_8x16 );
    else if( i_mb_type > B_DIRECT && i_mb_type < B_8x8 )
    {
        if( x264_mb_type_list_table[ i_mb_type ][0][!!i8] ) x264_cabac_mvd( h, cb, 0, 4*i8, 4>>b_8x16, 2<<b_8x16 );
        if( x264_mb_type_list_table[ i_mb_type ][1][!!i8] ) x264_cabac_mvd( h, cb, 1, 4*i8, 4>>b_8x16, 2<<b_8x16 );
    }
    else //if( i_mb_type == B_8x8 )
    {
        if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i8] ] )
            x264_cabac_mvd( h, cb, 0, 4*i8, 2, 2 );
        if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i8] ] )
            x264_cabac_mvd( h, cb, 1, 4*i8, 2, 2 );
    }

    for( int j = (i_pixel < PIXEL_8x8); j >= 0; j-- )
    {
        if( h->mb.i_cbp_luma & (1 << i8) )
        {
            if( h->mb.b_transform_8x8 )
            {
                if( CHROMA444 )
                    for( int p = 0; p < 3; p++ )
                        x264_cabac_block_residual_8x8_cbf( h, cb, ctx_cat_plane[DCT_LUMA_8x8][p], i8*4+p*16, h->dct.luma8x8[i8+p*4], 0 );
                else
                    x264_cabac_block_residual_8x8( h, cb, DCT_LUMA_8x8, h->dct.luma8x8[i8] );
            }
            else
                for( int p = 0; p < plane_count; p++ )
                    for( int i4 = 0; i4 < 4; i4++ )
                        x264_cabac_block_residual_cbf( h, cb, ctx_cat_plane[DCT_LUMA_4x4][p], i4+i8*4+p*16, h->dct.luma4x4[i4+i8*4+p*16], 0 );
        }

        if( h->mb.i_cbp_chroma )
        {
            if( CHROMA_FORMAT == CHROMA_422 )
            {
                int offset = (5*i8) & 0x09;
                x264_cabac_block_residual_cbf( h, cb, DCT_CHROMA_AC, 16+offset, h->dct.luma4x4[16+offset]+1, 0 );
                x264_cabac_block_residual_cbf( h, cb, DCT_CHROMA_AC, 18+offset, h->dct.luma4x4[18+offset]+1, 0 );
                x264_cabac_block_residual_cbf( h, cb, DCT_CHROMA_AC, 32+offset, h->dct.luma4x4[32+offset]+1, 0 );
                x264_cabac_block_residual_cbf( h, cb, DCT_CHROMA_AC, 34+offset, h->dct.luma4x4[34+offset]+1, 0 );
            }
            else
            {
                x264_cabac_block_residual_cbf( h, cb, DCT_CHROMA_AC, 16+i8, h->dct.luma4x4[16+i8]+1, 0 );
                x264_cabac_block_residual_cbf( h, cb, DCT_CHROMA_AC, 32+i8, h->dct.luma4x4[32+i8]+1, 0 );
            }
        }

        i8 += x264_pixel_size[i_pixel].h >> 3;
    }
}

static void x264_subpartition_size_cabac( x264_t *h, x264_cabac_t *cb, int i4, int i_pixel )
{
    int b_8x4 = i_pixel == PIXEL_8x4;
    int plane_count = CHROMA444 ? 3 : 1;
    if( i_pixel == PIXEL_4x4 )
        x264_cabac_mvd( h, cb, 0, i4, 1, 1 );
    else
        x264_cabac_mvd( h, cb, 0, i4, 1+b_8x4, 2-b_8x4 );
    for( int p = 0; p < plane_count; p++ )
    {
        x264_cabac_block_residual_cbf( h, cb, ctx_cat_plane[DCT_LUMA_4x4][p], p*16+i4, h->dct.luma4x4[p*16+i4], 0 );
        if( i_pixel != PIXEL_4x4 )
            x264_cabac_block_residual_cbf( h, cb, ctx_cat_plane[DCT_LUMA_4x4][p], p*16+i4+2-b_8x4, h->dct.luma4x4[p*16+i4+2-b_8x4], 0 );
    }
}

static void x264_partition_i8x8_size_cabac( x264_t *h, x264_cabac_t *cb, int i8, int i_mode )
{
    const int i_pred = x264_mb_predict_intra4x4_mode( h, 4*i8 );
    i_mode = x264_mb_pred_mode4x4_fix( i_mode );
    x264_cabac_intra4x4_pred_mode( cb, i_pred, i_mode );
    x264_cabac_cbp_luma( h, cb );
    if( h->mb.i_cbp_luma & (1 << i8) )
    {
        if( CHROMA444 )
            for( int p = 0; p < 3; p++ )
                x264_cabac_block_residual_8x8_cbf( h, cb, ctx_cat_plane[DCT_LUMA_8x8][p], i8*4+p*16, h->dct.luma8x8[i8+p*4], 1 );
        else
            x264_cabac_block_residual_8x8( h, cb, DCT_LUMA_8x8, h->dct.luma8x8[i8] );
    }
}

static void x264_partition_i4x4_size_cabac( x264_t *h, x264_cabac_t *cb, int i4, int i_mode )
{
    const int i_pred = x264_mb_predict_intra4x4_mode( h, i4 );
    int plane_count = CHROMA444 ? 3 : 1;
    i_mode = x264_mb_pred_mode4x4_fix( i_mode );
    x264_cabac_intra4x4_pred_mode( cb, i_pred, i_mode );
    for( int p = 0; p < plane_count; p++ )
        x264_cabac_block_residual_cbf( h, cb, ctx_cat_plane[DCT_LUMA_4x4][p], i4+p*16, h->dct.luma4x4[i4+p*16], 1 );
}

static void x264_chroma_size_cabac( x264_t *h, x264_cabac_t *cb )
{
    x264_cabac_intra_chroma_pred_mode( h, cb );
    x264_cabac_cbp_chroma( h, cb );
    if( h->mb.i_cbp_chroma )
    {
        if( CHROMA_FORMAT == CHROMA_422 )
        {
            x264_cabac_block_residual_422_dc_cbf( h, cb, 0, 1 );
            x264_cabac_block_residual_422_dc_cbf( h, cb, 1, 1 );
        }
        else
        {
            x264_cabac_block_residual_dc_cbf( h, cb, DCT_CHROMA_DC, CHROMA_DC+0, h->dct.chroma_dc[0], 1 );
            x264_cabac_block_residual_dc_cbf( h, cb, DCT_CHROMA_DC, CHROMA_DC+1, h->dct.chroma_dc[1], 1 );
        }

        if( h->mb.i_cbp_chroma == 2 )
        {
            int step = 8 << CHROMA_V_SHIFT;
            for( int i = 16; i < 3*16; i += step )
                for( int j = i; j < i+4; j++ )
                    x264_cabac_block_residual_cbf( h, cb, DCT_CHROMA_AC, j, h->dct.luma4x4[j]+1, 1 );
        }
    }
}
#endif
File: x264-snapshot-20120103-2245-stable/encoder/slicetype.c
/*****************************************************************************
 * slicetype.c: lookahead analysis
 *****************************************************************************
 * Copyright (C) 2005-2011 x264 project
 *
 * Authors: Jason Garrett-Glaser <darkshikari@gmail.com>
 *          Loren Merritt <lorenm@u.washington.edu>
 *          Dylan Yudaken <dyudaken@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "common/common.h"
#include "macroblock.h"
#include "me.h"

// Indexed by pic_struct values
static const uint8_t delta_tfi_divisor[10] = { 0, 2, 1, 1, 2, 2, 3, 3, 4, 6 };

static int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a,
                                      x264_frame_t **frames, int p0, int p1, int b,
                                      int b_intra_penalty );
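
/* Lookahead analysis runs on the half-resolution (lowres) planes with a
 * deliberately cheap ME configuration: only relative frame costs are needed
 * for slicetype decisions, not exact ones. */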

static void x264_lowres_context_init( x264_t *h, x264_mb_analysis_t *a )
{
    a->i_qp = X264_LOOKAHEAD_QP;
    a->i_lambda = x264_lambda_tab[ a->i_qp ];
    x264_mb_analyse_load_costs( h, a );
    if( h->param.analyse.i_subpel_refine > 1 )
    {
        h->mb.i_me_method = X264_MIN( X264_ME_HEX, h->param.analyse.i_me_method );
        h->mb.i_subpel_refine = 4;
    }
    else
    {
        h->mb.i_me_method = X264_ME_DIA;
        h->mb.i_subpel_refine = 2;
    }
    h->mb.b_chroma_me = 0;
}

/* Converts a non-H.264 weight (i.e. fix7) into an H.264 weight. */
static void x264_weight_get_h264( int weight_nonh264, int offset, x264_weight_t *w )
{
    w->i_offset = offset;
    w->i_denom = 7;
    w->i_scale = weight_nonh264;
    while( w->i_denom > 0 && (w->i_scale > 127 || !(w->i_scale & 1)) )
    {
        w->i_denom--;
        w->i_scale >>= 1;
    }
    w->i_scale = X264_MIN( w->i_scale, 127 );
}
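
/* Worked example: a scale of 1.5 arrives as the fix7 value round(1.5*128) =
 * 192 with i_denom = 7; the loop above halves it through 96/6, 48/5, 24/4,
 * 12/3 and 6/2, stopping at 3/1 (odd and <= 127), i.e. 3/2 = 1.5 with the
 * smallest possible denominator. */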

static NOINLINE pixel *x264_weight_cost_init_luma( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, pixel *dest )
{
    int ref0_distance = fenc->i_frame - ref->i_frame - 1;
    /* Note: this will never run during lookahead as weights_analyse is only called if no
     * motion search has been done. */
    if( fenc->lowres_mvs[0][ref0_distance][0][0] != 0x7FFF )
    {
        int i_stride = fenc->i_stride_lowres;
        int i_lines = fenc->i_lines_lowres;
        int i_width = fenc->i_width_lowres;
        int i_mb_xy = 0;
        pixel *p = dest;

        for( int y = 0; y < i_lines; y += 8, p += i_stride*8 )
            for( int x = 0; x < i_width; x += 8, i_mb_xy++ )
            {
                int mvx = fenc->lowres_mvs[0][ref0_distance][i_mb_xy][0];
                int mvy = fenc->lowres_mvs[0][ref0_distance][i_mb_xy][1];
                h->mc.mc_luma( p+x, i_stride, ref->lowres, i_stride,
                               mvx+(x<<2), mvy+(y<<2), 8, 8, x264_weight_none );
            }
        x264_emms();
        return dest;
    }
    x264_emms();
    return ref->lowres[0];
}

/* How data is organized for 4:2:0/4:2:2 chroma weightp:
 * [U: ref] [U: fenc]
 * [V: ref] [V: fenc]
 * fenc = ref + offset
 * v = u + stride * chroma height */

static NOINLINE void x264_weight_cost_init_chroma( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, pixel *dstu, pixel *dstv )
{
    int ref0_distance = fenc->i_frame - ref->i_frame - 1;
    int i_stride = fenc->i_stride[1];
    int i_offset = i_stride / 2;
    int i_lines = fenc->i_lines[1];
    int i_width = fenc->i_width[1];
    int v_shift = CHROMA_V_SHIFT;
    int cw = 8*h->mb.i_mb_width;
    int ch = 16*h->mb.i_mb_height >> v_shift;
    int height = 16 >> v_shift;

    if( fenc->lowres_mvs[0][ref0_distance][0][0] != 0x7FFF )
    {
        x264_frame_expand_border_chroma( h, ref, 1 );
        for( int y = 0, mb_xy = 0, pel_offset_y = 0; y < i_lines; y += height, pel_offset_y = y*i_stride )
            for( int x = 0, pel_offset_x = 0; x < i_width; x += 8, mb_xy++, pel_offset_x += 8 )
            {
                pixel *pixu = dstu + pel_offset_y + pel_offset_x;
                pixel *pixv = dstv + pel_offset_y + pel_offset_x;
                pixel *src1 =  ref->plane[1] + pel_offset_y + pel_offset_x*2; /* NV12/NV16 */
                int mvx = fenc->lowres_mvs[0][ref0_distance][mb_xy][0];
                int mvy = fenc->lowres_mvs[0][ref0_distance][mb_xy][1];
                h->mc.mc_chroma( pixu, pixv, i_stride, src1, i_stride, mvx, 2*mvy>>v_shift, 8, height );
            }
    }
    else
        h->mc.plane_copy_deinterleave( dstu, i_stride, dstv, i_stride, ref->plane[1], i_stride, cw, ch );
    h->mc.plane_copy_deinterleave( dstu+i_offset, i_stride, dstv+i_offset, i_stride, fenc->plane[1], i_stride, cw, ch );
    x264_emms();
}

static NOINLINE pixel *x264_weight_cost_init_chroma444( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, pixel *dst, int p )
{
    int ref0_distance = fenc->i_frame - ref->i_frame - 1;
    int i_stride = fenc->i_stride[p];
    int i_lines = fenc->i_lines[p];
    int i_width = fenc->i_width[p];

    if( fenc->lowres_mvs[0][ref0_distance][0][0] != 0x7FFF )
    {
        x264_frame_expand_border_chroma( h, ref, p );
        for( int y = 0, mb_xy = 0, pel_offset_y = 0; y < i_lines; y += 16, pel_offset_y = y*i_stride )
            for( int x = 0, pel_offset_x = 0; x < i_width; x += 16, mb_xy++, pel_offset_x += 16 )
            {
                pixel *pix = dst + pel_offset_y + pel_offset_x;
                pixel *src = ref->plane[p] + pel_offset_y + pel_offset_x;
                int mvx = fenc->lowres_mvs[0][ref0_distance][mb_xy][0] / 2;
                int mvy = fenc->lowres_mvs[0][ref0_distance][mb_xy][1] / 2;
                /* We don't want to calculate hpels for fenc frames, so we round the motion
                 * vectors to fullpel here.  It's not too bad, I guess? */
                h->mc.copy_16x16_unaligned( pix, i_stride, src+mvx+mvy*i_stride, i_stride, 16 );
            }
        x264_emms();
        return dst;
    }
    x264_emms();
    return ref->plane[p];
}

static int x264_weight_slice_header_cost( x264_t *h, x264_weight_t *w, int b_chroma )
{
    /* Add cost of weights in the slice header. */
    int lambda = x264_lambda_tab[X264_LOOKAHEAD_QP];
    /* 4 times higher, because chroma is analyzed at full resolution. */
    if( b_chroma )
        lambda *= 4;
    int numslices;
    if( h->param.i_slice_count )
        numslices = h->param.i_slice_count;
    else if( h->param.i_slice_max_mbs )
        numslices = (h->mb.i_mb_width * h->mb.i_mb_height + h->param.i_slice_max_mbs-1) / h->param.i_slice_max_mbs;
    else
        numslices = 1;
    /* FIXME: find a way to account for --slice-max-size?
     * Multiply by 2 as there will be a duplicate. 10 bits added as if there is a weighted frame, then an additional duplicate is used.
     * Cut denom cost in half if chroma, since it's shared between the two chroma planes. */
    int denom_cost = bs_size_ue( w[0].i_denom ) * (2 - b_chroma);
    return lambda * numslices * ( 10 + denom_cost + 2 * (bs_size_se( w[0].i_scale ) + bs_size_se( w[0].i_offset )) );
}

static NOINLINE unsigned int x264_weight_cost_luma( x264_t *h, x264_frame_t *fenc, pixel *src, x264_weight_t *w )
{
    unsigned int cost = 0;
    int i_stride = fenc->i_stride_lowres;
    int i_lines = fenc->i_lines_lowres;
    int i_width = fenc->i_width_lowres;
    pixel *fenc_plane = fenc->lowres[0];
    ALIGNED_ARRAY_16( pixel, buf,[8*8] );
    int pixoff = 0;
    int i_mb = 0;

    if( w )
    {
        for( int y = 0; y < i_lines; y += 8, pixoff = y*i_stride )
            for( int x = 0; x < i_width; x += 8, i_mb++, pixoff += 8)
            {
                w->weightfn[8>>2]( buf, 8, &src[pixoff], i_stride, w, 8 );
                int cmp = h->pixf.mbcmp[PIXEL_8x8]( buf, 8, &fenc_plane[pixoff], i_stride );
                cost += X264_MIN( cmp, fenc->i_intra_cost[i_mb] );
            }
        cost += x264_weight_slice_header_cost( h, w, 0 );
    }
    else
        for( int y = 0; y < i_lines; y += 8, pixoff = y*i_stride )
            for( int x = 0; x < i_width; x += 8, i_mb++, pixoff += 8 )
            {
                int cmp = h->pixf.mbcmp[PIXEL_8x8]( &src[pixoff], i_stride, &fenc_plane[pixoff], i_stride );
                cost += X264_MIN( cmp, fenc->i_intra_cost[i_mb] );
            }
    x264_emms();
    return cost;
}

static NOINLINE unsigned int x264_weight_cost_chroma( x264_t *h, x264_frame_t *fenc, pixel *ref, x264_weight_t *w )
{
    unsigned int cost = 0;
    int i_stride = fenc->i_stride[1];
    int i_offset = i_stride / 2;
    int i_lines = fenc->i_lines[1];
    int i_width = fenc->i_width[1];
    pixel *src = ref + i_offset;
    ALIGNED_ARRAY_16( pixel, buf, [8*16] );
    int pixoff = 0;
    int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
    int height = 16 >> CHROMA_V_SHIFT;
    ALIGNED_16( static pixel flat[8] ) = {0};
    if( w )
    {
        for( int y = 0; y < i_lines; y += height, pixoff = y*i_stride )
            for( int x = 0; x < i_width; x += 8, pixoff += 8 )
            {
                w->weightfn[8>>2]( buf, 8, &ref[pixoff], i_stride, w, height );
                /* The naive and seemingly sensible algorithm is to use mbcmp as in luma.
                 * But testing shows that for chroma the DC coefficient is by far the most
                 * important part of the coding cost.  Thus a more useful chroma weight is
                 * obtained by comparing each block's DC coefficient instead of the actual
                 * pixels.
                 *
                 * FIXME: add a (faster) asm sum function to replace sad. */
                cost += abs( h->pixf.sad_aligned[chromapix](          buf,        8, flat, 0 ) -
                             h->pixf.sad_aligned[chromapix]( &src[pixoff], i_stride, flat, 0 ) );
            }
        cost += x264_weight_slice_header_cost( h, w, 1 );
    }
    else
        for( int y = 0; y < i_lines; y += height, pixoff = y*i_stride )
            for( int x = 0; x < i_width; x += 8, pixoff += 8 )
                cost += abs( h->pixf.sad_aligned[chromapix]( &ref[pixoff], i_stride, flat, 0 ) -
                             h->pixf.sad_aligned[chromapix]( &src[pixoff], i_stride, flat, 0 ) );
    x264_emms();
    return cost;
}

static NOINLINE unsigned int x264_weight_cost_chroma444( x264_t *h, x264_frame_t *fenc, pixel *ref, x264_weight_t *w, int p )
{
    unsigned int cost = 0;
    int i_stride = fenc->i_stride[p];
    int i_lines = fenc->i_lines[p];
    int i_width = fenc->i_width[p];
    pixel *src = fenc->plane[p];
    ALIGNED_ARRAY_16( pixel, buf, [16*16] );
    int pixoff = 0;
    if( w )
    {
        for( int y = 0; y < i_lines; y += 16, pixoff = y*i_stride )
            for( int x = 0; x < i_width; x += 16, pixoff += 16 )
            {
                w->weightfn[16>>2]( buf, 16, &ref[pixoff], i_stride, w, 16 );
                cost += h->pixf.mbcmp[PIXEL_16x16]( buf, 16, &src[pixoff], i_stride );
            }
        cost += x264_weight_slice_header_cost( h, w, 1 );
    }
    else
        for( int y = 0; y < i_lines; y += 16, pixoff = y*i_stride )
            for( int x = 0; x < i_width; x += 16, pixoff += 16 )
                cost += h->pixf.mbcmp[PIXEL_16x16]( &ref[pixoff], i_stride, &src[pixoff], i_stride );
    x264_emms();
    return cost;
}

static void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead )
{
    int i_delta_index = fenc->i_frame - ref->i_frame - 1;
    /* epsilon is chosen to require at least a numerator of 127 (with denominator = 128) */
    const float epsilon = 1.f/128.f;
    x264_weight_t *weights = fenc->weight[0];
    SET_WEIGHT( weights[0], 0, 1, 0, 0 );
    SET_WEIGHT( weights[1], 0, 1, 0, 0 );
    SET_WEIGHT( weights[2], 0, 1, 0, 0 );
    int chroma_initted = 0;
    /* Don't check chroma in lookahead, or if there wasn't a luma weight. */
    for( int plane = 0; plane <= 2 && !( plane && ( !weights[0].weightfn || b_lookahead ) ); plane++ )
    {
        int cur_offset, start_offset, end_offset;
        int minoff, minscale, mindenom;
        unsigned int minscore, origscore;
        int found;
        float fenc_var = fenc->i_pixel_ssd[plane] + !ref->i_pixel_ssd[plane];
        float ref_var  =  ref->i_pixel_ssd[plane] + !ref->i_pixel_ssd[plane];
        float guess_scale = sqrtf( fenc_var / ref_var );
        float fenc_mean = (float)fenc->i_pixel_sum[plane] / (fenc->i_lines[!!plane] * fenc->i_width[!!plane]) / (1 << (BIT_DEPTH - 8));
        float ref_mean  = (float) ref->i_pixel_sum[plane] / (fenc->i_lines[!!plane] * fenc->i_width[!!plane]) / (1 << (BIT_DEPTH - 8));
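        /* Initial guess: the scale comes from the ratio of plane SSDs (a
         * variance proxy; the !ssd term guards against dividing by zero) and
         * the offset from the difference of plane means, both normalized to
         * 8-bit range. */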

        //early termination
        if( fabsf( ref_mean - fenc_mean ) < 0.5f && fabsf( 1.f - guess_scale ) < epsilon )
        {
            SET_WEIGHT( weights[plane], 0, 1, 0, 0 );
            continue;
        }

        if( plane )
        {
            weights[plane].i_denom = 6;
            weights[plane].i_scale = x264_clip3( round( guess_scale * 64 ), 0, 255 );
            if( weights[plane].i_scale > 127 )
            {
                weights[1].weightfn = weights[2].weightfn = NULL;
                break;
            }
        }
        else
            x264_weight_get_h264( round( guess_scale * 128 ), 0, &weights[plane] );

        found = 0;
        mindenom = weights[plane].i_denom;
        minscale = weights[plane].i_scale;
        minoff = 0;

        pixel *mcbuf;
        if( !plane )
        {
            if( !fenc->b_intra_calculated )
            {
                x264_mb_analysis_t a;
                x264_lowres_context_init( h, &a );
                x264_slicetype_frame_cost( h, &a, &fenc, 0, 0, 0, 0 );
            }
            mcbuf = x264_weight_cost_init_luma( h, fenc, ref, h->mb.p_weight_buf[0] );
            origscore = minscore = x264_weight_cost_luma( h, fenc, mcbuf, NULL );
        }
        else
        {
            if( CHROMA444 )
            {
                mcbuf = x264_weight_cost_init_chroma444( h, fenc, ref, h->mb.p_weight_buf[0], plane );
                origscore = minscore = x264_weight_cost_chroma444( h, fenc, mcbuf, NULL, plane );
            }
            else
            {
                pixel *dstu = h->mb.p_weight_buf[0];
                pixel *dstv = h->mb.p_weight_buf[0]+fenc->i_stride[1]*fenc->i_lines[1];
                if( !chroma_initted++ )
                    x264_weight_cost_init_chroma( h, fenc, ref, dstu, dstv );
                mcbuf = plane == 1 ? dstu : dstv;
                origscore = minscore = x264_weight_cost_chroma( h, fenc, mcbuf, NULL );
            }
        }

        if( !minscore )
            continue;

        // This gives a slight improvement due to rounding errors but only tests one offset in lookahead.
        // Currently only searches within +/- 1 of the best offset found so far.
        // TODO: Try other offsets/multipliers/combinations thereof?
        cur_offset = fenc_mean - ref_mean * minscale / (1 << mindenom) + 0.5f * b_lookahead;
        start_offset = x264_clip3( cur_offset - !b_lookahead, -128, 127 );
        end_offset   = x264_clip3( cur_offset + !b_lookahead, -128, 127 );
        for( int i_off = start_offset; i_off <= end_offset; i_off++ )
        {
            SET_WEIGHT( weights[plane], 1, minscale, mindenom, i_off );
            unsigned int s;
            if( plane )
            {
                if( CHROMA444 )
                    s = x264_weight_cost_chroma444( h, fenc, mcbuf, &weights[plane], plane );
                else
                    s = x264_weight_cost_chroma( h, fenc, mcbuf, &weights[plane] );
            }
            else
                s = x264_weight_cost_luma( h, fenc, mcbuf, &weights[plane] );
            COPY3_IF_LT( minscore, s, minoff, i_off, found, 1 );

            // Don't check any more offsets if the previous one had a lower cost than the current one
            if( minoff == start_offset && i_off != start_offset )
                break;
        }
        x264_emms();

        /* FIXME: More analysis can be done here on SAD vs. SATD termination. */
        /* 0.2% termination derived experimentally to avoid weird weights in frames that are mostly intra. */
        if( !found || (minscale == 1 << mindenom && minoff == 0) || (float)minscore / origscore > 0.998f )
        {
            SET_WEIGHT( weights[plane], 0, 1, 0, 0 );
            continue;
        }
        else
            SET_WEIGHT( weights[plane], 1, minscale, mindenom, minoff );

        if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE && weights[0].weightfn && !plane )
            fenc->f_weighted_cost_delta[i_delta_index] = (float)minscore / origscore;
    }

    //FIXME, what is the correct way to deal with this?
    if( weights[1].weightfn && weights[2].weightfn && weights[1].i_denom != weights[2].i_denom )
    {
        int denom = X264_MIN( weights[1].i_denom, weights[2].i_denom );
        int i;
        for( i = 1; i <= 2; i++ )
        {
            weights[i].i_scale = x264_clip3( weights[i].i_scale >> ( weights[i].i_denom - denom ), 0, 255 );
            weights[i].i_denom = denom;
            h->mc.weight_cache( h, &weights[i] );
        }
    }

    if( weights[0].weightfn && b_lookahead )
    {
        //scale lowres in lookahead for slicetype_frame_cost
        pixel *src = ref->buffer_lowres[0];
        pixel *dst = h->mb.p_weight_buf[0];
        int width = ref->i_width_lowres + PADH*2;
        int height = ref->i_lines_lowres + PADV*2;
        x264_weight_scale_plane( h, dst, ref->i_stride_lowres, src, ref->i_stride_lowres,
                                 width, height, &weights[0] );
        fenc->weighted[0] = h->mb.p_weight_buf[0] + PADH + ref->i_stride_lowres * PADV;
    }
}

static void x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
                                    x264_frame_t **frames, int p0, int p1, int b,
                                    int dist_scale_factor, int do_search[2], const x264_weight_t *w )
{
    x264_frame_t *fref0 = frames[p0];
    x264_frame_t *fref1 = frames[p1];
    x264_frame_t *fenc  = frames[b];
    const int b_bidir = (b < p1);
    const int i_mb_x = h->mb.i_mb_x;
    const int i_mb_y = h->mb.i_mb_y;
    const int i_mb_stride = h->mb.i_mb_width;
    const int i_mb_xy = i_mb_x + i_mb_y * i_mb_stride;
    const int i_stride = fenc->i_stride_lowres;
    const int i_pel_offset = 8 * (i_mb_x + i_mb_y * i_stride);
    const int i_bipred_weight = h->param.analyse.b_weighted_bipred ? 64 - (dist_scale_factor>>2) : 32;
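    /* dist_scale_factor is the B-frame's temporal position in the p0..p1
     * interval in 1/256 units, so with weighted bipred the nearer reference
     * gets the larger share of the 64-unit blend (a frame exactly midway
     * gets dist_scale_factor 128 and an even 32/32 split). */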
    int16_t (*fenc_mvs[2])[2] = { &fenc->lowres_mvs[0][b-p0-1][i_mb_xy], &fenc->lowres_mvs[1][p1-b-1][i_mb_xy] };
    int (*fenc_costs[2]) = { &fenc->lowres_mv_costs[0][b-p0-1][i_mb_xy], &fenc->lowres_mv_costs[1][p1-b-1][i_mb_xy] };
    int b_frame_score_mb = (i_mb_x > 0 && i_mb_x < h->mb.i_mb_width - 1 &&
                            i_mb_y > 0 && i_mb_y < h->mb.i_mb_height - 1) ||
                            h->mb.i_mb_width <= 2 || h->mb.i_mb_height <= 2;

    ALIGNED_ARRAY_16( pixel, pix1,[9*FDEC_STRIDE] );
    pixel *pix2 = pix1+8;
    x264_me_t m[2];
    int i_bcost = COST_MAX;
    int list_used = 0;

    h->mb.pic.p_fenc[0] = h->mb.pic.fenc_buf;
    h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fenc[0], FENC_STRIDE, &fenc->lowres[0][i_pel_offset], i_stride, 8 );

    if( p0 == p1 )
        goto lowres_intra_mb;

    // no need for h->mb.mv_min[]
    h->mb.mv_min_fpel[0] = -8*h->mb.i_mb_x - 4;
    h->mb.mv_max_fpel[0] = 8*( h->mb.i_mb_width - h->mb.i_mb_x - 1 ) + 4;
    h->mb.mv_min_spel[0] = 4*( h->mb.mv_min_fpel[0] - 8 );
    h->mb.mv_max_spel[0] = 4*( h->mb.mv_max_fpel[0] + 8 );
    if( h->mb.i_mb_x >= h->mb.i_mb_width - 2 )
    {
        h->mb.mv_min_fpel[1] = -8*h->mb.i_mb_y - 4;
        h->mb.mv_max_fpel[1] = 8*( h->mb.i_mb_height - h->mb.i_mb_y - 1 ) + 4;
        h->mb.mv_min_spel[1] = 4*( h->mb.mv_min_fpel[1] - 8 );
        h->mb.mv_max_spel[1] = 4*( h->mb.mv_max_fpel[1] + 8 );
    }

#define LOAD_HPELS_LUMA(dst, src) \
    { \
        (dst)[0] = &(src)[0][i_pel_offset]; \
        (dst)[1] = &(src)[1][i_pel_offset]; \
        (dst)[2] = &(src)[2][i_pel_offset]; \
        (dst)[3] = &(src)[3][i_pel_offset]; \
    }
#define LOAD_WPELS_LUMA(dst,src) \
    (dst) = &(src)[i_pel_offset];

#define CLIP_MV( mv ) \
    { \
        mv[0] = x264_clip3( mv[0], h->mb.mv_min_spel[0], h->mb.mv_max_spel[0] ); \
        mv[1] = x264_clip3( mv[1], h->mb.mv_min_spel[1], h->mb.mv_max_spel[1] ); \
    }
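
/* TRY_BIDIR averages the list0/list1 predictions and keeps the combined cost
 * if it beats i_bcost.  With subpel refine <= 1, hpel_idx picks the nearest
 * of the four precomputed lowres half-pel planes directly; otherwise get_ref
 * produces a proper quarter-pel reference. */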
#define TRY_BIDIR( mv0, mv1, penalty ) \
    { \
        int i_cost; \
        if( h->param.analyse.i_subpel_refine <= 1 ) \
        { \
            int hpel_idx1 = (((mv0)[0]&2)>>1) + ((mv0)[1]&2); \
            int hpel_idx2 = (((mv1)[0]&2)>>1) + ((mv1)[1]&2); \
            pixel *src1 = m[0].p_fref[hpel_idx1] + ((mv0)[0]>>2) + ((mv0)[1]>>2) * m[0].i_stride[0]; \
            pixel *src2 = m[1].p_fref[hpel_idx2] + ((mv1)[0]>>2) + ((mv1)[1]>>2) * m[1].i_stride[0]; \
            h->mc.avg[PIXEL_8x8]( pix1, 16, src1, m[0].i_stride[0], src2, m[1].i_stride[0], i_bipred_weight ); \
        } \
        else \
        { \
            int stride1 = 16, stride2 = 16; \
            pixel *src1, *src2; \
            src1 = h->mc.get_ref( pix1, &stride1, m[0].p_fref, m[0].i_stride[0], \
                                  (mv0)[0], (mv0)[1], 8, 8, w ); \
            src2 = h->mc.get_ref( pix2, &stride2, m[1].p_fref, m[1].i_stride[0], \
                                  (mv1)[0], (mv1)[1], 8, 8, w ); \
            h->mc.avg[PIXEL_8x8]( pix1, 16, src1, stride1, src2, stride2, i_bipred_weight ); \
        } \
        i_cost = penalty * a->i_lambda + h->pixf.mbcmp[PIXEL_8x8]( \
                           m[0].p_fenc[0], FENC_STRIDE, pix1, 16 ); \
        COPY2_IF_LT( i_bcost, i_cost, list_used, 3 ); \
    }

    m[0].i_pixel = PIXEL_8x8;
    m[0].p_cost_mv = a->p_cost_mv;
    m[0].i_stride[0] = i_stride;
    m[0].p_fenc[0] = h->mb.pic.p_fenc[0];
    m[0].weight = w;
    m[0].i_ref = 0;
    LOAD_HPELS_LUMA( m[0].p_fref, fref0->lowres );
    m[0].p_fref_w = m[0].p_fref[0];
    if( w[0].weightfn )
        LOAD_WPELS_LUMA( m[0].p_fref_w, fenc->weighted[0] );

    if( b_bidir )
    {
        int16_t *mvr = fref1->lowres_mvs[0][p1-p0-1][i_mb_xy];
        ALIGNED_ARRAY_8( int16_t, dmv,[2],[2] );

        m[1].i_pixel = PIXEL_8x8;
        m[1].p_cost_mv = a->p_cost_mv;
        m[1].i_stride[0] = i_stride;
        m[1].p_fenc[0] = h->mb.pic.p_fenc[0];
        m[1].i_ref = 0;
        m[1].weight = x264_weight_none;
        LOAD_HPELS_LUMA( m[1].p_fref, fref1->lowres );
        m[1].p_fref_w = m[1].p_fref[0];

        dmv[0][0] = ( mvr[0] * dist_scale_factor + 128 ) >> 8;
        dmv[0][1] = ( mvr[1] * dist_scale_factor + 128 ) >> 8;
        dmv[1][0] = dmv[0][0] - mvr[0];
        dmv[1][1] = dmv[0][1] - mvr[1];
        CLIP_MV( dmv[0] );
        CLIP_MV( dmv[1] );
        if( h->param.analyse.i_subpel_refine <= 1 )
            M64( dmv ) &= ~0x0001000100010001ULL; /* mv & ~1 */

        TRY_BIDIR( dmv[0], dmv[1], 0 );
        if( M64( dmv ) )
        {
            int i_cost;
            h->mc.avg[PIXEL_8x8]( pix1, 16, m[0].p_fref[0], m[0].i_stride[0], m[1].p_fref[0], m[1].i_stride[0], i_bipred_weight );
            i_cost = h->pixf.mbcmp[PIXEL_8x8]( m[0].p_fenc[0], FENC_STRIDE, pix1, 16 );
            COPY2_IF_LT( i_bcost, i_cost, list_used, 3 );
        }
    }

    for( int l = 0; l < 1 + b_bidir; l++ )
    {
        if( do_search[l] )
        {
            int i_mvc = 0;
            int16_t (*fenc_mv)[2] = fenc_mvs[l];
            ALIGNED_4( int16_t mvc[4][2] );

            /* Reverse-order MV prediction. */
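            /* The frame is scanned bottom-right to top-left (so these MVs can
             * seed prediction in the forward main encode), which makes the
             * already-searched neighbors the ones to the right and below
             * rather than the usual left/top set. */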
            M32( mvc[0] ) = 0;
            M32( mvc[2] ) = 0;
#define MVC(mv) { CP32( mvc[i_mvc], mv ); i_mvc++; }
            if( i_mb_x < h->mb.i_mb_width - 1 )
                MVC( fenc_mv[1] );
            if( i_mb_y < h->mb.i_mb_height - 1 )
            {
                MVC( fenc_mv[i_mb_stride] );
                if( i_mb_x > 0 )
                    MVC( fenc_mv[i_mb_stride-1] );
                if( i_mb_x < h->mb.i_mb_width - 1 )
                    MVC( fenc_mv[i_mb_stride+1] );
            }
#undef MVC
            if( i_mvc <= 1 )
                CP32( m[l].mvp, mvc[0] );
            else
                x264_median_mv( m[l].mvp, mvc[0], mvc[1], mvc[2] );

            /* Fast skip for cases of near-zero residual.  Shortcut: don't bother except in the mv0 case,
             * since anything else is likely to have enough residual to not trigger the skip. */
            if( !M32( m[l].mvp ) )
            {
                m[l].cost = h->pixf.mbcmp[PIXEL_8x8]( m[l].p_fenc[0], FENC_STRIDE, m[l].p_fref[0], m[l].i_stride[0] );
                if( m[l].cost < 64 )
                {
                    M32( m[l].mv ) = 0;
                    goto skip_motionest;
                }
            }

            x264_me_search( h, &m[l], mvc, i_mvc );
            m[l].cost -= a->p_cost_mv[0]; // remove mvcost from skip mbs
            if( M32( m[l].mv ) )
                m[l].cost += 5 * a->i_lambda;

skip_motionest:
            CP32( fenc_mvs[l], m[l].mv );
            *fenc_costs[l] = m[l].cost;
        }
        else
        {
            CP32( m[l].mv, fenc_mvs[l] );
            m[l].cost = *fenc_costs[l];
        }
        COPY2_IF_LT( i_bcost, m[l].cost, list_used, l+1 );
    }

    if( b_bidir && ( M32( m[0].mv ) || M32( m[1].mv ) ) )
        TRY_BIDIR( m[0].mv, m[1].mv, 5 );

lowres_intra_mb:
    if( !fenc->b_intra_calculated )
    {
        ALIGNED_ARRAY_16( pixel, edge,[36] );
        pixel *pix = &pix1[8+FDEC_STRIDE - 1];
        pixel *src = &fenc->lowres[0][i_pel_offset - 1];
        const int intra_penalty = 5 * a->i_lambda;
        int satds[3];

        memcpy( pix-FDEC_STRIDE, src-i_stride, 17 * sizeof(pixel) );
        for( int i = 0; i < 8; i++ )
            pix[i*FDEC_STRIDE] = src[i*i_stride];
        pix++;

        h->pixf.intra_mbcmp_x3_8x8c( h->mb.pic.p_fenc[0], pix, satds );
        int i_icost = X264_MIN3( satds[0], satds[1], satds[2] );

        if( h->param.analyse.i_subpel_refine > 1 )
        {
            h->predict_8x8c[I_PRED_CHROMA_P]( pix );
            int satd = h->pixf.mbcmp[PIXEL_8x8]( pix, FDEC_STRIDE, h->mb.pic.p_fenc[0], FENC_STRIDE );
            i_icost = X264_MIN( i_icost, satd );
            h->predict_8x8_filter( pix, edge, ALL_NEIGHBORS, ALL_NEIGHBORS );
            for( int i = 3; i < 9; i++ )
            {
                h->predict_8x8[i]( pix, edge );
                satd = h->pixf.mbcmp[PIXEL_8x8]( pix, FDEC_STRIDE, h->mb.pic.p_fenc[0], FENC_STRIDE );
                i_icost = X264_MIN( i_icost, satd );
            }
        }

        i_icost += intra_penalty;
        fenc->i_intra_cost[i_mb_xy] = i_icost;
        int i_icost_aq = i_icost;
        if( h->param.rc.i_aq_mode )
            i_icost_aq = (i_icost_aq * fenc->i_inv_qscale_factor[i_mb_xy] + 128) >> 8;
        fenc->i_row_satds[0][0][h->mb.i_mb_y] += i_icost_aq;
        if( b_frame_score_mb )
        {
            fenc->i_cost_est[0][0] += i_icost;
            fenc->i_cost_est_aq[0][0] += i_icost_aq;
        }
    }

    /* forbid intra-mbs in B-frames, because it's rare and not worth checking */
    /* FIXME: Should we still forbid them now that we cache intra scores? */
    if( !b_bidir )
    {
        int i_icost = fenc->i_intra_cost[i_mb_xy];
        int b_intra = i_icost < i_bcost;
        if( b_intra )
        {
            i_bcost = i_icost;
            list_used = 0;
        }
        if( b_frame_score_mb )
            fenc->i_intra_mbs[b-p0] += b_intra;
    }

    /* In an I-frame, we've already added the results above in the intra section. */
    if( p0 != p1 )
    {
        int i_bcost_aq = i_bcost;
        if( h->param.rc.i_aq_mode )
            i_bcost_aq = (i_bcost_aq * fenc->i_inv_qscale_factor[i_mb_xy] + 128) >> 8;
        fenc->i_row_satds[b-p0][p1-b][h->mb.i_mb_y] += i_bcost_aq;
        if( b_frame_score_mb )
        {
            /* Don't use AQ-weighted costs for slicetype decision, only for ratecontrol. */
            fenc->i_cost_est[b-p0][p1-b] += i_bcost;
            fenc->i_cost_est_aq[b-p0][p1-b] += i_bcost_aq;
        }
    }

    fenc->lowres_costs[b-p0][p1-b][i_mb_xy] = X264_MIN( i_bcost, LOWRES_COST_MASK ) + (list_used << LOWRES_COST_SHIFT);
}
#undef TRY_BIDIR

#define NUM_MBS\
   (h->mb.i_mb_width > 2 && h->mb.i_mb_height > 2 ?\
   (h->mb.i_mb_width - 2) * (h->mb.i_mb_height - 2) :\
    h->mb.i_mb_width * h->mb.i_mb_height)

static int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a,
                                      x264_frame_t **frames, int p0, int p1, int b,
                                      int b_intra_penalty )
{
    int i_score = 0;
    int do_search[2];
    const x264_weight_t *w = x264_weight_none;
    /* Check whether we already evaluated this frame
     * If we have tried this frame as P, then we have also tried
     * the preceding frames as B. (is this still true?) */
    /* Also check that we already calculated the row SATDs for the current frame. */
    if( frames[b]->i_cost_est[b-p0][p1-b] >= 0 && (!h->param.rc.i_vbv_buffer_size || frames[b]->i_row_satds[b-p0][p1-b][0] != -1) )
        i_score = frames[b]->i_cost_est[b-p0][p1-b];
    else
    {
        int dist_scale_factor = 128;
        int *row_satd = frames[b]->i_row_satds[b-p0][p1-b];
        int *row_satd_intra = frames[b]->i_row_satds[0][0];

        /* For each list, check to see whether we have lowres motion-searched this reference frame before. */
        do_search[0] = b != p0 && frames[b]->lowres_mvs[0][b-p0-1][0][0] == 0x7FFF;
        do_search[1] = b != p1 && frames[b]->lowres_mvs[1][p1-b-1][0][0] == 0x7FFF;
        if( do_search[0] )
        {
            if( h->param.analyse.i_weighted_pred && b == p1 )
            {
                x264_emms();
                x264_weights_analyse( h, frames[b], frames[p0], 1 );
                w = frames[b]->weight[0];
            }
            frames[b]->lowres_mvs[0][b-p0-1][0][0] = 0;
        }
        if( do_search[1] ) frames[b]->lowres_mvs[1][p1-b-1][0][0] = 0;

        if( b == p1 )
            frames[b]->i_intra_mbs[b-p0] = 0;
        if( !frames[b]->b_intra_calculated )
        {
            frames[b]->i_cost_est[0][0] = 0;
            frames[b]->i_cost_est_aq[0][0] = 0;
        }
        if( p1 != p0 )
            dist_scale_factor = ( ((b-p0) << 8) + ((p1-p0) >> 1) ) / (p1-p0);

        frames[b]->i_cost_est[b-p0][p1-b] = 0;
        frames[b]->i_cost_est_aq[b-p0][p1-b] = 0;

        /* Lowres lookahead goes backwards because the MVs are used as predictors in the main encode.
         * This considerably improves MV prediction overall. */

        /* The edge mbs seem to reduce the predictive quality of the
         * whole frame's score, but are needed for a spatial distribution. */
        if( h->param.rc.b_mb_tree || h->param.rc.i_vbv_buffer_size ||
            h->mb.i_mb_width <= 2 || h->mb.i_mb_height <= 2 )
        {
            for( h->mb.i_mb_y = h->mb.i_mb_height - 1; h->mb.i_mb_y >= 0; h->mb.i_mb_y-- )
            {
                row_satd[h->mb.i_mb_y] = 0;
                if( !frames[b]->b_intra_calculated )
                    row_satd_intra[h->mb.i_mb_y] = 0;
                for( h->mb.i_mb_x = h->mb.i_mb_width - 1; h->mb.i_mb_x >= 0; h->mb.i_mb_x-- )
                    x264_slicetype_mb_cost( h, a, frames, p0, p1, b, dist_scale_factor, do_search, w );
            }
        }
        else
        {
            for( h->mb.i_mb_y = h->mb.i_mb_height - 2; h->mb.i_mb_y >= 1; h->mb.i_mb_y-- )
                for( h->mb.i_mb_x = h->mb.i_mb_width - 2; h->mb.i_mb_x >= 1; h->mb.i_mb_x-- )
                    x264_slicetype_mb_cost( h, a, frames, p0, p1, b, dist_scale_factor, do_search, w );
        }

        i_score = frames[b]->i_cost_est[b-p0][p1-b];
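        /* Deflate raw B-frame costs by 100/(120 + b-bias) before they are
         * compared against P alternatives, so slicetype decision tolerates
         * B-frames that are somewhat more expensive than the corresponding P;
         * raising --b-bias increases that tolerance. */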
        if( b != p1 )
            i_score = (uint64_t)i_score * 100 / (120 + h->param.i_bframe_bias);
        else
            frames[b]->b_intra_calculated = 1;

        frames[b]->i_cost_est[b-p0][p1-b] = i_score;
        x264_emms();
    }

    if( b_intra_penalty )
    {
        // arbitrary penalty for I-blocks after B-frames
        int nmb = NUM_MBS;
        i_score += (uint64_t)i_score * frames[b]->i_intra_mbs[b-p0] / (nmb * 8);
    }
    return i_score;
}

/* If MB-tree changes the quantizers, we need to recalculate the frame cost without
 * re-running lookahead. */
static int x264_slicetype_frame_cost_recalculate( x264_t *h, x264_frame_t **frames, int p0, int p1, int b )
{
    int i_score = 0;
    int *row_satd = frames[b]->i_row_satds[b-p0][p1-b];
    float *qp_offset = IS_X264_TYPE_B(frames[b]->i_type) ? frames[b]->f_qp_offset_aq : frames[b]->f_qp_offset;
    x264_emms();
    for( h->mb.i_mb_y = h->mb.i_mb_height - 1; h->mb.i_mb_y >= 0; h->mb.i_mb_y-- )
    {
        row_satd[ h->mb.i_mb_y ] = 0;
        for( h->mb.i_mb_x = h->mb.i_mb_width - 1; h->mb.i_mb_x >= 0; h->mb.i_mb_x-- )
        {
            int i_mb_xy = h->mb.i_mb_x + h->mb.i_mb_y*h->mb.i_mb_stride;
            int i_mb_cost = frames[b]->lowres_costs[b-p0][p1-b][i_mb_xy] & LOWRES_COST_MASK;
            float qp_adj = qp_offset[i_mb_xy];
            i_mb_cost = (i_mb_cost * x264_exp2fix8(qp_adj) + 128) >> 8;
            row_satd[ h->mb.i_mb_y ] += i_mb_cost;
            if( (h->mb.i_mb_y > 0 && h->mb.i_mb_y < h->mb.i_mb_height - 1 &&
                 h->mb.i_mb_x > 0 && h->mb.i_mb_x < h->mb.i_mb_width - 1) ||
                 h->mb.i_mb_width <= 2 || h->mb.i_mb_height <= 2 )
            {
                i_score += i_mb_cost;
            }
        }
    }
    return i_score;
}

static void x264_macroblock_tree_finish( x264_t *h, x264_frame_t *frame, float average_duration, int ref0_distance )
{
    int fps_factor = round( CLIP_DURATION(average_duration) / CLIP_DURATION(frame->f_duration) * 256 );
    float weightdelta = 0.0;
    if( ref0_distance && frame->f_weighted_cost_delta[ref0_distance-1] > 0 )
        weightdelta = (1.0 - frame->f_weighted_cost_delta[ref0_distance-1]);

    /* Allow the strength to be adjusted via qcompress, since the two
     * concepts are very similar. */
    float strength = 5.0f * (1.0f - h->param.rc.f_qcompress);
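    /* Each MB's QP is lowered by strength * log2((intra + propagate)/intra):
     * the more future cost flows through a block as a prediction source, the
     * more bits it is given now.  weightdelta applies a small correction when
     * weighted prediction changed the reference cost. */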
    for( int mb_index = 0; mb_index < h->mb.i_mb_count; mb_index++ )
    {
        int intra_cost = (frame->i_intra_cost[mb_index] * frame->i_inv_qscale_factor[mb_index] + 128) >> 8;
        if( intra_cost )
        {
            int propagate_cost = (frame->i_propagate_cost[mb_index] * fps_factor + 128) >> 8;
            float log2_ratio = x264_log2(intra_cost + propagate_cost) - x264_log2(intra_cost) + weightdelta;
            frame->f_qp_offset[mb_index] = frame->f_qp_offset_aq[mb_index] - strength * log2_ratio;
        }
    }
}

static void x264_macroblock_tree_propagate( x264_t *h, x264_frame_t **frames, float average_duration, int p0, int p1, int b, int referenced )
{
    uint16_t *ref_costs[2] = {frames[p0]->i_propagate_cost,frames[p1]->i_propagate_cost};
    int dist_scale_factor = ( ((b-p0) << 8) + ((p1-p0) >> 1) ) / (p1-p0);
    int i_bipred_weight = h->param.analyse.b_weighted_bipred ? 64 - (dist_scale_factor>>2) : 32;
    int16_t (*mvs[2])[2] = { frames[b]->lowres_mvs[0][b-p0-1], frames[b]->lowres_mvs[1][p1-b-1] };
    int bipred_weights[2] = {i_bipred_weight, 64 - i_bipred_weight};
    int *buf = h->scratch_buffer;
    uint16_t *propagate_cost = frames[b]->i_propagate_cost;

    x264_emms();
    float fps_factor = CLIP_DURATION(frames[b]->f_duration) / CLIP_DURATION(average_duration);

    /* For non-reffed frames the source costs are always zero, so just memset one row and re-use it. */
    if( !referenced )
        memset( frames[b]->i_propagate_cost, 0, h->mb.i_mb_width * sizeof(uint16_t) );

    for( h->mb.i_mb_y = 0; h->mb.i_mb_y < h->mb.i_mb_height; h->mb.i_mb_y++ )
    {
        int mb_index = h->mb.i_mb_y*h->mb.i_mb_stride;
        h->mc.mbtree_propagate_cost( buf, propagate_cost,
            frames[b]->i_intra_cost+mb_index, frames[b]->lowres_costs[b-p0][p1-b]+mb_index,
            frames[b]->i_inv_qscale_factor+mb_index, &fps_factor, h->mb.i_mb_width );
        if( referenced )
            propagate_cost += h->mb.i_mb_width;
        for( h->mb.i_mb_x = 0; h->mb.i_mb_x < h->mb.i_mb_width; h->mb.i_mb_x++, mb_index++ )
        {
            int propagate_amount = buf[h->mb.i_mb_x];
            /* Don't propagate for an intra block. */
            if( propagate_amount > 0 )
            {
                /* Extract the 2-bit lists_used bitfield from the packed cost. */
                int lists_used = frames[b]->lowres_costs[b-p0][p1-b][mb_index] >> LOWRES_COST_SHIFT;
                /* Follow the MVs to the previous frame(s). */
                for( int list = 0; list < 2; list++ )
                    if( (lists_used >> list)&1 )
                    {
#define CLIP_ADD(s,x) (s) = X264_MIN((s)+(x),(1<<16)-1)
                        int listamount = propagate_amount;
                        /* Apply bipred weighting. */
                        if( lists_used == 3 )
                            listamount = (listamount * bipred_weights[list] + 32) >> 6;

                        /* Early termination for simple case of mv0. */
                        if( !M32( mvs[list][mb_index] ) )
                        {
                            CLIP_ADD( ref_costs[list][mb_index], listamount );
                            continue;
                        }

                        int x = mvs[list][mb_index][0];
                        int y = mvs[list][mb_index][1];
                        int mbx = (x>>5)+h->mb.i_mb_x;
                        int mby = (y>>5)+h->mb.i_mb_y;
                        int idx0 = mbx + mby * h->mb.i_mb_stride;
                        int idx1 = idx0 + 1;
                        int idx2 = idx0 + h->mb.i_mb_stride;
                        int idx3 = idx0 + h->mb.i_mb_stride + 1;
                        x &= 31;
                        y &= 31;
                        int idx0weight = (32-y)*(32-x);
                        int idx1weight = (32-y)*x;
                        int idx2weight = y*(32-x);
                        int idx3weight = y*x;
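                        /* Bilinear splat: lowres MBs are 8x8 and MVs are in
                         * quarter-pel, so 32 units span one MB; the four
                         * weights sum to 32*32 = 1024, matching the >>10
                         * normalization below. */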

                        /* We could just clip the MVs, but pixels that lie outside the frame probably shouldn't
                         * be counted. */
                        if( mbx < h->mb.i_mb_width-1 && mby < h->mb.i_mb_height-1 && mbx >= 0 && mby >= 0 )
                        {
                            CLIP_ADD( ref_costs[list][idx0], (listamount*idx0weight+512)>>10 );
                            CLIP_ADD( ref_costs[list][idx1], (listamount*idx1weight+512)>>10 );
                            CLIP_ADD( ref_costs[list][idx2], (listamount*idx2weight+512)>>10 );
                            CLIP_ADD( ref_costs[list][idx3], (listamount*idx3weight+512)>>10 );
                        }
                        else /* Check offsets individually */
                        {
                            if( mbx < h->mb.i_mb_width && mby < h->mb.i_mb_height && mbx >= 0 && mby >= 0 )
                                CLIP_ADD( ref_costs[list][idx0], (listamount*idx0weight+512)>>10 );
                            if( mbx+1 < h->mb.i_mb_width && mby < h->mb.i_mb_height && mbx+1 >= 0 && mby >= 0 )
                                CLIP_ADD( ref_costs[list][idx1], (listamount*idx1weight+512)>>10 );
                            if( mbx < h->mb.i_mb_width && mby+1 < h->mb.i_mb_height && mbx >= 0 && mby+1 >= 0 )
                                CLIP_ADD( ref_costs[list][idx2], (listamount*idx2weight+512)>>10 );
                            if( mbx+1 < h->mb.i_mb_width && mby+1 < h->mb.i_mb_height && mbx+1 >= 0 && mby+1 >= 0 )
                                CLIP_ADD( ref_costs[list][idx3], (listamount*idx3weight+512)>>10 );
                        }
                    }
            }
        }
    }

    if( h->param.rc.i_vbv_buffer_size && h->param.rc.i_lookahead && referenced )
        x264_macroblock_tree_finish( h, frames[b], average_duration, b == p1 ? b - p0 : 0 );
}

static void x264_macroblock_tree( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int num_frames, int b_intra )
{
    int idx = !b_intra;
    int last_nonb, cur_nonb = 1;
    int bframes = 0;

    x264_emms();
    float total_duration = 0.0;
    for( int j = 0; j <= num_frames; j++ )
        total_duration += frames[j]->f_duration;
    float average_duration = total_duration / (num_frames + 1);

    int i = num_frames;

    if( b_intra )
        x264_slicetype_frame_cost( h, a, frames, 0, 0, 0, 0 );

    while( i > 0 && frames[i]->i_type == X264_TYPE_B )
        i--;
    last_nonb = i;

    /* Lookaheadless MB-tree is not a theoretically distinct case; the same extrapolation could
     * be applied to the end of a lookahead buffer of any size.  However, it's most needed when
     * lookahead=0, so that's what's currently implemented. */
    if( !h->param.rc.i_lookahead )
    {
        if( b_intra )
        {
            memset( frames[0]->i_propagate_cost, 0, h->mb.i_mb_count * sizeof(uint16_t) );
            memcpy( frames[0]->f_qp_offset, frames[0]->f_qp_offset_aq, h->mb.i_mb_count * sizeof(float) );
            return;
        }
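        /* Hedged reading, not an original comment: frames[0] still carries the
         * propagate costs left by the previous lookaheadless call (they are
         * swapped back onto frames[0] at the end of this function), so the
         * swap below seeds the new last non-B frame with an extrapolation of
         * future references. */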
        XCHG( uint16_t*, frames[last_nonb]->i_propagate_cost, frames[0]->i_propagate_cost );
        memset( frames[0]->i_propagate_cost, 0, h->mb.i_mb_count * sizeof(uint16_t) );
    }
    else
    {
        if( last_nonb < idx )
            return;
        memset( frames[last_nonb]->i_propagate_cost, 0, h->mb.i_mb_count * sizeof(uint16_t) );
    }

    while( i-- > idx )
    {
        cur_nonb = i;
        while( frames[cur_nonb]->i_type == X264_TYPE_B && cur_nonb > 0 )
            cur_nonb--;
        if( cur_nonb < idx )
            break;
        x264_slicetype_frame_cost( h, a, frames, cur_nonb, last_nonb, last_nonb, 0 );
        memset( frames[cur_nonb]->i_propagate_cost, 0, h->mb.i_mb_count * sizeof(uint16_t) );
        bframes = last_nonb - cur_nonb - 1;
        if( h->param.i_bframe_pyramid && bframes > 1 )
        {
            int middle = (bframes + 1)/2 + cur_nonb;
            x264_slicetype_frame_cost( h, a, frames, cur_nonb, last_nonb, middle, 0 );
            memset( frames[middle]->i_propagate_cost, 0, h->mb.i_mb_count * sizeof(uint16_t) );
            while( i > cur_nonb )
            {
                int p0 = i > middle ? middle : cur_nonb;
                int p1 = i < middle ? middle : last_nonb;
                if( i != middle )
                {
                    x264_slicetype_frame_cost( h, a, frames, p0, p1, i, 0 );
                    x264_macroblock_tree_propagate( h, frames, average_duration, p0, p1, i, 0 );
                }
                i--;
            }
            x264_macroblock_tree_propagate( h, frames, average_duration, cur_nonb, last_nonb, middle, 1 );
        }
        else
        {
            while( i > cur_nonb )
            {
                x264_slicetype_frame_cost( h, a, frames, cur_nonb, last_nonb, i, 0 );
                x264_macroblock_tree_propagate( h, frames, average_duration, cur_nonb, last_nonb, i, 0 );
                i--;
            }
        }
        x264_macroblock_tree_propagate( h, frames, average_duration, cur_nonb, last_nonb, last_nonb, 1 );
        last_nonb = cur_nonb;
    }

    if( !h->param.rc.i_lookahead )
    {
        x264_macroblock_tree_propagate( h, frames, average_duration, 0, last_nonb, last_nonb, 1 );
        XCHG( uint16_t*, frames[last_nonb]->i_propagate_cost, frames[0]->i_propagate_cost );
    }

    x264_macroblock_tree_finish( h, frames[last_nonb], average_duration, last_nonb );
    if( h->param.i_bframe_pyramid && bframes > 1 && !h->param.rc.i_vbv_buffer_size )
        x264_macroblock_tree_finish( h, frames[last_nonb+(bframes+1)/2], average_duration, 0 );
}
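
/* A hedged summary, not part of the original source: the loop above walks the
 * lookahead backward, so each frame's accumulated propagate cost is splatted
 * into its references before those references are themselves processed, and
 * x264_macroblock_tree_finish then turns the costs accumulated on each non-B
 * frame into per-MB QP offsets. */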

static int x264_vbv_frame_cost( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int p0, int p1, int b )
{
    int cost = x264_slicetype_frame_cost( h, a, frames, p0, p1, b, 0 );
    if( h->param.rc.i_aq_mode )
    {
        if( h->param.rc.b_mb_tree )
            return x264_slicetype_frame_cost_recalculate( h, frames, p0, p1, b );
        else
            return frames[b]->i_cost_est_aq[b-p0][p1-b];
    }
    return cost;
}

static void x264_calculate_durations( x264_t *h, x264_frame_t *cur_frame, x264_frame_t *prev_frame, int64_t *i_cpb_delay, int64_t *i_coded_fields )
{
    cur_frame->i_cpb_delay = *i_cpb_delay;
    cur_frame->i_dpb_output_delay = cur_frame->i_field_cnt - *i_coded_fields;

    // add a correction term for frame reordering
    cur_frame->i_dpb_output_delay += h->sps->vui.i_num_reorder_frames*2;

    // fix possible negative dpb_output_delay because of pulldown changes and reordering
    if( cur_frame->i_dpb_output_delay < 0 )
    {
        cur_frame->i_cpb_delay += cur_frame->i_dpb_output_delay;
        cur_frame->i_dpb_output_delay = 0;
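        /* Note: i_dpb_output_delay was zeroed just above, so the addition
         * below adds zero as written; the pre-clamp (negative) delay may have
         * been the intended correction. */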
        if( prev_frame )
            prev_frame->i_cpb_duration += cur_frame->i_dpb_output_delay;
    }

    if( cur_frame->b_keyframe )
        *i_cpb_delay = 0;

    *i_cpb_delay += cur_frame->i_duration;
    *i_coded_fields += cur_frame->i_duration;
    cur_frame->i_cpb_duration = cur_frame->i_duration;
}

static void x264_vbv_lookahead( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int num_frames, int keyframe )
{
    int last_nonb = 0, cur_nonb = 1, idx = 0;
    x264_frame_t *prev_frame = NULL;
    int prev_frame_idx = 0;
    while( cur_nonb < num_frames && frames[cur_nonb]->i_type == X264_TYPE_B )
        cur_nonb++;
    int next_nonb = keyframe ? last_nonb : cur_nonb;

    if( frames[cur_nonb]->i_coded_fields_lookahead >= 0 )
    {
        h->i_coded_fields_lookahead = frames[cur_nonb]->i_coded_fields_lookahead;
        h->i_cpb_delay_lookahead = frames[cur_nonb]->i_cpb_delay_lookahead;
    }

    while( cur_nonb < num_frames )
    {
        /* P/I cost: This shouldn't include the cost of next_nonb */
        if( next_nonb != cur_nonb )
        {
            int p0 = IS_X264_TYPE_I( frames[cur_nonb]->i_type ) ? cur_nonb : last_nonb;
            frames[next_nonb]->i_planned_satd[idx] = x264_vbv_frame_cost( h, a, frames, p0, cur_nonb, cur_nonb );
            frames[next_nonb]->i_planned_type[idx] = frames[cur_nonb]->i_type;
            frames[cur_nonb]->i_coded_fields_lookahead = h->i_coded_fields_lookahead;
            frames[cur_nonb]->i_cpb_delay_lookahead = h->i_cpb_delay_lookahead;
            x264_calculate_durations( h, frames[cur_nonb], prev_frame, &h->i_cpb_delay_lookahead, &h->i_coded_fields_lookahead );
            if( prev_frame )
            {
                frames[next_nonb]->f_planned_cpb_duration[prev_frame_idx] = (double)prev_frame->i_cpb_duration *
                                                                            h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale;
            }
            frames[next_nonb]->f_planned_cpb_duration[idx] = (double)frames[cur_nonb]->i_cpb_duration *
                                                             h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale;
            prev_frame = frames[cur_nonb];
            prev_frame_idx = idx;
            idx++;
        }
        /* Handle the B-frames: coded order */
        for( int i = last_nonb+1; i < cur_nonb; i++, idx++ )
        {
            frames[next_nonb]->i_planned_satd[idx] = x264_vbv_frame_cost( h, a, frames, last_nonb, cur_nonb, i );
            frames[next_nonb]->i_planned_type[idx] = X264_TYPE_B;
            frames[i]->i_coded_fields_lookahead = h->i_coded_fields_lookahead;
            frames[i]->i_cpb_delay_lookahead = h->i_cpb_delay_lookahead;
            x264_calculate_durations( h, frames[i], prev_frame, &h->i_cpb_delay_lookahead, &h->i_coded_fields_lookahead );
            if( prev_frame )
            {
                frames[next_nonb]->f_planned_cpb_duration[prev_frame_idx] = (double)prev_frame->i_cpb_duration *
                                                                            h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale;
            }
            frames[next_nonb]->f_planned_cpb_duration[idx] = (double)frames[i]->i_cpb_duration *
                                                             h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale;
            prev_frame = frames[i];
            prev_frame_idx = idx;
        }
        last_nonb = cur_nonb;
        cur_nonb++;
        while( cur_nonb <= num_frames && frames[cur_nonb]->i_type == X264_TYPE_B )
            cur_nonb++;
    }
    frames[next_nonb]->i_planned_type[idx] = X264_TYPE_AUTO;
}

static int x264_slicetype_path_cost( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, char *path, int threshold )
{
    int loc = 1;
    int cost = 0;
    int cur_p = 0;
    path--; /* Since the 1st path element is really the second frame */
    while( path[loc] )
    {
        int next_p = loc;
        /* Find the location of the next P-frame. */
        while( path[next_p] != 'P' )
            next_p++;

        /* Add the cost of the P-frame found above */
        cost += x264_slicetype_frame_cost( h, a, frames, cur_p, next_p, next_p, 0 );
        /* Early terminate if the cost we have found is larger than the best path cost so far */
        if( cost > threshold )
            break;

        if( h->param.i_bframe_pyramid && next_p - cur_p > 2 )
        {
            int middle = cur_p + (next_p - cur_p)/2;
            cost += x264_slicetype_frame_cost( h, a, frames, cur_p, next_p, middle, 0 );
            for( int next_b = loc; next_b < middle && cost < threshold; next_b++ )
                cost += x264_slicetype_frame_cost( h, a, frames, cur_p, middle, next_b, 0 );
            for( int next_b = middle+1; next_b < next_p && cost < threshold; next_b++ )
                cost += x264_slicetype_frame_cost( h, a, frames, middle, next_p, next_b, 0 );
        }
        else
            for( int next_b = loc; next_b < next_p && cost < threshold; next_b++ )
                cost += x264_slicetype_frame_cost( h, a, frames, cur_p, next_p, next_b, 0 );

        loc = next_p + 1;
        cur_p = next_p;
    }
    return cost;
}

/* Viterbi/trellis slicetype decision algorithm. */
/* Uses strings because the speed of the control functions is negligible
   compared to the cost of running slicetype_frame_cost, and because strings
   make debugging easier. */
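/* Worked example (an illustration, not an original comment): with i_bframe = 2
   and length = 3, the candidate paths are best(2)+"P", best(1)+"BP" and
   best(0)+"BBP", where best(n) is the cheapest path already found for the
   first n frames; the cheapest candidate is stored as best(3).  Each decision
   therefore builds on optimal shorter prefixes, Viterbi-style. */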
static void x264_slicetype_path( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int length, char (*best_paths)[X264_LOOKAHEAD_MAX+1] )
{
    char paths[2][X264_LOOKAHEAD_MAX+1];
    int num_paths = X264_MIN( h->param.i_bframe+1, length );
    int best_cost = COST_MAX;
    int idx = 0;

    /* Iterate over all currently possible paths */
    for( int path = 0; path < num_paths; path++ )
    {
        /* Add suffixes to the current path */
        int len = length - (path + 1);
        memcpy( paths[idx], best_paths[len % (X264_BFRAME_MAX+1)], len );
        memset( paths[idx]+len, 'B', path );
        strcpy( paths[idx]+len+path, "P" );

        /* Calculate the actual cost of the current path */
        int cost = x264_slicetype_path_cost( h, a, frames, paths[idx], best_cost );
        if( cost < best_cost )
        {
            best_cost = cost;
            idx ^= 1;
        }
    }

    /* Store the best path. */
    memcpy( best_paths[length % (X264_BFRAME_MAX+1)], paths[idx^1], length );
}

static int scenecut_internal( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int p0, int p1, int real_scenecut )
{
    x264_frame_t *frame = frames[p1];

    /* Don't do scenecuts on the right view of a frame-packed video. */
    if( real_scenecut && h->param.i_frame_packing == 5 && (frame->i_frame&1) )
        return 0;

    x264_slicetype_frame_cost( h, a, frames, p0, p1, p1, 0 );

    int icost = frame->i_cost_est[0][0];
    int pcost = frame->i_cost_est[p1-p0][0];
    float f_bias;
    int i_gop_size = frame->i_frame - h->lookahead->i_last_keyframe;
    float f_thresh_max = h->param.i_scenecut_threshold / 100.0;
    /* magic numbers pulled out of thin air */
    float f_thresh_min = f_thresh_max * 0.25;
    int res;

    if( h->param.i_keyint_min == h->param.i_keyint_max )
        f_thresh_min = f_thresh_max;
    if( i_gop_size <= h->param.i_keyint_min / 4 || h->param.b_intra_refresh )
        f_bias = f_thresh_min / 4;
    else if( i_gop_size <= h->param.i_keyint_min )
        f_bias = f_thresh_min * i_gop_size / h->param.i_keyint_min;
    else
    {
        f_bias = f_thresh_min
                 + ( f_thresh_max - f_thresh_min )
                 * ( i_gop_size - h->param.i_keyint_min )
                 / ( h->param.i_keyint_max - h->param.i_keyint_min );
    }
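    /* Worked example (illustrative numbers): with scenecut threshold 40,
     * f_thresh_max = 0.40 and f_thresh_min = 0.10; at keyint_min = 25 and a
     * GOP 12 frames old, f_bias = 0.10 * 12/25 = 0.048, so a cut fires only if
     * the P cost reaches 95.2% of the intra cost.  Younger GOPs therefore need
     * a proportionally stronger scene change to trigger. */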

    res = pcost >= (1.0 - f_bias) * icost;
    if( res && real_scenecut )
    {
        int imb = frame->i_intra_mbs[p1-p0];
        int pmb = NUM_MBS - imb;
        x264_log( h, X264_LOG_DEBUG, "scene cut at %d Icost:%d Pcost:%d ratio:%.4f bias:%.4f gop:%d (imb:%d pmb:%d)\n",
                  frame->i_frame,
                  icost, pcost, 1. - (double)pcost / icost,
                  f_bias, i_gop_size, imb, pmb );
    }
    return res;
}

static int scenecut( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int p0, int p1, int real_scenecut, int num_frames, int i_max_search )
{
    /* Only do analysis during a normal scenecut check. */
    if( real_scenecut && h->param.i_bframe )
    {
        int origmaxp1 = p0 + 1;
        /* Look ahead to avoid coding short flashes as scenecuts. */
        if( h->param.i_bframe_adaptive == X264_B_ADAPT_TRELLIS )
            /* Don't analyse any more frames than the trellis would have covered. */
            origmaxp1 += h->param.i_bframe;
        else
            origmaxp1++;
        int maxp1 = X264_MIN( origmaxp1, num_frames );

        /* Where A and B are scenes: AAAAAABBBAAAAAA
         * If BBB is shorter than (maxp1-p0), it is detected as a flash
         * and not considered a scenecut. */
        for( int curp1 = p1; curp1 <= maxp1; curp1++ )
            if( !scenecut_internal( h, a, frames, p0, curp1, 0 ) )
                /* Any frame in between p0 and curp1 cannot be a real scenecut. */
                for( int i = curp1; i > p0; i-- )
                    frames[i]->b_scenecut = 0;

        /* Where A-F are scenes: AAAAABBCCDDEEFFFFFF
         * If each of BB ... EE are shorter than (maxp1-p0), they are
         * detected as flashes and not considered scenecuts.
         * Instead, the first F frame becomes a scenecut.
         * If the video ends before F, no frame becomes a scenecut. */
        for( int curp0 = p0; curp0 <= maxp1; curp0++ )
            if( origmaxp1 > i_max_search || (curp0 < maxp1 && scenecut_internal( h, a, frames, curp0, maxp1, 0 )) )
                /* If curp0 is the p0 of a scenecut, it cannot be the p1 of a scenecut. */
                frames[curp0]->b_scenecut = 0;
    }

    /* Ignore frames that are part of a flash, i.e. cannot be real scenecuts. */
    if( !frames[p1]->b_scenecut )
        return 0;
    return scenecut_internal( h, a, frames, p0, p1, real_scenecut );
}

void x264_slicetype_analyse( x264_t *h, int keyframe )
{
    x264_mb_analysis_t a;
    x264_frame_t *frames[X264_LOOKAHEAD_MAX+3] = { NULL, };
    int num_frames, orig_num_frames, keyint_limit, framecnt;
    int i_mb_count = NUM_MBS;
    int cost1p0, cost2p0, cost1b1, cost2p1;
    int i_max_search = X264_MIN( h->lookahead->next.i_size, X264_LOOKAHEAD_MAX );
    int vbv_lookahead = h->param.rc.i_vbv_buffer_size && h->param.rc.i_lookahead;
    if( h->param.b_deterministic )
        i_max_search = X264_MIN( i_max_search, h->lookahead->i_slicetype_length + !keyframe );

    assert( h->frames.b_have_lowres );

    if( !h->lookahead->last_nonb )
        return;
    frames[0] = h->lookahead->last_nonb;
    for( framecnt = 0; framecnt < i_max_search && h->lookahead->next.list[framecnt]->i_type == X264_TYPE_AUTO; framecnt++ )
        frames[framecnt+1] = h->lookahead->next.list[framecnt];

    x264_lowres_context_init( h, &a );

    if( !framecnt )
    {
        if( h->param.rc.b_mb_tree )
            x264_macroblock_tree( h, &a, frames, 0, keyframe );
        return;
    }

    keyint_limit = h->param.i_keyint_max - frames[0]->i_frame + h->lookahead->i_last_keyframe - 1;
    orig_num_frames = num_frames = h->param.b_intra_refresh ? framecnt : X264_MIN( framecnt, keyint_limit );

    /* This is important psy-wise: if we have a non-scenecut keyframe,
     * there will be significant visual artifacts if the frames just before
     * go down in quality due to being referenced less, despite it being
     * more RD-optimal. */
    if( (h->param.analyse.b_psy && h->param.rc.b_mb_tree) || vbv_lookahead )
        num_frames = framecnt;
    else if( h->param.b_open_gop && num_frames < framecnt )
        num_frames++;
    else if( num_frames == 0 )
    {
        frames[1]->i_type = X264_TYPE_I;
        return;
    }

    int num_bframes = 0;
    int num_analysed_frames = num_frames;
    int reset_start;
    if( h->param.i_scenecut_threshold && scenecut( h, &a, frames, 0, 1, 1, orig_num_frames, i_max_search ) )
    {
        frames[1]->i_type = X264_TYPE_I;
        return;
    }

    if( h->param.i_bframe )
    {
        if( h->param.i_bframe_adaptive == X264_B_ADAPT_TRELLIS )
        {
            if( num_frames > 1 )
            {
                char best_paths[X264_BFRAME_MAX+1][X264_LOOKAHEAD_MAX+1] = {"","P"};
                int best_path_index = num_frames % (X264_BFRAME_MAX+1);

                /* Perform the frametype analysis. */
                for( int j = 2; j <= num_frames; j++ )
                    x264_slicetype_path( h, &a, frames, j, best_paths );

                num_bframes = strspn( best_paths[best_path_index], "B" );
                /* Load the results of the analysis into the frame types. */
                for( int j = 1; j < num_frames; j++ )
                    frames[j]->i_type = best_paths[best_path_index][j-1] == 'B' ? X264_TYPE_B : X264_TYPE_P;
            }
            frames[num_frames]->i_type = X264_TYPE_P;
        }
        else if( h->param.i_bframe_adaptive == X264_B_ADAPT_FAST )
        {
            for( int i = 0; i <= num_frames-2; )
            {
                cost2p1 = x264_slicetype_frame_cost( h, &a, frames, i+0, i+2, i+2, 1 );
                if( frames[i+2]->i_intra_mbs[2] > i_mb_count / 2 )
                {
                    frames[i+1]->i_type = X264_TYPE_P;
                    frames[i+2]->i_type = X264_TYPE_P;
                    i += 2;
                    continue;
                }

                cost1b1 = x264_slicetype_frame_cost( h, &a, frames, i+0, i+2, i+1, 0 );
                cost1p0 = x264_slicetype_frame_cost( h, &a, frames, i+0, i+1, i+1, 0 );
                cost2p0 = x264_slicetype_frame_cost( h, &a, frames, i+1, i+2, i+2, 0 );

                if( cost1p0 + cost2p0 < cost1b1 + cost2p1 )
                {
                    frames[i+1]->i_type = X264_TYPE_P;
                    i += 1;
                    continue;
                }

                // arbitrary and untuned
                #define INTER_THRESH 300
                #define P_SENS_BIAS (50 - h->param.i_bframe_bias)
                frames[i+1]->i_type = X264_TYPE_B;

                int j;
                for( j = i+2; j <= X264_MIN( i+h->param.i_bframe, num_frames-1 ); j++ )
                {
                    int pthresh = X264_MAX(INTER_THRESH - P_SENS_BIAS * (j-i-1), INTER_THRESH/10);
                    int pcost = x264_slicetype_frame_cost( h, &a, frames, i+0, j+1, j+1, 1 );
                    if( pcost > pthresh*i_mb_count || frames[j+1]->i_intra_mbs[j-i+1] > i_mb_count/3 )
                        break;
                    frames[j]->i_type = X264_TYPE_B;
                }
                frames[j]->i_type = X264_TYPE_P;
                i = j;
            }
            frames[num_frames]->i_type = X264_TYPE_P;
            num_bframes = 0;
            while( num_bframes < num_frames && frames[num_bframes+1]->i_type == X264_TYPE_B )
                num_bframes++;
        }
        else
        {
            num_bframes = X264_MIN(num_frames-1, h->param.i_bframe);
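            /* Fixed pattern: every (num_bframes+1)-th frame is a P, plus a
             * forced final P; e.g. i_bframe = 2 over 7 frames gives
             * B B P B B P P. */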
            for( int j = 1; j < num_frames; j++ )
                frames[j]->i_type = (j%(num_bframes+1)) ? X264_TYPE_B : X264_TYPE_P;
            frames[num_frames]->i_type = X264_TYPE_P;
        }

        /* Check scenecut on the first minigop. */
        for( int j = 1; j < num_bframes+1; j++ )
            if( h->param.i_scenecut_threshold && scenecut( h, &a, frames, j, j+1, 0, orig_num_frames, i_max_search ) )
            {
                frames[j]->i_type = X264_TYPE_P;
                num_analysed_frames = j;
                break;
            }

        reset_start = keyframe ? 1 : X264_MIN( num_bframes+2, num_analysed_frames+1 );
    }
    else
    {
        for( int j = 1; j <= num_frames; j++ )
            frames[j]->i_type = X264_TYPE_P;
        reset_start = !keyframe + 1;
        num_bframes = 0;
    }

    /* Perform the actual macroblock tree analysis.
     * Don't go farther than the maximum keyframe interval; this helps in short GOPs. */
    if( h->param.rc.b_mb_tree )
        x264_macroblock_tree( h, &a, frames, X264_MIN(num_frames, h->param.i_keyint_max), keyframe );

    /* Enforce keyframe limit. */
    if( !h->param.b_intra_refresh )
        for( int i = keyint_limit+1; i <= num_frames; i += h->param.i_keyint_max )
        {
            frames[i]->i_type = X264_TYPE_I;
            reset_start = X264_MIN( reset_start, i+1 );
            if( h->param.b_open_gop && h->param.b_bluray_compat )
                while( IS_X264_TYPE_B( frames[i-1]->i_type ) )
                    i--;
        }

    if( vbv_lookahead )
        x264_vbv_lookahead( h, &a, frames, num_frames, keyframe );

    /* Restore frametypes for all frames that haven't actually been decided yet. */
    for( int j = reset_start; j <= num_frames; j++ )
        frames[j]->i_type = X264_TYPE_AUTO;
}

void x264_slicetype_decide( x264_t *h )
{
    x264_frame_t *frames[X264_BFRAME_MAX+2];
    x264_frame_t *frm;
    int bframes;
    int brefs;

    if( !h->lookahead->next.i_size )
        return;

    int lookahead_size = h->lookahead->next.i_size;

    for( int i = 0; i < h->lookahead->next.i_size; i++ )
    {
        if( h->param.b_vfr_input )
        {
            if( lookahead_size-- > 1 )
                h->lookahead->next.list[i]->i_duration = 2 * (h->lookahead->next.list[i+1]->i_pts - h->lookahead->next.list[i]->i_pts);
            else
                h->lookahead->next.list[i]->i_duration = h->i_prev_duration;
        }
        else
            h->lookahead->next.list[i]->i_duration = delta_tfi_divisor[h->lookahead->next.list[i]->i_pic_struct];
        h->i_prev_duration = h->lookahead->next.list[i]->i_duration;
        h->lookahead->next.list[i]->f_duration = (double)h->lookahead->next.list[i]->i_duration
                                               * h->sps->vui.i_num_units_in_tick
                                               / h->sps->vui.i_time_scale;

        if( h->lookahead->next.list[i]->i_frame > h->i_disp_fields_last_frame && lookahead_size > 0 )
        {
            h->lookahead->next.list[i]->i_field_cnt = h->i_disp_fields;
            h->i_disp_fields += h->lookahead->next.list[i]->i_duration;
            h->i_disp_fields_last_frame = h->lookahead->next.list[i]->i_frame;
        }
        else if( lookahead_size == 0 )
        {
            h->lookahead->next.list[i]->i_field_cnt = h->i_disp_fields;
            h->lookahead->next.list[i]->i_duration = h->i_prev_duration;
        }
    }

    if( h->param.rc.b_stat_read )
    {
        /* Use the frame types from the first pass */
        for( int i = 0; i < h->lookahead->next.i_size; i++ )
            h->lookahead->next.list[i]->i_type =
                x264_ratecontrol_slice_type( h, h->lookahead->next.list[i]->i_frame );
    }
    else if( (h->param.i_bframe && h->param.i_bframe_adaptive)
             || h->param.i_scenecut_threshold
             || h->param.rc.b_mb_tree
             || (h->param.rc.i_vbv_buffer_size && h->param.rc.i_lookahead) )
        x264_slicetype_analyse( h, 0 );

    for( bframes = 0, brefs = 0;; bframes++ )
    {
        frm = h->lookahead->next.list[bframes];
        if( frm->i_type == X264_TYPE_BREF && h->param.i_bframe_pyramid < X264_B_PYRAMID_NORMAL &&
            brefs == h->param.i_bframe_pyramid )
        {
            frm->i_type = X264_TYPE_B;
            x264_log( h, X264_LOG_WARNING, "B-ref at frame %d incompatible with B-pyramid %s \n",
                      frm->i_frame, x264_b_pyramid_names[h->param.i_bframe_pyramid] );
        }
        /* pyramid with multiple B-refs needs a big enough dpb that the preceding P-frame stays available.
           smaller dpb could be supported by smart enough use of mmco, but it's easier just to forbid it. */
        else if( frm->i_type == X264_TYPE_BREF && h->param.i_bframe_pyramid == X264_B_PYRAMID_NORMAL &&
            brefs && h->param.i_frame_reference <= (brefs+3) )
        {
            frm->i_type = X264_TYPE_B;
            x264_log( h, X264_LOG_WARNING, "B-ref at frame %d incompatible with B-pyramid %s and %d reference frames\n",
                      frm->i_frame, x264_b_pyramid_names[h->param.i_bframe_pyramid], h->param.i_frame_reference );
        }

        if( frm->i_type == X264_TYPE_KEYFRAME )
            frm->i_type = h->param.b_open_gop ? X264_TYPE_I : X264_TYPE_IDR;

        /* Limit GOP size */
        if( (!h->param.b_intra_refresh || frm->i_frame == 0) && frm->i_frame - h->lookahead->i_last_keyframe >= h->param.i_keyint_max )
        {
            if( frm->i_type == X264_TYPE_AUTO || frm->i_type == X264_TYPE_I )
                frm->i_type = h->param.b_open_gop && h->lookahead->i_last_keyframe >= 0 ? X264_TYPE_I : X264_TYPE_IDR;
            int warn = frm->i_type != X264_TYPE_IDR;
            if( warn && h->param.b_open_gop )
                warn &= frm->i_type != X264_TYPE_I;
            if( warn )
                x264_log( h, X264_LOG_WARNING, "specified frame type (%d) at %d is not compatible with keyframe interval\n", frm->i_type, frm->i_frame );
        }
        if( frm->i_type == X264_TYPE_I && frm->i_frame - h->lookahead->i_last_keyframe >= h->param.i_keyint_min )
        {
            if( h->param.b_open_gop )
            {
                h->lookahead->i_last_keyframe = frm->i_frame; // Use display order
                if( h->param.b_bluray_compat )
                    h->lookahead->i_last_keyframe -= bframes; // Use bluray order
                frm->b_keyframe = 1;
            }
            else
                frm->i_type = X264_TYPE_IDR;
        }
        if( frm->i_type == X264_TYPE_IDR )
        {
            /* Close GOP */
            h->lookahead->i_last_keyframe = frm->i_frame;
            frm->b_keyframe = 1;
            if( bframes > 0 )
            {
                bframes--;
                h->lookahead->next.list[bframes]->i_type = X264_TYPE_P;
            }
        }

        if( bframes == h->param.i_bframe ||
            !h->lookahead->next.list[bframes+1] )
        {
            if( IS_X264_TYPE_B( frm->i_type ) )
                x264_log( h, X264_LOG_WARNING, "specified frame type is not compatible with max B-frames\n" );
            if( frm->i_type == X264_TYPE_AUTO
                || IS_X264_TYPE_B( frm->i_type ) )
                frm->i_type = X264_TYPE_P;
        }

        if( frm->i_type == X264_TYPE_BREF )
            brefs++;

        if( frm->i_type == X264_TYPE_AUTO )
            frm->i_type = X264_TYPE_B;

        else if( !IS_X264_TYPE_B( frm->i_type ) ) break;
    }

    if( bframes )
        h->lookahead->next.list[bframes-1]->b_last_minigop_bframe = 1;
    h->lookahead->next.list[bframes]->i_bframes = bframes;

    /* insert a bref into the sequence */
    if( h->param.i_bframe_pyramid && bframes > 1 && !brefs )
    {
        h->lookahead->next.list[bframes/2]->i_type = X264_TYPE_BREF;
        brefs++;
    }

    /* calculate the frame costs ahead of time for x264_rc_analyse_slice while we still have lowres */
    if( h->param.rc.i_rc_method != X264_RC_CQP )
    {
        x264_mb_analysis_t a;
        int p0, p1, b;
        p1 = b = bframes + 1;

        x264_lowres_context_init( h, &a );

        frames[0] = h->lookahead->last_nonb;
        memcpy( &frames[1], h->lookahead->next.list, (bframes+1) * sizeof(x264_frame_t*) );
        if( IS_X264_TYPE_I( h->lookahead->next.list[bframes]->i_type ) )
            p0 = bframes + 1;
        else // P
            p0 = 0;

        x264_slicetype_frame_cost( h, &a, frames, p0, p1, b, 0 );

        if( (p0 != p1 || bframes) && h->param.rc.i_vbv_buffer_size )
        {
            /* We need the intra costs for row SATDs. */
            x264_slicetype_frame_cost( h, &a, frames, b, b, b, 0 );

            /* We need B-frame costs for row SATDs. */
            p0 = 0;
            for( b = 1; b <= bframes; b++ )
            {
                if( frames[b]->i_type == X264_TYPE_B )
                    for( p1 = b; frames[p1]->i_type == X264_TYPE_B; )
                        p1++;
                else
                    p1 = bframes + 1;
                x264_slicetype_frame_cost( h, &a, frames, p0, p1, b, 0 );
                if( frames[b]->i_type == X264_TYPE_BREF )
                    p0 = b;
            }
        }
    }

    /* Analyse for weighted P frames */
    if( !h->param.rc.b_stat_read && h->lookahead->next.list[bframes]->i_type == X264_TYPE_P
        && h->param.analyse.i_weighted_pred >= X264_WEIGHTP_SIMPLE )
    {
        x264_emms();
        x264_weights_analyse( h, h->lookahead->next.list[bframes], h->lookahead->last_nonb, 0 );
    }

    /* shift sequence to coded order.
       use a small temporary list to avoid shifting the entire next buffer around */
    int i_coded = h->lookahead->next.list[0]->i_frame;
    if( bframes )
    {
        int idx_list[] = { brefs+1, 1 };
        for( int i = 0; i < bframes; i++ )
        {
            int idx = idx_list[h->lookahead->next.list[i]->i_type == X264_TYPE_BREF]++;
            frames[idx] = h->lookahead->next.list[i];
            frames[idx]->i_reordered_pts = h->lookahead->next.list[idx]->i_pts;
        }
        frames[0] = h->lookahead->next.list[bframes];
        frames[0]->i_reordered_pts = h->lookahead->next.list[0]->i_pts;
        memcpy( h->lookahead->next.list, frames, (bframes+1) * sizeof(x264_frame_t*) );
    }

    for( int i = 0; i <= bframes; i++ )
    {
        h->lookahead->next.list[i]->i_coded = i_coded++;
        if( i )
        {
            x264_calculate_durations( h, h->lookahead->next.list[i], h->lookahead->next.list[i-1], &h->i_cpb_delay, &h->i_coded_fields );
            h->lookahead->next.list[0]->f_planned_cpb_duration[i-1] = (double)h->lookahead->next.list[i-1]->i_cpb_duration *
                                                                      h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale;
        }
        else
            x264_calculate_durations( h, h->lookahead->next.list[i], NULL, &h->i_cpb_delay, &h->i_coded_fields );

        h->lookahead->next.list[0]->f_planned_cpb_duration[i] = (double)h->lookahead->next.list[i]->i_cpb_duration *
                                                                h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale;
    }
}

int x264_rc_analyse_slice( x264_t *h )
{
    int p0 = 0, p1, b;
    int cost;
    x264_emms();

    if( IS_X264_TYPE_I(h->fenc->i_type) )
        p1 = b = 0;
    else if( h->fenc->i_type == X264_TYPE_P )
        p1 = b = h->fenc->i_bframes + 1;
    else //B
    {
        p1 = (h->fref_nearest[1]->i_poc - h->fref_nearest[0]->i_poc)/2;
        b  = (h->fenc->i_poc - h->fref_nearest[0]->i_poc)/2;
    }
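    /* Hedged example: with fref0 at POC 0, fenc at POC 4 and fref1 at POC 8
     * (POC advancing by 2 per frame), p1 = 8/2 = 4 and b = 4/2 = 2, i.e.
     * distances in frames from the list-0 reference. */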
    /* We don't need to assign p0/p1 since we are not performing any real analysis here. */
    x264_frame_t **frames = &h->fenc - b;

    /* cost should have been already calculated by x264_slicetype_decide */
    cost = frames[b]->i_cost_est[b-p0][p1-b];
    assert( cost >= 0 );

    if( h->param.rc.b_mb_tree && !h->param.rc.b_stat_read )
    {
        cost = x264_slicetype_frame_cost_recalculate( h, frames, p0, p1, b );
        if( b && h->param.rc.i_vbv_buffer_size )
            x264_slicetype_frame_cost_recalculate( h, frames, b, b, b );
    }
    /* In AQ, use the weighted score instead. */
    else if( h->param.rc.i_aq_mode )
        cost = frames[b]->i_cost_est_aq[b-p0][p1-b];

    h->fenc->i_row_satd = h->fenc->i_row_satds[b-p0][p1-b];
    h->fdec->i_row_satd = h->fdec->i_row_satds[b-p0][p1-b];
    h->fdec->i_satd = cost;
    memcpy( h->fdec->i_row_satd, h->fenc->i_row_satd, h->mb.i_mb_height * sizeof(int) );
    if( !IS_X264_TYPE_I(h->fenc->i_type) )
        memcpy( h->fdec->i_row_satds[0][0], h->fenc->i_row_satds[0][0], h->mb.i_mb_height * sizeof(int) );

    if( h->param.b_intra_refresh && h->param.rc.i_vbv_buffer_size && h->fenc->i_type == X264_TYPE_P )
    {
        int ip_factor = 256 * h->param.rc.f_ip_factor; /* fix8 */
        for( int y = 0; y < h->mb.i_mb_height; y++ )
        {
            int mb_xy = y * h->mb.i_mb_stride + h->fdec->i_pir_start_col;
            for( int x = h->fdec->i_pir_start_col; x <= h->fdec->i_pir_end_col; x++, mb_xy++ )
            {
                int intra_cost = (h->fenc->i_intra_cost[mb_xy] * ip_factor + 128) >> 8;
                int inter_cost = h->fenc->lowres_costs[b-p0][p1-b][mb_xy] & LOWRES_COST_MASK;
                int diff = intra_cost - inter_cost;
                if( h->param.rc.i_aq_mode )
                    h->fdec->i_row_satd[y] += (diff * frames[b]->i_inv_qscale_factor[mb_xy] + 128) >> 8;
                else
                    h->fdec->i_row_satd[y] += diff;
                cost += diff;
            }
        }
    }

    if( BIT_DEPTH > 8 )
        for( int y = 0; y < h->mb.i_mb_height; y++ )
            h->fdec->i_row_satd[y] >>= (BIT_DEPTH - 8);

    return cost >> (BIT_DEPTH - 8);
}
x264-snapshot-20120103-2245-stable/encoder/set.h
/*****************************************************************************
 * set.h: header writing
 *****************************************************************************
 * Copyright (C) 2003-2011 x264 project
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#ifndef X264_ENCODER_SET_H
#define X264_ENCODER_SET_H

void x264_sps_init( x264_sps_t *sps, int i_id, x264_param_t *param );
void x264_sps_write( bs_t *s, x264_sps_t *sps );
void x264_pps_init( x264_pps_t *pps, int i_id, x264_param_t *param, x264_sps_t *sps );
void x264_pps_write( bs_t *s, x264_sps_t *sps, x264_pps_t *pps );
void x264_sei_recovery_point_write( x264_t *h, bs_t *s, int recovery_frame_cnt );
int  x264_sei_version_write( x264_t *h, bs_t *s );
int  x264_validate_levels( x264_t *h, int verbose );
void x264_sei_buffering_period_write( x264_t *h, bs_t *s );
void x264_sei_pic_timing_write( x264_t *h, bs_t *s );
void x264_sei_dec_ref_pic_marking_write( x264_t *h, bs_t *s );
void x264_sei_frame_packing_write( x264_t *h, bs_t *s );
void x264_sei_write( bs_t *s, uint8_t *payload, int payload_size, int payload_type );
void x264_filler_write( x264_t *h, bs_t *s, int filler );

#endif
x264-snapshot-20120103-2245-stable/encoder/set.c
/*****************************************************************************
 * set.c: header writing
 *****************************************************************************
 * Copyright (C) 2003-2011 x264 project
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "common/common.h"
#include "set.h"

#define bs_write_ue bs_write_ue_big

// Indexed by pic_struct values
static const uint8_t num_clock_ts[10] = { 0, 1, 1, 1, 2, 2, 3, 3, 2, 3 };

static void transpose( uint8_t *buf, int w )
{
    for( int i = 0; i < w; i++ )
        for( int j = 0; j < i; j++ )
            XCHG( uint8_t, buf[w*i+j], buf[w*j+i] );
}

static void scaling_list_write( bs_t *s, x264_pps_t *pps, int idx )
{
    const int len = idx<4 ? 16 : 64;
    const uint8_t *zigzag = idx<4 ? x264_zigzag_scan4[0] : x264_zigzag_scan8[0];
    const uint8_t *list = pps->scaling_list[idx];
    const uint8_t *def_list = (idx==CQM_4IC) ? pps->scaling_list[CQM_4IY]
                            : (idx==CQM_4PC) ? pps->scaling_list[CQM_4PY]
                            : (idx==CQM_8IC+4) ? pps->scaling_list[CQM_8IY+4]
                            : (idx==CQM_8PC+4) ? pps->scaling_list[CQM_8PY+4]
                            : x264_cqm_jvt[idx];
    if( !memcmp( list, def_list, len ) )
        bs_write1( s, 0 );   // scaling_list_present_flag
    else if( !memcmp( list, x264_cqm_jvt[idx], len ) )
    {
        bs_write1( s, 1 );   // scaling_list_present_flag
        bs_write_se( s, -8 ); // use jvt list
    }
    else
    {
        int run;
        bs_write1( s, 1 );   // scaling_list_present_flag

        // try run-length compression of trailing values
        for( run = len; run > 1; run-- )
            if( list[zigzag[run-1]] != list[zigzag[run-2]] )
                break;
        if( run < len && len - run < bs_size_se( (int8_t)-list[zigzag[run]] ) )
            run = len;
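        /* A hedged note on the termination trick: coding -list[zigzag[run]] as
         * the last delta drives the decoder's next scale value to zero, which
         * in the H.264 scaling_list syntax repeats the previous value for all
         * remaining coefficients.  The check above keeps the plain encoding
         * whenever the terminator would cost more bits than the run of zero
         * deltas it replaces. */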

        for( int j = 0; j < run; j++ )
            bs_write_se( s, (int8_t)(list[zigzag[j]] - (j>0 ? list[zigzag[j-1]] : 8)) ); // delta

        if( run < len )
            bs_write_se( s, (int8_t)-list[zigzag[run]] );
    }
}

void x264_sei_write( bs_t *s, uint8_t *payload, int payload_size, int payload_type )
{
    int i;

    bs_realign( s );

    for( i = 0; i <= payload_type-255; i += 255 )
        bs_write( s, 8, 255 );
    bs_write( s, 8, payload_type-i );

    for( i = 0; i <= payload_size-255; i += 255 )
        bs_write( s, 8, 255 );
    bs_write( s, 8, payload_size-i );
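    /* e.g. a payload_type of 300 is coded as an 0xFF escape byte followed by
       45 (300-255), matching the ff_byte escape in the SEI syntax. */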

    for( i = 0; i < payload_size; i++ )
        bs_write(s, 8, payload[i] );

    bs_rbsp_trailing( s );
    bs_flush( s );
}

void x264_sps_init( x264_sps_t *sps, int i_id, x264_param_t *param )
{
    int csp = param->i_csp & X264_CSP_MASK;

    sps->i_id = i_id;
    sps->i_mb_width = ( param->i_width + 15 ) / 16;
    sps->i_mb_height= ( param->i_height + 15 ) / 16;
    sps->i_chroma_format_idc = csp >= X264_CSP_I444 ? CHROMA_444 :
                               csp >= X264_CSP_I422 ? CHROMA_422 : CHROMA_420;

    sps->b_qpprime_y_zero_transform_bypass = param->rc.i_rc_method == X264_RC_CQP && param->rc.i_qp_constant == 0;
    if( sps->b_qpprime_y_zero_transform_bypass || sps->i_chroma_format_idc == CHROMA_444 )
        sps->i_profile_idc  = PROFILE_HIGH444_PREDICTIVE;
    else if( sps->i_chroma_format_idc == CHROMA_422 )
        sps->i_profile_idc  = PROFILE_HIGH422;
    else if( BIT_DEPTH > 8 )
        sps->i_profile_idc  = PROFILE_HIGH10;
    else if( param->analyse.b_transform_8x8 || param->i_cqm_preset != X264_CQM_FLAT )
        sps->i_profile_idc  = PROFILE_HIGH;
    else if( param->b_cabac || param->i_bframe > 0 || param->b_interlaced || param->b_fake_interlaced || param->analyse.i_weighted_pred > 0 )
        sps->i_profile_idc  = PROFILE_MAIN;
    else
        sps->i_profile_idc  = PROFILE_BASELINE;

    sps->b_constraint_set0  = sps->i_profile_idc == PROFILE_BASELINE;
    /* x264 doesn't support the features that are in Baseline and not in Main,
     * namely arbitrary_slice_order and slice_groups. */
    sps->b_constraint_set1  = sps->i_profile_idc <= PROFILE_MAIN;
    /* Never set constraint_set2; it is not necessary and not used in the real world. */
    sps->b_constraint_set2  = 0;
    sps->b_constraint_set3  = 0;

    sps->i_level_idc = param->i_level_idc;
    if( param->i_level_idc == 9 && ( sps->i_profile_idc >= PROFILE_BASELINE && sps->i_profile_idc <= PROFILE_EXTENDED ) )
    {
        sps->b_constraint_set3 = 1; /* level 1b with Baseline, Main or Extended profile is signalled via constraint_set3 */
        sps->i_level_idc      = 11;
    }
    /* Intra profiles */
    if( param->i_keyint_max == 1 && sps->i_profile_idc > PROFILE_HIGH )
        sps->b_constraint_set3 = 1;

    sps->vui.i_num_reorder_frames = param->i_bframe_pyramid ? 2 : param->i_bframe ? 1 : 0;
    /* extra slot with pyramid so that we don't have to override the
     * order of forgetting old pictures */
    sps->vui.i_max_dec_frame_buffering =
    sps->i_num_ref_frames = X264_MIN(X264_REF_MAX, X264_MAX4(param->i_frame_reference, 1 + sps->vui.i_num_reorder_frames,
                            param->i_bframe_pyramid ? 4 : 1, param->i_dpb_size));
    sps->i_num_ref_frames -= param->i_bframe_pyramid == X264_B_PYRAMID_STRICT;
    if( param->i_keyint_max == 1 )
    {
        sps->i_num_ref_frames = 0;
        sps->vui.i_max_dec_frame_buffering = 0;
    }

    /* number of refs + current frame */
    int max_frame_num = sps->vui.i_max_dec_frame_buffering * (!!param->i_bframe_pyramid+1) + 1;
    /* Intra refresh cannot write a recovery time greater than max frame num-1 */
    if( param->b_intra_refresh )
    {
        int time_to_recovery = X264_MIN( sps->i_mb_width - 1, param->i_keyint_max ) + param->i_bframe - 1;
        max_frame_num = X264_MAX( max_frame_num, time_to_recovery+1 );
    }

    sps->i_log2_max_frame_num = 4;
    while( (1 << sps->i_log2_max_frame_num) <= max_frame_num )
        sps->i_log2_max_frame_num++;

    sps->i_poc_type = param->i_bframe || param->b_interlaced ? 0 : 2;
    if( sps->i_poc_type == 0 )
    {
        int max_delta_poc = (param->i_bframe + 2) * (!!param->i_bframe_pyramid + 1) * 2;
        sps->i_log2_max_poc_lsb = 4;
        while( (1 << sps->i_log2_max_poc_lsb) <= max_delta_poc * 2 )
            sps->i_log2_max_poc_lsb++;
    }

    sps->b_vui = 1;

    sps->b_gaps_in_frame_num_value_allowed = 0;
    sps->b_frame_mbs_only = !(param->b_interlaced || param->b_fake_interlaced);
    if( !sps->b_frame_mbs_only )
        sps->i_mb_height = ( sps->i_mb_height + 1 ) & ~1;
    sps->b_mb_adaptive_frame_field = param->b_interlaced;
    sps->b_direct8x8_inference = 1;

    sps->crop.i_left   = param->crop_rect.i_left;
    sps->crop.i_top    = param->crop_rect.i_top;
    sps->crop.i_right  = param->crop_rect.i_right + sps->i_mb_width*16 - param->i_width;
    sps->crop.i_bottom = (param->crop_rect.i_bottom + sps->i_mb_height*16 - param->i_height) >> !sps->b_frame_mbs_only;
    sps->b_crop = sps->crop.i_left  || sps->crop.i_top ||
                  sps->crop.i_right || sps->crop.i_bottom;

    sps->vui.b_aspect_ratio_info_present = 0;
    if( param->vui.i_sar_width > 0 && param->vui.i_sar_height > 0 )
    {
        sps->vui.b_aspect_ratio_info_present = 1;
        sps->vui.i_sar_width = param->vui.i_sar_width;
        sps->vui.i_sar_height= param->vui.i_sar_height;
    }

    sps->vui.b_overscan_info_present = param->vui.i_overscan > 0 && param->vui.i_overscan <= 2;
    if( sps->vui.b_overscan_info_present )
        sps->vui.b_overscan_info = ( param->vui.i_overscan == 2 ? 1 : 0 );

    sps->vui.b_signal_type_present = 0;
    sps->vui.i_vidformat = ( param->vui.i_vidformat >= 0 && param->vui.i_vidformat <= 5 ? param->vui.i_vidformat : 5 );
    sps->vui.b_fullrange = ( param->vui.b_fullrange >= 0 && param->vui.b_fullrange <= 1 ? param->vui.b_fullrange :
                           ( csp >= X264_CSP_BGR ? 1 : 0 ) );
    sps->vui.b_color_description_present = 0;

    sps->vui.i_colorprim = ( param->vui.i_colorprim >= 0 && param->vui.i_colorprim <=  8 ? param->vui.i_colorprim : 2 );
    sps->vui.i_transfer  = ( param->vui.i_transfer  >= 0 && param->vui.i_transfer  <= 10 ? param->vui.i_transfer  : 2 );
    sps->vui.i_colmatrix = ( param->vui.i_colmatrix >= 0 && param->vui.i_colmatrix <=  8 ? param->vui.i_colmatrix :
                           ( csp >= X264_CSP_BGR ? 0 : 2 ) );
    if( sps->vui.i_colorprim != 2 ||
        sps->vui.i_transfer  != 2 ||
        sps->vui.i_colmatrix != 2 )
    {
        sps->vui.b_color_description_present = 1;
    }

    if( sps->vui.i_vidformat != 5 ||
        sps->vui.b_fullrange ||
        sps->vui.b_color_description_present )
    {
        sps->vui.b_signal_type_present = 1;
    }

    /* FIXME: not sufficient for interlaced video */
    sps->vui.b_chroma_loc_info_present = param->vui.i_chroma_loc > 0 && param->vui.i_chroma_loc <= 5;
    if( sps->vui.b_chroma_loc_info_present )
    {
        sps->vui.i_chroma_loc_top = param->vui.i_chroma_loc;
        sps->vui.i_chroma_loc_bottom = param->vui.i_chroma_loc;
    }

    sps->vui.b_timing_info_present = param->i_timebase_num > 0 && param->i_timebase_den > 0;

    if( sps->vui.b_timing_info_present )
    {
        sps->vui.i_num_units_in_tick = param->i_timebase_num;
        sps->vui.i_time_scale = param->i_timebase_den * 2;
        sps->vui.b_fixed_frame_rate = !param->b_vfr_input;
    }

    sps->vui.b_vcl_hrd_parameters_present = 0; // we don't support VCL HRD
    sps->vui.b_nal_hrd_parameters_present = !!param->i_nal_hrd;
    sps->vui.b_pic_struct_present = param->b_pic_struct;

    // NOTE: HRD-related parts of the SPS are initialised in x264_ratecontrol_init_reconfigurable

    sps->vui.b_bitstream_restriction = 1;
    if( sps->vui.b_bitstream_restriction )
    {
        sps->vui.b_motion_vectors_over_pic_boundaries = 1;
        sps->vui.i_max_bytes_per_pic_denom = 0;
        sps->vui.i_max_bits_per_mb_denom = 0;
        sps->vui.i_log2_max_mv_length_horizontal =
        sps->vui.i_log2_max_mv_length_vertical = (int)log2f( X264_MAX( 1, param->analyse.i_mv_range*4-1 ) ) + 1;
    }
}

void x264_sps_write( bs_t *s, x264_sps_t *sps )
{
    bs_realign( s );
    bs_write( s, 8, sps->i_profile_idc );
    bs_write1( s, sps->b_constraint_set0 );
    bs_write1( s, sps->b_constraint_set1 );
    bs_write1( s, sps->b_constraint_set2 );
    bs_write1( s, sps->b_constraint_set3 );

    bs_write( s, 4, 0 );    /* reserved */

    bs_write( s, 8, sps->i_level_idc );

    bs_write_ue( s, sps->i_id );

    if( sps->i_profile_idc >= PROFILE_HIGH )
    {
        bs_write_ue( s, sps->i_chroma_format_idc );
        if( sps->i_chroma_format_idc == CHROMA_444 )
            bs_write1( s, 0 ); // separate_colour_plane_flag
        bs_write_ue( s, BIT_DEPTH-8 ); // bit_depth_luma_minus8
        bs_write_ue( s, BIT_DEPTH-8 ); // bit_depth_chroma_minus8
        bs_write1( s, sps->b_qpprime_y_zero_transform_bypass );
        bs_write1( s, 0 ); // seq_scaling_matrix_present_flag
    }

    bs_write_ue( s, sps->i_log2_max_frame_num - 4 );
    bs_write_ue( s, sps->i_poc_type );
    if( sps->i_poc_type == 0 )
        bs_write_ue( s, sps->i_log2_max_poc_lsb - 4 );
    bs_write_ue( s, sps->i_num_ref_frames );
    bs_write1( s, sps->b_gaps_in_frame_num_value_allowed );
    bs_write_ue( s, sps->i_mb_width - 1 );
    bs_write_ue( s, (sps->i_mb_height >> !sps->b_frame_mbs_only) - 1);
    bs_write1( s, sps->b_frame_mbs_only );
    if( !sps->b_frame_mbs_only )
        bs_write1( s, sps->b_mb_adaptive_frame_field );
    bs_write1( s, sps->b_direct8x8_inference );

    bs_write1( s, sps->b_crop );
    if( sps->b_crop )
    {
        int h_shift = sps->i_chroma_format_idc == CHROMA_420 || sps->i_chroma_format_idc == CHROMA_422;
        int v_shift = sps->i_chroma_format_idc == CHROMA_420;
        bs_write_ue( s, sps->crop.i_left   >> h_shift );
        bs_write_ue( s, sps->crop.i_right  >> h_shift );
        bs_write_ue( s, sps->crop.i_top    >> v_shift );
        bs_write_ue( s, sps->crop.i_bottom >> v_shift );
    }

    bs_write1( s, sps->b_vui );
    if( sps->b_vui )
    {
        bs_write1( s, sps->vui.b_aspect_ratio_info_present );
        if( sps->vui.b_aspect_ratio_info_present )
        {
            int i;
            static const struct { uint8_t w, h, sar; } sar[] =
            {
                // aspect_ratio_idc = 0 -> unspecified
                {  1,  1, 1 }, { 12, 11, 2 }, { 10, 11, 3 }, { 16, 11, 4 },
                { 40, 33, 5 }, { 24, 11, 6 }, { 20, 11, 7 }, { 32, 11, 8 },
                { 80, 33, 9 }, { 18, 11, 10}, { 15, 11, 11}, { 64, 33, 12},
                {160, 99, 13}, {  4,  3, 14}, {  3,  2, 15}, {  2,  1, 16},
                // aspect_ratio_idc = [17..254] -> reserved
                { 0, 0, 255 }
            };
            for( i = 0; sar[i].sar != 255; i++ )
            {
                if( sar[i].w == sps->vui.i_sar_width &&
                    sar[i].h == sps->vui.i_sar_height )
                    break;
            }
            bs_write( s, 8, sar[i].sar );
            if( sar[i].sar == 255 ) /* aspect_ratio_idc (extended) */
            {
                bs_write( s, 16, sps->vui.i_sar_width );
                bs_write( s, 16, sps->vui.i_sar_height );
            }
        }

        bs_write1( s, sps->vui.b_overscan_info_present );
        if( sps->vui.b_overscan_info_present )
            bs_write1( s, sps->vui.b_overscan_info );

        bs_write1( s, sps->vui.b_signal_type_present );
        if( sps->vui.b_signal_type_present )
        {
            bs_write( s, 3, sps->vui.i_vidformat );
            bs_write1( s, sps->vui.b_fullrange );
            bs_write1( s, sps->vui.b_color_description_present );
            if( sps->vui.b_color_description_present )
            {
                bs_write( s, 8, sps->vui.i_colorprim );
                bs_write( s, 8, sps->vui.i_transfer );
                bs_write( s, 8, sps->vui.i_colmatrix );
            }
        }

        bs_write1( s, sps->vui.b_chroma_loc_info_present );
        if( sps->vui.b_chroma_loc_info_present )
        {
            bs_write_ue( s, sps->vui.i_chroma_loc_top );
            bs_write_ue( s, sps->vui.i_chroma_loc_bottom );
        }

        bs_write1( s, sps->vui.b_timing_info_present );
        if( sps->vui.b_timing_info_present )
        {
            bs_write32( s, sps->vui.i_num_units_in_tick );
            bs_write32( s, sps->vui.i_time_scale );
            bs_write1( s, sps->vui.b_fixed_frame_rate );
        }

        bs_write1( s, sps->vui.b_nal_hrd_parameters_present );
        if( sps->vui.b_nal_hrd_parameters_present )
        {
            bs_write_ue( s, sps->vui.hrd.i_cpb_cnt - 1 );
            bs_write( s, 4, sps->vui.hrd.i_bit_rate_scale );
            bs_write( s, 4, sps->vui.hrd.i_cpb_size_scale );

            bs_write_ue( s, sps->vui.hrd.i_bit_rate_value - 1 );
            bs_write_ue( s, sps->vui.hrd.i_cpb_size_value - 1 );

            bs_write1( s, sps->vui.hrd.b_cbr_hrd );

            bs_write( s, 5, sps->vui.hrd.i_initial_cpb_removal_delay_length - 1 );
            bs_write( s, 5, sps->vui.hrd.i_cpb_removal_delay_length - 1 );
            bs_write( s, 5, sps->vui.hrd.i_dpb_output_delay_length - 1 );
            bs_write( s, 5, sps->vui.hrd.i_time_offset_length );
        }

        bs_write1( s, sps->vui.b_vcl_hrd_parameters_present );

        if( sps->vui.b_nal_hrd_parameters_present || sps->vui.b_vcl_hrd_parameters_present )
            bs_write1( s, 0 );   /* low_delay_hrd_flag */

        bs_write1( s, sps->vui.b_pic_struct_present );
        bs_write1( s, sps->vui.b_bitstream_restriction );
        if( sps->vui.b_bitstream_restriction )
        {
            bs_write1( s, sps->vui.b_motion_vectors_over_pic_boundaries );
            bs_write_ue( s, sps->vui.i_max_bytes_per_pic_denom );
            bs_write_ue( s, sps->vui.i_max_bits_per_mb_denom );
            bs_write_ue( s, sps->vui.i_log2_max_mv_length_horizontal );
            bs_write_ue( s, sps->vui.i_log2_max_mv_length_vertical );
            bs_write_ue( s, sps->vui.i_num_reorder_frames );
            bs_write_ue( s, sps->vui.i_max_dec_frame_buffering );
        }
    }

    bs_rbsp_trailing( s );
    bs_flush( s );
}
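
/* Illustrative usage sketch, not part of the original source: headers are
 * written through the same bs_t interface used throughout this file. */
#if 0
static int write_sps_example( x264_sps_t *sps, uint8_t *buf, int buf_size )
{
    bs_t s;
    bs_init( &s, buf, buf_size );
    x264_sps_write( &s, sps );
    return bs_pos( &s ) / 8; /* bytes written; x264_sps_write flushes the stream */
}
#endif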

void x264_pps_init( x264_pps_t *pps, int i_id, x264_param_t *param, x264_sps_t *sps )
{
    pps->i_id = i_id;
    pps->i_sps_id = sps->i_id;
    pps->b_cabac = param->b_cabac;

    pps->b_pic_order = param->b_interlaced;
    pps->i_num_slice_groups = 1;

    pps->i_num_ref_idx_l0_default_active = param->i_frame_reference;
    pps->i_num_ref_idx_l1_default_active = 1;

    pps->b_weighted_pred = param->analyse.i_weighted_pred > 0;
    pps->b_weighted_bipred = param->analyse.b_weighted_bipred ? 2 : 0;

    pps->i_pic_init_qp = param->rc.i_rc_method == X264_RC_ABR ? 26 + QP_BD_OFFSET : SPEC_QP( param->rc.i_qp_constant );
    pps->i_pic_init_qs = 26 + QP_BD_OFFSET;

    pps->i_chroma_qp_index_offset = param->analyse.i_chroma_qp_offset;
    pps->b_deblocking_filter_control = 1;
    pps->b_constrained_intra_pred = param->b_constrained_intra;
    pps->b_redundant_pic_cnt = 0;

    pps->b_transform_8x8_mode = param->analyse.b_transform_8x8 ? 1 : 0;

    pps->i_cqm_preset = param->i_cqm_preset;

    switch( pps->i_cqm_preset )
    {
    case X264_CQM_FLAT:
        for( int i = 0; i < 8; i++ )
            pps->scaling_list[i] = x264_cqm_flat16;
        break;
    case X264_CQM_JVT:
        for( int i = 0; i < 8; i++ )
            pps->scaling_list[i] = x264_cqm_jvt[i];
        break;
    case X264_CQM_CUSTOM:
        /* match the transposed DCT & zigzag */
        transpose( param->cqm_4iy, 4 );
        transpose( param->cqm_4py, 4 );
        transpose( param->cqm_4ic, 4 );
        transpose( param->cqm_4pc, 4 );
        transpose( param->cqm_8iy, 8 );
        transpose( param->cqm_8py, 8 );
        transpose( param->cqm_8ic, 8 );
        transpose( param->cqm_8pc, 8 );
        pps->scaling_list[CQM_4IY] = param->cqm_4iy;
        pps->scaling_list[CQM_4PY] = param->cqm_4py;
        pps->scaling_list[CQM_4IC] = param->cqm_4ic;
        pps->scaling_list[CQM_4PC] = param->cqm_4pc;
        pps->scaling_list[CQM_8IY+4] = param->cqm_8iy;
        pps->scaling_list[CQM_8PY+4] = param->cqm_8py;
        pps->scaling_list[CQM_8IC+4] = param->cqm_8ic;
        pps->scaling_list[CQM_8PC+4] = param->cqm_8pc;
        for( int i = 0; i < 8; i++ )
            for( int j = 0; j < (i < 4 ? 16 : 64); j++ )
                if( pps->scaling_list[i][j] == 0 )
                    pps->scaling_list[i] = x264_cqm_jvt[i];
        break;
    }
}

void x264_pps_write( bs_t *s, x264_sps_t *sps, x264_pps_t *pps )
{
    bs_realign( s );
    bs_write_ue( s, pps->i_id );
    bs_write_ue( s, pps->i_sps_id );

    bs_write1( s, pps->b_cabac );
    bs_write1( s, pps->b_pic_order );
    bs_write_ue( s, pps->i_num_slice_groups - 1 );

    bs_write_ue( s, pps->i_num_ref_idx_l0_default_active - 1 );
    bs_write_ue( s, pps->i_num_ref_idx_l1_default_active - 1 );
    bs_write1( s, pps->b_weighted_pred );
    bs_write( s, 2, pps->b_weighted_bipred );

    bs_write_se( s, pps->i_pic_init_qp - 26 - QP_BD_OFFSET );
    bs_write_se( s, pps->i_pic_init_qs - 26 - QP_BD_OFFSET );
    bs_write_se( s, pps->i_chroma_qp_index_offset );

    bs_write1( s, pps->b_deblocking_filter_control );
    bs_write1( s, pps->b_constrained_intra_pred );
    bs_write1( s, pps->b_redundant_pic_cnt );

    if( pps->b_transform_8x8_mode || pps->i_cqm_preset != X264_CQM_FLAT )
    {
        bs_write1( s, pps->b_transform_8x8_mode );
        bs_write1( s, (pps->i_cqm_preset != X264_CQM_FLAT) );
        if( pps->i_cqm_preset != X264_CQM_FLAT )
        {
            scaling_list_write( s, pps, CQM_4IY );
            scaling_list_write( s, pps, CQM_4IC );
            bs_write1( s, 0 ); // Cr = Cb
            scaling_list_write( s, pps, CQM_4PY );
            scaling_list_write( s, pps, CQM_4PC );
            bs_write1( s, 0 ); // Cr = Cb
            if( pps->b_transform_8x8_mode )
            {
                if( sps->i_chroma_format_idc == CHROMA_444 )
                {
                    scaling_list_write( s, pps, CQM_8IY+4 );
                    scaling_list_write( s, pps, CQM_8IC+4 );
                    bs_write1( s, 0 ); // Cr = Cb
                    scaling_list_write( s, pps, CQM_8PY+4 );
                    scaling_list_write( s, pps, CQM_8PC+4 );
                    bs_write1( s, 0 ); // Cr = Cb
                }
                else
                {
                    scaling_list_write( s, pps, CQM_8IY+4 );
                    scaling_list_write( s, pps, CQM_8PY+4 );
                }
            }
        }
        bs_write_se( s, pps->i_chroma_qp_index_offset );
    }

    bs_rbsp_trailing( s );
    bs_flush( s );
}

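/* The SEI writers below share one pattern: serialize the payload into a
 * temporary bitstream, byte-align it with a stop bit (bs_align_10), then
 * hand the raw bytes to x264_sei_write(), which adds the SEI payload type
 * and size headers. */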
void x264_sei_recovery_point_write( x264_t *h, bs_t *s, int recovery_frame_cnt )
{
    bs_t q;
    uint8_t tmp_buf[100];
    bs_init( &q, tmp_buf, 100 );

    bs_realign( &q );

    bs_write_ue( &q, recovery_frame_cnt ); // recovery_frame_cnt
    bs_write1( &q, 1 );   //exact_match_flag 1
    bs_write1( &q, 0 );   //broken_link_flag 0
    bs_write( &q, 2, 0 ); //changing_slice_group 0

    bs_align_10( &q );
    bs_flush( &q );

    x264_sei_write( s, tmp_buf, bs_pos( &q ) / 8, SEI_RECOVERY_POINT );
}

int x264_sei_version_write( x264_t *h, bs_t *s )
{
    // random ID number generated according to ISO-11578
    static const uint8_t uuid[16] =
    {
        0xdc, 0x45, 0xe9, 0xbd, 0xe6, 0xd9, 0x48, 0xb7,
        0x96, 0x2c, 0xd8, 0x20, 0xd9, 0x23, 0xee, 0xef
    };
    char *opts = x264_param2string( &h->param, 0 );
    char *payload;
    int length;

    if( !opts )
        return -1;
    CHECKED_MALLOC( payload, 200 + strlen( opts ) );

    memcpy( payload, uuid, 16 );
    sprintf( payload+16, "x264 - core %d%s - H.264/MPEG-4 AVC codec - "
             "Copy%s 2003-2011 - http://www.videolan.org/x264.html - options: %s",
             X264_BUILD, X264_VERSION, HAVE_GPL?"left":"right", opts );
    length = strlen(payload)+1;

    x264_sei_write( s, (uint8_t *)payload, length, SEI_USER_DATA_UNREGISTERED );

    x264_free( opts );
    x264_free( payload );
    return 0;
fail:
    x264_free( opts );
    return -1;
}

void x264_sei_buffering_period_write( x264_t *h, bs_t *s )
{
    x264_sps_t *sps = h->sps;
    bs_t q;
    uint8_t tmp_buf[100];
    bs_init( &q, tmp_buf, 100 );

    bs_realign( &q );
    bs_write_ue( &q, sps->i_id );

    if( sps->vui.b_nal_hrd_parameters_present )
    {
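        /* Per the H.264 HRD spec, both delays are in units of a 90 kHz clock;
         * the field width was sized accordingly at ratecontrol init time. */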
        bs_write( &q, sps->vui.hrd.i_initial_cpb_removal_delay_length, h->initial_cpb_removal_delay );
        bs_write( &q, sps->vui.hrd.i_initial_cpb_removal_delay_length, h->initial_cpb_removal_delay_offset );
    }

    bs_align_10( &q );
    bs_flush( &q );

    x264_sei_write( s, tmp_buf, bs_pos( &q ) / 8, SEI_BUFFERING_PERIOD );
}

void x264_sei_pic_timing_write( x264_t *h, bs_t *s )
{
    x264_sps_t *sps = h->sps;
    bs_t q;
    uint8_t tmp_buf[100];
    bs_init( &q, tmp_buf, 100 );

    bs_realign( &q );

    if( sps->vui.b_nal_hrd_parameters_present || sps->vui.b_vcl_hrd_parameters_present )
    {
        bs_write( &q, sps->vui.hrd.i_cpb_removal_delay_length, h->fenc->i_cpb_delay - h->i_cpb_delay_pir_offset );
        bs_write( &q, sps->vui.hrd.i_dpb_output_delay_length, h->fenc->i_dpb_output_delay );
    }

    if( sps->vui.b_pic_struct_present )
    {
        bs_write( &q, 4, h->fenc->i_pic_struct-1 ); // We use index 0 for "Auto"

        // These clock timestamps are not standardised, so we don't set them.
        // They could be the time of origin, capture or alternative ideal display.
        for( int i = 0; i < num_clock_ts[h->fenc->i_pic_struct]; i++ )
            bs_write1( &q, 0 ); // clock_timestamp_flag
    }

    bs_align_10( &q );
    bs_flush( &q );

    x264_sei_write( s, tmp_buf, bs_pos( &q ) / 8, SEI_PIC_TIMING );
}

void x264_sei_frame_packing_write( x264_t *h, bs_t *s )
{
    int quincunx_sampling_flag = h->param.i_frame_packing == 0;
    bs_t q;
    uint8_t tmp_buf[100];
    bs_init( &q, tmp_buf, 100 );

    bs_realign( &q );

    bs_write_ue( &q, 0 );                         // frame_packing_arrangement_id
    bs_write1( &q, 0 );                           // frame_packing_arrangement_cancel_flag
    bs_write ( &q, 7, h->param.i_frame_packing ); // frame_packing_arrangement_type
    bs_write1( &q, quincunx_sampling_flag );      // quincunx_sampling_flag

    // 0: views are unrelated, 1: left view is on the left, 2: left view is on the right
    bs_write ( &q, 6, 1 );                        // content_interpretation_type

    bs_write1( &q, 0 );                           // spatial_flipping_flag
    bs_write1( &q, 0 );                           // frame0_flipped_flag
    bs_write1( &q, 0 );                           // field_views_flag
    bs_write1( &q, h->param.i_frame_packing == 5 && !(h->fenc->i_frame&1) ); // current_frame_is_frame0_flag
    bs_write1( &q, 0 );                           // frame0_self_contained_flag
    bs_write1( &q, 0 );                           // frame1_self_contained_flag
    if ( quincunx_sampling_flag == 0 && h->param.i_frame_packing != 5 )
    {
        bs_write( &q, 4, 0 );                     // frame0_grid_position_x
        bs_write( &q, 4, 0 );                     // frame0_grid_position_y
        bs_write( &q, 4, 0 );                     // frame1_grid_position_x
        bs_write( &q, 4, 0 );                     // frame1_grid_position_y
    }
    bs_write( &q, 8, 0 );                         // frame_packing_arrangement_reserved_byte
    bs_write_ue( &q, 1 );                         // frame_packing_arrangement_repetition_period
    bs_write1( &q, 0 );                           // frame_packing_arrangement_extension_flag

    bs_align_10( &q );
    bs_flush( &q );

    x264_sei_write( s, tmp_buf, bs_pos( &q ) / 8, SEI_FRAME_PACKING );
}

void x264_filler_write( x264_t *h, bs_t *s, int filler )
{
    bs_realign( s );

    for( int i = 0; i < filler; i++ )
        bs_write( s, 8, 0xff );

    bs_rbsp_trailing( s );
    bs_flush( s );
}

void x264_sei_dec_ref_pic_marking_write( x264_t *h, bs_t *s )
{
    x264_slice_header_t *sh = &h->sh_backup;
    bs_t q;
    uint8_t tmp_buf[100];
    bs_init( &q, tmp_buf, 100 );

    bs_realign( &q );

    /* We currently only use this for repeating B-refs, as required by Blu-ray. */
    bs_write1( &q, 0 );                 //original_idr_flag
    bs_write_ue( &q, sh->i_frame_num ); //original_frame_num
    if( !h->sps->b_frame_mbs_only )
        bs_write1( &q, 0 );             //original_field_pic_flag

    bs_write1( &q, sh->i_mmco_command_count > 0 );
    if( sh->i_mmco_command_count > 0 )
    {
        for( int i = 0; i < sh->i_mmco_command_count; i++ )
        {
            bs_write_ue( &q, 1 );
            bs_write_ue( &q, sh->mmco[i].i_difference_of_pic_nums - 1 );
        }
        bs_write_ue( &q, 0 );
    }

    bs_align_10( &q );
    bs_flush( &q );

    x264_sei_write( s, tmp_buf, bs_pos( &q ) / 8, SEI_DEC_REF_PIC_MARKING );
}

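/* Level limits per H.264 Annex A. Column order follows x264_level_t as
 * declared in x264.h (assumed unchanged here): level_idc, mbps (max MB/s),
 * frame_size (MBs), dpb (bytes), bitrate (kbit/s), cpb (kbit), mv_range,
 * mvs_per_2mb, slice_rate, mincr, bipred8x8, direct8x8, frame_only. */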
const x264_level_t x264_levels[] =
{
    { 10,   1485,    99,   152064,     64,    175,  64, 64,  0, 2, 0, 0, 1 },
    {  9,   1485,    99,   152064,    128,    350,  64, 64,  0, 2, 0, 0, 1 }, /* "1b" */
    { 11,   3000,   396,   345600,    192,    500, 128, 64,  0, 2, 0, 0, 1 },
    { 12,   6000,   396,   912384,    384,   1000, 128, 64,  0, 2, 0, 0, 1 },
    { 13,  11880,   396,   912384,    768,   2000, 128, 64,  0, 2, 0, 0, 1 },
    { 20,  11880,   396,   912384,   2000,   2000, 128, 64,  0, 2, 0, 0, 1 },
    { 21,  19800,   792,  1824768,   4000,   4000, 256, 64,  0, 2, 0, 0, 0 },
    { 22,  20250,  1620,  3110400,   4000,   4000, 256, 64,  0, 2, 0, 0, 0 },
    { 30,  40500,  1620,  3110400,  10000,  10000, 256, 32, 22, 2, 0, 1, 0 },
    { 31, 108000,  3600,  6912000,  14000,  14000, 512, 16, 60, 4, 1, 1, 0 },
    { 32, 216000,  5120,  7864320,  20000,  20000, 512, 16, 60, 4, 1, 1, 0 },
    { 40, 245760,  8192, 12582912,  20000,  25000, 512, 16, 60, 4, 1, 1, 0 },
    { 41, 245760,  8192, 12582912,  50000,  62500, 512, 16, 24, 2, 1, 1, 0 },
    { 42, 522240,  8704, 13369344,  50000,  62500, 512, 16, 24, 2, 1, 1, 1 },
    { 50, 589824, 22080, 42393600, 135000, 135000, 512, 16, 24, 2, 1, 1, 1 },
    { 51, 983040, 36864, 70778880, 240000, 240000, 512, 16, 24, 2, 1, 1, 1 },
    { 0 }
};

#define ERROR(...)\
{\
    if( verbose )\
        x264_log( h, X264_LOG_WARNING, __VA_ARGS__ );\
    ret = 1;\
}

int x264_validate_levels( x264_t *h, int verbose )
{
    int ret = 0;
    int mbs = h->sps->i_mb_width * h->sps->i_mb_height;
    int dpb = mbs * 384 * h->sps->vui.i_max_dec_frame_buffering;
    int cbp_factor = h->sps->i_profile_idc>=PROFILE_HIGH422 ? 16 :
                     h->sps->i_profile_idc==PROFILE_HIGH10 ? 12 :
                     h->sps->i_profile_idc==PROFILE_HIGH ? 5 : 4;

    const x264_level_t *l = x264_levels;
    while( l->level_idc != 0 && l->level_idc != h->param.i_level_idc )
        l++;

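    /* The frame must fit in the level's MB budget; the *8 terms additionally
     * bound each dimension, since a level also limits MB width and height to
     * sqrt( 8 * frame_size ). */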
    if( l->frame_size < mbs
        || l->frame_size*8 < h->sps->i_mb_width * h->sps->i_mb_width
        || l->frame_size*8 < h->sps->i_mb_height * h->sps->i_mb_height )
        ERROR( "frame MB size (%dx%d) > level limit (%d)\n",
               h->sps->i_mb_width, h->sps->i_mb_height, l->frame_size );
    if( dpb > l->dpb )
        ERROR( "DPB size (%d frames, %d bytes) > level limit (%d frames, %d bytes)\n",
                h->sps->vui.i_max_dec_frame_buffering, dpb, (int)(l->dpb / (384*mbs)), l->dpb );

#define CHECK( name, limit, val ) \
    if( (val) > (limit) ) \
        ERROR( name " (%d) > level limit (%d)\n", (int)(val), (limit) );

    CHECK( "VBV bitrate", (l->bitrate * cbp_factor) / 4, h->param.rc.i_vbv_max_bitrate );
    CHECK( "VBV buffer", (l->cpb * cbp_factor) / 4, h->param.rc.i_vbv_buffer_size );
    CHECK( "MV range", l->mv_range, h->param.analyse.i_mv_range );
    CHECK( "interlaced", !l->frame_only, h->param.b_interlaced );
    CHECK( "fake interlaced", !l->frame_only, h->param.b_fake_interlaced );

    if( h->param.i_fps_den > 0 )
        CHECK( "MB rate", l->mbps, (int64_t)mbs * h->param.i_fps_num / h->param.i_fps_den );

    /* TODO check the rest of the limits */
    return ret;
}
File: x264-snapshot-20120103-2245-stable/encoder/ratecontrol.h
/*****************************************************************************
 * ratecontrol.h: ratecontrol
 *****************************************************************************
 * Copyright (C) 2003-2011 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *          Laurent Aimar <fenrir@via.ecp.fr>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#ifndef X264_RATECONTROL_H
#define X264_RATECONTROL_H

/* Completely arbitrary.  Ratecontrol lowers relative quality at higher framerates
 * and the reverse at lower framerates; this serves as the center of the curve.
 * Halve all the values for frame-packed 3D to compensate for the "doubled"
 * framerate. */
#define BASE_FRAME_DURATION (0.04f / ((h->param.i_frame_packing == 5)+1))

/* Arbitrary limitations as a sanity check. */
#define MAX_FRAME_DURATION (1.00f / ((h->param.i_frame_packing == 5)+1))
#define MIN_FRAME_DURATION (0.01f / ((h->param.i_frame_packing == 5)+1))

#define CLIP_DURATION(f) x264_clip3f(f,MIN_FRAME_DURATION,MAX_FRAME_DURATION)

int  x264_ratecontrol_new   ( x264_t * );
void x264_ratecontrol_delete( x264_t * );

void x264_ratecontrol_init_reconfigurable( x264_t *h, int b_init );

void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame, float *quant_offsets );
int  x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame, float *quant_offsets );
int  x264_reference_build_list_optimal( x264_t *h );
void x264_thread_sync_ratecontrol( x264_t *cur, x264_t *prev, x264_t *next );
void x264_ratecontrol_start( x264_t *, int i_force_qp, int overhead );
int  x264_ratecontrol_slice_type( x264_t *, int i_frame );
void x264_ratecontrol_set_weights( x264_t *h, x264_frame_t *frm );
void x264_ratecontrol_mb( x264_t *, int bits );
int  x264_ratecontrol_qp( x264_t * );
int  x264_ratecontrol_mb_qp( x264_t *h );
int  x264_ratecontrol_end( x264_t *, int bits, int *filler );
void x264_ratecontrol_summary( x264_t * );
void x264_ratecontrol_set_estimated_size( x264_t *, int bits );
int  x264_ratecontrol_get_estimated_size( x264_t const *);
int  x264_rc_analyse_slice( x264_t *h );
int x264_weighted_reference_duplicate( x264_t *h, int i_ref, const x264_weight_t *w );
void x264_threads_distribute_ratecontrol( x264_t *h );
void x264_threads_merge_ratecontrol( x264_t *h );
void x264_hrd_fullness( x264_t *h );
#endif

File: x264-snapshot-20120103-2245-stable/encoder/ratecontrol.c
/*****************************************************************************
 * ratecontrol.c: ratecontrol
 *****************************************************************************
 * Copyright (C) 2005-2011 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *          Michael Niedermayer <michaelni@gmx.at>
 *          Gabriel Bouvigne <gabriel.bouvigne@joost.com>
 *          Jason Garrett-Glaser <darkshikari@gmail.com>
 *          Måns Rullgård <mru@mru.ath.cx>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#define _ISOC99_SOURCE
#undef NDEBUG // always check asserts; the speed cost is far too small to justify disabling them

#include "common/common.h"
#include "ratecontrol.h"
#include "me.h"

typedef struct
{
    int pict_type;
    int frame_type;
    int kept_as_ref;
    double qscale;
    int mv_bits;
    int tex_bits;
    int misc_bits;
    uint64_t expected_bits; /*total expected bits up to the current frame (current one excluded)*/
    double expected_vbv;
    double new_qscale;
    int new_qp;
    int i_count;
    int p_count;
    int s_count;
    float blurred_complexity;
    char direct_mode;
    int16_t weight[3][2];
    int16_t i_weight_denom[2];
    int refcount[16];
    int refs;
    int64_t i_duration;
    int64_t i_cpb_duration;
} ratecontrol_entry_t;

typedef struct
{
    float coeff;
    float count;
    float decay;
    float offset;
} predictor_t;

struct x264_ratecontrol_t
{
    /* constants */
    int b_abr;
    int b_2pass;
    int b_vbv;
    int b_vbv_min_rate;
    double fps;
    double bitrate;
    double rate_tolerance;
    double qcompress;
    int nmb;                    /* number of macroblocks in a frame */
    int qp_constant[3];

    /* current frame */
    ratecontrol_entry_t *rce;
    int qp;                     /* qp for current frame */
    float qpm;                  /* qp for current macroblock: precise float for AQ */
    float qpa_rc;               /* average of macroblocks' qp before aq */
    int   qpa_aq;               /* average of macroblocks' qp after aq */
    float qp_novbv;             /* QP for the current frame if 1-pass VBV was disabled. */

    /* VBV stuff */
    double buffer_size;
    int64_t buffer_fill_final;
    double buffer_fill;         /* planned buffer, if all in-progress frames hit their bit budget */
    double buffer_rate;         /* # of bits added to buffer_fill after each frame */
    double vbv_max_rate;        /* # of bits added to buffer_fill per second */
    predictor_t *pred;          /* predict frame size from satd */
    int single_frame_vbv;
    double rate_factor_max_increment; /* Don't allow RF above (CRF + this value). */

    /* ABR stuff */
    int    last_satd;
    double last_rceq;
    double cplxr_sum;           /* sum of bits*qscale/rceq */
    double expected_bits_sum;   /* sum of qscale2bits after rceq, ratefactor, and overflow, only includes finished frames */
    int64_t filler_bits_sum;    /* sum in bits of finished frames' filler data */
    double wanted_bits_window;  /* target bitrate * window */
    double cbr_decay;
    double short_term_cplxsum;
    double short_term_cplxcount;
    double rate_factor_constant;
    double ip_offset;
    double pb_offset;

    /* 2pass stuff */
    FILE *p_stat_file_out;
    char *psz_stat_file_tmpname;
    FILE *p_mbtree_stat_file_out;
    char *psz_mbtree_stat_file_tmpname;
    char *psz_mbtree_stat_file_name;
    FILE *p_mbtree_stat_file_in;

    int num_entries;            /* number of ratecontrol_entry_ts */
    ratecontrol_entry_t *entry; /* FIXME: copy needed data and free this once init is done */
    double last_qscale;
    double last_qscale_for[3];  /* last qscale for a specific pict type, used for max_diff & ipb factor stuff */
    int last_non_b_pict_type;
    double accum_p_qp;          /* for determining I-frame quant */
    double accum_p_norm;
    double last_accum_p_norm;
    double lmin[3];             /* min qscale by frame type */
    double lmax[3];
    double lstep;               /* max change (multiply) in qscale per frame */
    uint16_t *qp_buffer[2];     /* Global buffers for converting MB-tree quantizer data. */
    int qpbuf_pos;              /* In order to handle pyramid reordering, QP buffer acts as a stack.
                                 * This value is the current position (0 or 1). */

    /* MBRC stuff */
    float frame_size_estimated; /* Access to this variable must be atomic: double is
                                 * not atomic on all arches we care about */
    double frame_size_maximum;  /* Maximum frame size due to MinCR */
    double frame_size_planned;
    double slice_size_planned;
    predictor_t (*row_pred)[2];
    predictor_t row_preds[3][2];
    predictor_t *pred_b_from_p; /* predict B-frame size from P-frame satd */
    int bframes;                /* # consecutive B-frames before this P-frame */
    int bframe_bits;            /* total cost of those frames */

    int i_zones;
    x264_zone_t *zones;
    x264_zone_t *prev_zone;

    /* hrd stuff */
    int initial_cpb_removal_delay;
    int initial_cpb_removal_delay_offset;
    double nrt_first_access_unit; /* nominal removal time */
    double previous_cpb_final_arrival_time;
    uint64_t hrd_multiply_denom;
};


static int parse_zones( x264_t *h );
static int init_pass2(x264_t *);
static float rate_estimate_qscale( x264_t *h );
static int update_vbv( x264_t *h, int bits );
static void update_vbv_plan( x264_t *h, int overhead );
static float predict_size( predictor_t *p, float q, float var );
static void update_predictor( predictor_t *p, float q, float var, float bits );

#define CMP_OPT_FIRST_PASS( opt, param_val )\
{\
    if( ( p = strstr( opts, opt "=" ) ) && sscanf( p, opt "=%d" , &i ) && param_val != i )\
    {\
        x264_log( h, X264_LOG_ERROR, "different " opt " setting than first pass (%d vs %d)\n", param_val, i );\
        return -1;\
    }\
}

/* Terminology:
 * qp = h.264's quantizer
 * qscale = linearized quantizer = Lagrange multiplier
 */
static inline float qp2qscale( float qp )
{
    return 0.85f * powf( 2.0f, ( qp - 12.0f ) / 6.0f );
}
static inline float qscale2qp( float qscale )
{
    return 12.0f + 6.0f * log2f( qscale/0.85f );
}
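
/* Sanity check, following directly from the formulas above: qp2qscale(12)
 * == 0.85f, and each +6 QP doubles qscale, so qp2qscale(18) == 1.70f and
 * qscale2qp(1.70f) == 18. */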

/* Texture bitrate is not quite inversely proportional to qscale,
 * probably due to the changing number of SKIP blocks.
 * MV bits level off at about qp<=12, because the lambda used
 * for motion estimation is constant there. */
static inline double qscale2bits( ratecontrol_entry_t *rce, double qscale )
{
    if( qscale<0.1 )
        qscale = 0.1;
    return (rce->tex_bits + .1) * pow( rce->qscale / qscale, 1.1 )
           + rce->mv_bits * pow( X264_MAX(rce->qscale, 1) / X264_MAX(qscale, 1), 0.5 )
           + rce->misc_bits;
}
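
/* Rough numbers implied by the exponents above (for qscales above 1):
 * halving qscale multiplies texture bits by 2^1.1 ~= 2.14 and MV bits
 * by sqrt(2) ~= 1.41. */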

static ALWAYS_INLINE uint32_t ac_energy_var( uint64_t sum_ssd, int shift, x264_frame_t *frame, int i, int b_store )
{
    uint32_t sum = sum_ssd;
    uint32_t ssd = sum_ssd >> 32;
    if( b_store )
    {
        frame->i_pixel_sum[i] += sum;
        frame->i_pixel_ssd[i] += ssd;
    }
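    /* AC energy: SSD - sum^2/N, i.e. the variance scaled by the N = 1<<shift pixel count. */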
    return ssd - ((uint64_t)sum * sum >> shift);
}

static ALWAYS_INLINE uint32_t ac_energy_plane( x264_t *h, int mb_x, int mb_y, x264_frame_t *frame, int i, int b_chroma, int b_field, int b_store )
{
    int height = b_chroma ? 16>>CHROMA_V_SHIFT : 16;
    int stride = frame->i_stride[i];
    int offset = b_field
        ? 16 * mb_x + height * (mb_y&~1) * stride + (mb_y&1) * stride
        : 16 * mb_x + height * mb_y * stride;
    stride <<= b_field;
    if( b_chroma )
    {
        ALIGNED_ARRAY_16( pixel, pix,[FENC_STRIDE*16] );
        int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
        int shift = 7 - CHROMA_V_SHIFT;

        h->mc.load_deinterleave_chroma_fenc( pix, frame->plane[1] + offset, stride, height );
        return ac_energy_var( h->pixf.var[chromapix]( pix,               FENC_STRIDE ), shift, frame, 1, b_store )
             + ac_energy_var( h->pixf.var[chromapix]( pix+FENC_STRIDE/2, FENC_STRIDE ), shift, frame, 2, b_store );
    }
    else
        return ac_energy_var( h->pixf.var[PIXEL_16x16]( frame->plane[i] + offset, stride ), 8, frame, i, b_store );
}

// Find the total AC energy of the block in all planes.
static NOINLINE uint32_t x264_ac_energy_mb( x264_t *h, int mb_x, int mb_y, x264_frame_t *frame )
{
    /* This function contains annoying hacks because GCC has a habit of reordering emms
     * and putting it after floating point ops.  As a result, we put the emms at the end of the
     * function and make sure that it's always called before the float math.  Noinline makes
     * sure no reordering goes on. */
    uint32_t var;
    x264_prefetch_fenc( h, frame, mb_x, mb_y );
    if( h->mb.b_adaptive_mbaff )
    {
        /* We don't know the super-MB mode we're going to pick yet, so
         * simply try both and pick the lower of the two. */
        uint32_t var_interlaced, var_progressive;
        var_interlaced   = ac_energy_plane( h, mb_x, mb_y, frame, 0, 0, 1, 1 );
        var_progressive  = ac_energy_plane( h, mb_x, mb_y, frame, 0, 0, 0, 0 );
        if( CHROMA444 )
        {
            var_interlaced  += ac_energy_plane( h, mb_x, mb_y, frame, 1, 0, 1, 1 );
            var_progressive += ac_energy_plane( h, mb_x, mb_y, frame, 1, 0, 0, 0 );
            var_interlaced  += ac_energy_plane( h, mb_x, mb_y, frame, 2, 0, 1, 1 );
            var_progressive += ac_energy_plane( h, mb_x, mb_y, frame, 2, 0, 0, 0 );
        }
        else
        {
            var_interlaced  += ac_energy_plane( h, mb_x, mb_y, frame, 1, 1, 1, 1 );
            var_progressive += ac_energy_plane( h, mb_x, mb_y, frame, 1, 1, 0, 0 );
        }
        var = X264_MIN( var_interlaced, var_progressive );
    }
    else
    {
        var  = ac_energy_plane( h, mb_x, mb_y, frame, 0, 0, PARAM_INTERLACED, 1 );
        if( CHROMA444 )
        {
            var += ac_energy_plane( h, mb_x, mb_y, frame, 1, 0, PARAM_INTERLACED, 1 );
            var += ac_energy_plane( h, mb_x, mb_y, frame, 2, 0, PARAM_INTERLACED, 1 );
        }
        else
            var += ac_energy_plane( h, mb_x, mb_y, frame, 1, 1, PARAM_INTERLACED, 1 );
    }
    x264_emms();
    return var;
}

void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame, float *quant_offsets )
{
    /* constants chosen to result in approximately the same overall bitrate as without AQ.
     * FIXME: while they're written in 5 significant digits, they're only tuned to 2. */
    float strength;
    float avg_adj = 0.f;
    /* Initialize frame stats */
    for( int i = 0; i < 3; i++ )
    {
        frame->i_pixel_sum[i] = 0;
        frame->i_pixel_ssd[i] = 0;
    }

    /* Degenerate cases */
    if( h->param.rc.i_aq_mode == X264_AQ_NONE || h->param.rc.f_aq_strength == 0 )
    {
        /* Need to init it anyways for MB tree */
        if( h->param.rc.i_aq_mode && h->param.rc.f_aq_strength == 0 )
        {
            if( quant_offsets )
            {
                for( int mb_xy = 0; mb_xy < h->mb.i_mb_count; mb_xy++ )
                    frame->f_qp_offset[mb_xy] = frame->f_qp_offset_aq[mb_xy] = quant_offsets[mb_xy];
                if( h->frames.b_have_lowres )
                    for( int mb_xy = 0; mb_xy < h->mb.i_mb_count; mb_xy++ )
                        frame->i_inv_qscale_factor[mb_xy] = x264_exp2fix8( frame->f_qp_offset[mb_xy] );
            }
            else
            {
                memset( frame->f_qp_offset, 0, h->mb.i_mb_count * sizeof(float) );
                memset( frame->f_qp_offset_aq, 0, h->mb.i_mb_count * sizeof(float) );
                if( h->frames.b_have_lowres )
                    for( int mb_xy = 0; mb_xy < h->mb.i_mb_count; mb_xy++ )
                        frame->i_inv_qscale_factor[mb_xy] = 256;
            }
        }
        /* Need variance data for weighted prediction */
        if( h->param.analyse.i_weighted_pred )
        {
            for( int mb_y = 0; mb_y < h->mb.i_mb_height; mb_y++ )
                for( int mb_x = 0; mb_x < h->mb.i_mb_width; mb_x++ )
                    x264_ac_energy_mb( h, mb_x, mb_y, frame );
        }
        else
            return;
    }
    /* Actual adaptive quantization */
    else
    {
        if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE )
        {
            float bit_depth_correction = powf(1 << (BIT_DEPTH-8), 0.5f);
            float avg_adj_pow2 = 0.f;
            for( int mb_y = 0; mb_y < h->mb.i_mb_height; mb_y++ )
                for( int mb_x = 0; mb_x < h->mb.i_mb_width; mb_x++ )
                {
                    uint32_t energy = x264_ac_energy_mb( h, mb_x, mb_y, frame );
                    float qp_adj = powf( energy + 1, 0.125f );
                    frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride] = qp_adj;
                    avg_adj += qp_adj;
                    avg_adj_pow2 += qp_adj * qp_adj;
                }
            avg_adj /= h->mb.i_mb_count;
            avg_adj_pow2 /= h->mb.i_mb_count;
            strength = h->param.rc.f_aq_strength * avg_adj / bit_depth_correction;
            avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - (14.f * bit_depth_correction)) / avg_adj;
        }
        else
            strength = h->param.rc.f_aq_strength * 1.0397f;

        for( int mb_y = 0; mb_y < h->mb.i_mb_height; mb_y++ )
            for( int mb_x = 0; mb_x < h->mb.i_mb_width; mb_x++ )
            {
                float qp_adj;
                int mb_xy = mb_x + mb_y*h->mb.i_mb_stride;
                if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE )
                {
                    qp_adj = frame->f_qp_offset[mb_xy];
                    qp_adj = strength * (qp_adj - avg_adj);
                }
                else
                {
                    uint32_t energy = x264_ac_energy_mb( h, mb_x, mb_y, frame );
                    qp_adj = strength * (x264_log2( X264_MAX(energy, 1) ) - (14.427f + 2*(BIT_DEPTH-8)));
                }
                if( quant_offsets )
                    qp_adj += quant_offsets[mb_xy];
                frame->f_qp_offset[mb_xy] =
                frame->f_qp_offset_aq[mb_xy] = qp_adj;
                if( h->frames.b_have_lowres )
                    frame->i_inv_qscale_factor[mb_xy] = x264_exp2fix8(qp_adj);
            }
    }

    /* Remove mean from SSD calculation */
    for( int i = 0; i < 3; i++ )
    {
        uint64_t ssd = frame->i_pixel_ssd[i];
        uint64_t sum = frame->i_pixel_sum[i];
        int width  = 16*h->mb.i_mb_width  >> (i && CHROMA_H_SHIFT);
        int height = 16*h->mb.i_mb_height >> (i && CHROMA_V_SHIFT);
        frame->i_pixel_ssd[i] = ssd - (sum * sum + width * height / 2) / (width * height);
    }
}

int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame, float *quant_offsets )
{
    x264_ratecontrol_t *rc = h->rc;
    uint8_t i_type_actual = rc->entry[frame->i_frame].pict_type;

    if( rc->entry[frame->i_frame].kept_as_ref )
    {
        uint8_t i_type;
        if( rc->qpbuf_pos < 0 )
        {
            do
            {
                rc->qpbuf_pos++;

                if( !fread( &i_type, 1, 1, rc->p_mbtree_stat_file_in ) )
                    goto fail;
                if( fread( rc->qp_buffer[rc->qpbuf_pos], sizeof(uint16_t), h->mb.i_mb_count, rc->p_mbtree_stat_file_in ) != h->mb.i_mb_count )
                    goto fail;

                if( i_type != i_type_actual && rc->qpbuf_pos == 1 )
                {
                    x264_log( h, X264_LOG_ERROR, "MB-tree frametype %d doesn't match actual frametype %d.\n", i_type, i_type_actual );
                    return -1;
                }
            } while( i_type != i_type_actual );
        }

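        /* qp_buffer entries are int16_t fixed-point QP offsets in units of
         * 1/256 QP; endian_fix16() normalizes file vs. host byte order. */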
        for( int i = 0; i < h->mb.i_mb_count; i++ )
        {
            frame->f_qp_offset[i] = ((float)(int16_t)endian_fix16( rc->qp_buffer[rc->qpbuf_pos][i] )) * (1/256.0);
            if( h->frames.b_have_lowres )
                frame->i_inv_qscale_factor[i] = x264_exp2fix8(frame->f_qp_offset[i]);
        }
        rc->qpbuf_pos--;
    }
    else
        x264_stack_align( x264_adaptive_quant_frame, h, frame, quant_offsets );
    return 0;
fail:
    x264_log( h, X264_LOG_ERROR, "Incomplete MB-tree stats file.\n" );
    return -1;
}

int x264_reference_build_list_optimal( x264_t *h )
{
    ratecontrol_entry_t *rce = h->rc->rce;
    x264_frame_t *frames[16];
    x264_weight_t weights[16][3];
    int refcount[16];

    if( rce->refs != h->i_ref[0] )
        return -1;

    memcpy( frames, h->fref[0], sizeof(frames) );
    memcpy( refcount, rce->refcount, sizeof(refcount) );
    memcpy( weights, h->fenc->weight, sizeof(weights) );
    memset( &h->fenc->weight[1][0], 0, sizeof(x264_weight_t[15][3]) );

    /* For now don't reorder ref 0; it seems to lower quality
       in most cases due to skips. */
    for( int ref = 1; ref < h->i_ref[0]; ref++ )
    {
        int max = -1;
        int bestref = 1;

        for( int i = 1; i < h->i_ref[0]; i++ )
            /* Favor lower POC as a tiebreaker. */
            COPY2_IF_GT( max, refcount[i], bestref, i );

        /* FIXME: If there are duplicates from frames other than ref0, then it is possible
         * that the optimal ordering doesn't place every duplicate. */

        refcount[bestref] = -1;
        h->fref[0][ref] = frames[bestref];
        memcpy( h->fenc->weight[ref], weights[bestref], sizeof(weights[bestref]) );
    }

    return 0;
}

static char *x264_strcat_filename( char *input, char *suffix )
{
    char *output = x264_malloc( strlen( input ) + strlen( suffix ) + 1 );
    if( !output )
        return NULL;
    strcpy( output, input );
    strcat( output, suffix );
    return output;
}

void x264_ratecontrol_init_reconfigurable( x264_t *h, int b_init )
{
    x264_ratecontrol_t *rc = h->rc;
    if( !b_init && rc->b_2pass )
        return;

    if( h->param.rc.i_rc_method == X264_RC_CRF )
    {
        /* Arbitrary rescaling to make CRF somewhat similar to QP.
         * Try to compensate for MB-tree's effects as well. */
        double base_cplx = h->mb.i_mb_count * (h->param.i_bframe ? 120 : 80);
        double mbtree_offset = h->param.rc.b_mb_tree ? (1.0-h->param.rc.f_qcompress)*13.5 : 0;
        rc->rate_factor_constant = pow( base_cplx, 1 - rc->qcompress )
                                 / qp2qscale( h->param.rc.f_rf_constant + mbtree_offset + QP_BD_OFFSET );
    }

    if( h->param.rc.i_vbv_max_bitrate > 0 && h->param.rc.i_vbv_buffer_size > 0 )
    {
        /* We don't support changing the ABR bitrate right now,
           so if the stream starts as CBR, keep it CBR. */
        if( rc->b_vbv_min_rate )
            h->param.rc.i_vbv_max_bitrate = h->param.rc.i_bitrate;

        if( h->param.rc.i_vbv_buffer_size < (int)(h->param.rc.i_vbv_max_bitrate / rc->fps) )
        {
            h->param.rc.i_vbv_buffer_size = h->param.rc.i_vbv_max_bitrate / rc->fps;
            x264_log( h, X264_LOG_WARNING, "VBV buffer size cannot be smaller than one frame, using %d kbit\n",
                      h->param.rc.i_vbv_buffer_size );
        }

        int vbv_buffer_size = h->param.rc.i_vbv_buffer_size * 1000;
        int vbv_max_bitrate = h->param.rc.i_vbv_max_bitrate * 1000;

        /* Init HRD */
        if( h->param.i_nal_hrd && b_init )
        {
            h->sps->vui.hrd.i_cpb_cnt = 1;
            h->sps->vui.hrd.b_cbr_hrd = h->param.i_nal_hrd == X264_NAL_HRD_CBR;
            h->sps->vui.hrd.i_time_offset_length = 0;

            #define BR_SHIFT  6
            #define CPB_SHIFT 4

            int bitrate = 1000*h->param.rc.i_vbv_max_bitrate;
            int bufsize = 1000*h->param.rc.i_vbv_buffer_size;

            // normalize HRD size and rate to the value / scale notation
            h->sps->vui.hrd.i_bit_rate_scale = x264_clip3( x264_ctz( bitrate ) - BR_SHIFT, 0, 15 );
            h->sps->vui.hrd.i_bit_rate_value = bitrate >> ( h->sps->vui.hrd.i_bit_rate_scale + BR_SHIFT );
            h->sps->vui.hrd.i_bit_rate_unscaled = h->sps->vui.hrd.i_bit_rate_value << ( h->sps->vui.hrd.i_bit_rate_scale + BR_SHIFT );
            h->sps->vui.hrd.i_cpb_size_scale = x264_clip3( x264_ctz( bufsize ) - CPB_SHIFT, 0, 15 );
            h->sps->vui.hrd.i_cpb_size_value = bufsize >> ( h->sps->vui.hrd.i_cpb_size_scale + CPB_SHIFT );
            h->sps->vui.hrd.i_cpb_size_unscaled = h->sps->vui.hrd.i_cpb_size_value << ( h->sps->vui.hrd.i_cpb_size_scale + CPB_SHIFT );
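
            /* Worked example with a hypothetical vbv-maxrate of 5000 kbit/s:
             * bitrate = 5000000 = 78125 << 6, so i_bit_rate_scale =
             * clip3( 6 - BR_SHIFT, 0, 15 ) = 0, i_bit_rate_value = 78125, and
             * i_bit_rate_unscaled reconstructs 5000000 exactly; rates with
             * fewer trailing zero bits are rounded down instead. */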

            #undef CPB_SHIFT
            #undef BR_SHIFT

            // arbitrary
            #define MAX_DURATION 0.5

            int max_cpb_output_delay = X264_MIN( h->param.i_keyint_max * MAX_DURATION * h->sps->vui.i_time_scale / h->sps->vui.i_num_units_in_tick, INT_MAX );
            int max_dpb_output_delay = h->sps->vui.i_max_dec_frame_buffering * MAX_DURATION * h->sps->vui.i_time_scale / h->sps->vui.i_num_units_in_tick;
            int max_delay = (int)(90000.0 * (double)h->sps->vui.hrd.i_cpb_size_unscaled / h->sps->vui.hrd.i_bit_rate_unscaled + 0.5);

            h->sps->vui.hrd.i_initial_cpb_removal_delay_length = 2 + x264_clip3( 32 - x264_clz( max_delay ), 4, 22 );
            h->sps->vui.hrd.i_cpb_removal_delay_length = x264_clip3( 32 - x264_clz( max_cpb_output_delay ), 4, 31 );
            h->sps->vui.hrd.i_dpb_output_delay_length  = x264_clip3( 32 - x264_clz( max_dpb_output_delay ), 4, 31 );

            #undef MAX_DURATION

            vbv_buffer_size = h->sps->vui.hrd.i_cpb_size_unscaled;
            vbv_max_bitrate = h->sps->vui.hrd.i_bit_rate_unscaled;
        }
        else if( h->param.i_nal_hrd && !b_init )
        {
            x264_log( h, X264_LOG_WARNING, "VBV parameters cannot be changed when NAL HRD is in use\n" );
            return;
        }
        h->sps->vui.hrd.i_bit_rate_unscaled = vbv_max_bitrate;
        h->sps->vui.hrd.i_cpb_size_unscaled = vbv_buffer_size;

        if( rc->b_vbv_min_rate )
            rc->bitrate = h->param.rc.i_bitrate * 1000.;
        rc->buffer_rate = vbv_max_bitrate / rc->fps;
        rc->vbv_max_rate = vbv_max_bitrate;
        rc->buffer_size = vbv_buffer_size;
        rc->single_frame_vbv = rc->buffer_rate * 1.1 > rc->buffer_size;
        rc->cbr_decay = 1.0 - rc->buffer_rate / rc->buffer_size
                      * 0.5 * X264_MAX(0, 1.5 - rc->buffer_rate * rc->fps / rc->bitrate);
        if( h->param.rc.i_rc_method == X264_RC_CRF && h->param.rc.f_rf_constant_max )
        {
            rc->rate_factor_max_increment = h->param.rc.f_rf_constant_max - h->param.rc.f_rf_constant;
            if( rc->rate_factor_max_increment <= 0 )
            {
                x264_log( h, X264_LOG_WARNING, "CRF max must be greater than CRF\n" );
                rc->rate_factor_max_increment = 0;
            }
        }
        if( b_init )
        {
            if( h->param.rc.f_vbv_buffer_init > 1. )
                h->param.rc.f_vbv_buffer_init = x264_clip3f( h->param.rc.f_vbv_buffer_init / h->param.rc.i_vbv_buffer_size, 0, 1 );
            h->param.rc.f_vbv_buffer_init = x264_clip3f( X264_MAX( h->param.rc.f_vbv_buffer_init, rc->buffer_rate / rc->buffer_size ), 0, 1);
            rc->buffer_fill_final = rc->buffer_size * h->param.rc.f_vbv_buffer_init * h->sps->vui.i_time_scale;
            rc->b_vbv = 1;
            rc->b_vbv_min_rate = !rc->b_2pass
                          && h->param.rc.i_rc_method == X264_RC_ABR
                          && h->param.rc.i_vbv_max_bitrate <= h->param.rc.i_bitrate;
        }
    }
}

int x264_ratecontrol_new( x264_t *h )
{
    x264_ratecontrol_t *rc;

    x264_emms();

    CHECKED_MALLOCZERO( h->rc, h->param.i_threads * sizeof(x264_ratecontrol_t) );
    rc = h->rc;

    rc->b_abr = h->param.rc.i_rc_method != X264_RC_CQP && !h->param.rc.b_stat_read;
    rc->b_2pass = h->param.rc.i_rc_method == X264_RC_ABR && h->param.rc.b_stat_read;

    /* FIXME: use integers */
    if( h->param.i_fps_num > 0 && h->param.i_fps_den > 0 )
        rc->fps = (float) h->param.i_fps_num / h->param.i_fps_den;
    else
        rc->fps = 25.0;

    if( h->param.rc.b_mb_tree )
    {
        h->param.rc.f_pb_factor = 1;
        rc->qcompress = 1;
    }
    else
        rc->qcompress = h->param.rc.f_qcompress;

    rc->bitrate = h->param.rc.i_bitrate * 1000.;
    rc->rate_tolerance = h->param.rc.f_rate_tolerance;
    rc->nmb = h->mb.i_mb_count;
    rc->last_non_b_pict_type = -1;
    rc->cbr_decay = 1.0;

    if( h->param.rc.i_rc_method == X264_RC_CRF && h->param.rc.b_stat_read )
    {
        x264_log( h, X264_LOG_ERROR, "constant rate-factor is incompatible with 2pass.\n" );
        return -1;
    }

    x264_ratecontrol_init_reconfigurable( h, 1 );

    if( h->param.i_nal_hrd )
    {
        uint64_t denom = (uint64_t)h->sps->vui.hrd.i_bit_rate_unscaled * h->sps->vui.i_time_scale;
        uint64_t num = 180000;
        x264_reduce_fraction64( &num, &denom );
        rc->hrd_multiply_denom = 180000 / num;

        double bits_required = log2( 180000 / rc->hrd_multiply_denom )
                             + log2( h->sps->vui.i_time_scale )
                             + log2( h->sps->vui.hrd.i_cpb_size_unscaled );
        if( bits_required >= 63 )
        {
            x264_log( h, X264_LOG_ERROR, "HRD with very large timescale and bufsize not supported\n" );
            return -1;
        }
    }

    if( rc->rate_tolerance < 0.01 )
    {
        x264_log( h, X264_LOG_WARNING, "bitrate tolerance too small, using .01\n" );
        rc->rate_tolerance = 0.01;
    }

    h->mb.b_variable_qp = rc->b_vbv || h->param.rc.i_aq_mode;

    if( rc->b_abr )
    {
        /* FIXME ABR_INIT_QP is actually used only in CRF */
#define ABR_INIT_QP (( h->param.rc.i_rc_method == X264_RC_CRF ? h->param.rc.f_rf_constant : 24 ) + QP_BD_OFFSET)
        rc->accum_p_norm = .01;
        rc->accum_p_qp = ABR_INIT_QP * rc->accum_p_norm;
        /* estimated ratio that produces a reasonable QP for the first I-frame */
        rc->cplxr_sum = .01 * pow( 7.0e5, rc->qcompress ) * pow( h->mb.i_mb_count, 0.5 );
        rc->wanted_bits_window = 1.0 * rc->bitrate / rc->fps;
        rc->last_non_b_pict_type = SLICE_TYPE_I;
    }

    rc->ip_offset = 6.0 * log2f( h->param.rc.f_ip_factor );
    rc->pb_offset = 6.0 * log2f( h->param.rc.f_pb_factor );
    rc->qp_constant[SLICE_TYPE_P] = h->param.rc.i_qp_constant;
    rc->qp_constant[SLICE_TYPE_I] = x264_clip3( h->param.rc.i_qp_constant - rc->ip_offset + 0.5, 0, QP_MAX );
    rc->qp_constant[SLICE_TYPE_B] = x264_clip3( h->param.rc.i_qp_constant + rc->pb_offset + 0.5, 0, QP_MAX );
    h->mb.ip_offset = rc->ip_offset + 0.5;

    rc->lstep = pow( 2, h->param.rc.i_qp_step / 6.0 );
    rc->last_qscale = qp2qscale( 26 );
    int num_preds = h->param.b_sliced_threads * h->param.i_threads + 1;
    CHECKED_MALLOC( rc->pred, 5 * sizeof(predictor_t) * num_preds );
    CHECKED_MALLOC( rc->pred_b_from_p, sizeof(predictor_t) );
    for( int i = 0; i < 3; i++ )
    {
        rc->last_qscale_for[i] = qp2qscale( ABR_INIT_QP );
        rc->lmin[i] = qp2qscale( h->param.rc.i_qp_min );
        rc->lmax[i] = qp2qscale( h->param.rc.i_qp_max );
        for( int j = 0; j < num_preds; j++ )
        {
            rc->pred[i+j*5].coeff= 2.0;
            rc->pred[i+j*5].count= 1.0;
            rc->pred[i+j*5].decay= 0.5;
            rc->pred[i+j*5].offset= 0.0;
        }
        for( int j = 0; j < 2; j++ )
        {
            rc->row_preds[i][j].coeff= .25;
            rc->row_preds[i][j].count= 1.0;
            rc->row_preds[i][j].decay= 0.5;
            rc->row_preds[i][j].offset= 0.0;
        }
    }
    *rc->pred_b_from_p = rc->pred[0];

    if( parse_zones( h ) < 0 )
    {
        x264_log( h, X264_LOG_ERROR, "failed to parse zones\n" );
        return -1;
    }

    /* Load stat file and init 2pass algo */
    if( h->param.rc.b_stat_read )
    {
        char *p, *stats_in, *stats_buf;

        /* read 1st pass stats */
        assert( h->param.rc.psz_stat_in );
        stats_buf = stats_in = x264_slurp_file( h->param.rc.psz_stat_in );
        if( !stats_buf )
        {
            x264_log( h, X264_LOG_ERROR, "ratecontrol_init: can't open stats file\n" );
            return -1;
        }
        if( h->param.rc.b_mb_tree )
        {
            char *mbtree_stats_in = x264_strcat_filename( h->param.rc.psz_stat_in, ".mbtree" );
            if( !mbtree_stats_in )
                return -1;
            rc->p_mbtree_stat_file_in = fopen( mbtree_stats_in, "rb" );
            x264_free( mbtree_stats_in );
            if( !rc->p_mbtree_stat_file_in )
            {
                x264_log( h, X264_LOG_ERROR, "ratecontrol_init: can't open mbtree stats file\n" );
                return -1;
            }
        }

        /* check whether 1st pass options were compatible with current options */
        if( strncmp( stats_buf, "#options:", 9 ) )
        {
            x264_log( h, X264_LOG_ERROR, "options list in stats file not valid\n" );
            return -1;
        }

        float res_factor, res_factor_bits;
        {
            int i, j;
            uint32_t k, l;
            char *opts = stats_buf;
            stats_in = strchr( stats_buf, '\n' );
            if( !stats_in )
                return -1;
            *stats_in = '\0';
            stats_in++;
            if( sscanf( opts, "#options: %dx%d", &i, &j ) != 2 )
            {
                x264_log( h, X264_LOG_ERROR, "resolution specified in stats file not valid\n" );
                return -1;
            }
            else if( h->param.rc.b_mb_tree && (i != h->param.i_width || j != h->param.i_height)  )
            {
                x264_log( h, X264_LOG_ERROR, "MB-tree doesn't support different resolution than 1st pass (%dx%d vs %dx%d)\n",
                          h->param.i_width, h->param.i_height, i, j );
                return -1;
            }
            res_factor = (float)h->param.i_width * h->param.i_height / (i*j);
            /* Change in bits relative to resolution isn't quite linear on typical sources,
             * so we'll at least try to roughly approximate this effect. */
            res_factor_bits = powf( res_factor, 0.7 );

            if( ( p = strstr( opts, "timebase=" ) ) && sscanf( p, "timebase=%u/%u", &k, &l ) != 2 )
            {
                x264_log( h, X264_LOG_ERROR, "timebase specified in stats file not valid\n" );
                return -1;
            }
            if( k != h->param.i_timebase_num || l != h->param.i_timebase_den )
            {
                x264_log( h, X264_LOG_ERROR, "timebase mismatch with 1st pass (%u/%u vs %u/%u)\n",
                          h->param.i_timebase_num, h->param.i_timebase_den, k, l );
                return -1;
            }

            CMP_OPT_FIRST_PASS( "bitdepth", BIT_DEPTH );
            CMP_OPT_FIRST_PASS( "weightp", X264_MAX( 0, h->param.analyse.i_weighted_pred ) );
            CMP_OPT_FIRST_PASS( "bframes", h->param.i_bframe );
            CMP_OPT_FIRST_PASS( "b_pyramid", h->param.i_bframe_pyramid );
            CMP_OPT_FIRST_PASS( "intra_refresh", h->param.b_intra_refresh );
            CMP_OPT_FIRST_PASS( "open_gop", h->param.b_open_gop );
            CMP_OPT_FIRST_PASS( "bluray_compat", h->param.b_bluray_compat );

            if( (p = strstr( opts, "interlaced=" )) )
            {
                char *current = h->param.b_interlaced ? h->param.b_tff ? "tff" : "bff" : h->param.b_fake_interlaced ? "fake" : "0";
                char buf[5];
                sscanf( p, "interlaced=%4s", buf );
                if( strcmp( current, buf ) )
                {
                    x264_log( h, X264_LOG_ERROR, "different interlaced setting than first pass (%s vs %s)\n", current, buf );
                    return -1;
                }
            }

            if( (p = strstr( opts, "keyint=" )) )
            {
                p += 7;
                char buf[13] = "infinite ";
                if( h->param.i_keyint_max != X264_KEYINT_MAX_INFINITE )
                    sprintf( buf, "%d ", h->param.i_keyint_max );
                if( strncmp( p, buf, strlen(buf) ) )
                {
                    x264_log( h, X264_LOG_ERROR, "different keyint setting than first pass (%.*s vs %.*s)\n",
                              strlen(buf)-1, buf, strcspn(p, " "), p );
                    return -1;
                }
            }

            if( strstr( opts, "qp=0" ) && h->param.rc.i_rc_method == X264_RC_ABR )
                x264_log( h, X264_LOG_WARNING, "1st pass was lossless, bitrate prediction will be inaccurate\n" );

            if( !strstr( opts, "direct=3" ) && h->param.analyse.i_direct_mv_pred == X264_DIRECT_PRED_AUTO )
            {
                x264_log( h, X264_LOG_WARNING, "direct=auto not used on the first pass\n" );
                h->mb.b_direct_auto_write = 1;
            }

            if( ( p = strstr( opts, "b_adapt=" ) ) && sscanf( p, "b_adapt=%d", &i ) && i >= X264_B_ADAPT_NONE && i <= X264_B_ADAPT_TRELLIS )
                h->param.i_bframe_adaptive = i;
            else if( h->param.i_bframe )
            {
                x264_log( h, X264_LOG_ERROR, "b_adapt method specified in stats file not valid\n" );
                return -1;
            }

            if( (h->param.rc.b_mb_tree || h->param.rc.i_vbv_buffer_size) && ( p = strstr( opts, "rc_lookahead=" ) ) && sscanf( p, "rc_lookahead=%d", &i ) )
                h->param.rc.i_lookahead = i;
        }

        /* find number of pics */
        p = stats_in;
        int num_entries;
        for( num_entries = -1; p; num_entries++ )
            p = strchr( p + 1, ';' );
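        /* Each stats entry is terminated by ';', so counting ';' gives the
         * frame count; num_entries starts at -1 to offset the final loop
         * iteration, in which strchr() returns NULL. */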
        if( !num_entries )
        {
            x264_log( h, X264_LOG_ERROR, "empty stats file\n" );
            return -1;
        }
        rc->num_entries = num_entries;

        if( h->param.i_frame_total < rc->num_entries && h->param.i_frame_total > 0 )
        {
            x264_log( h, X264_LOG_WARNING, "2nd pass has fewer frames than 1st pass (%d vs %d)\n",
                      h->param.i_frame_total, rc->num_entries );
        }
        if( h->param.i_frame_total > rc->num_entries )
        {
            x264_log( h, X264_LOG_ERROR, "2nd pass has more frames than 1st pass (%d vs %d)\n",
                      h->param.i_frame_total, rc->num_entries );
            return -1;
        }

        CHECKED_MALLOCZERO( rc->entry, rc->num_entries * sizeof(ratecontrol_entry_t) );

        /* init all to skipped p frames */
        for( int i = 0; i < rc->num_entries; i++ )
        {
            ratecontrol_entry_t *rce = &rc->entry[i];
            rce->pict_type = SLICE_TYPE_P;
            rce->qscale = rce->new_qscale = qp2qscale( 20 );
            rce->misc_bits = rc->nmb + 10;
            rce->new_qp = 0;
        }

        /* read stats */
        p = stats_in;
        for( int i = 0; i < rc->num_entries; i++ )
        {
            ratecontrol_entry_t *rce;
            int frame_number;
            char pict_type;
            int e;
            char *next;
            float qp;
            int ref;

            next= strchr(p, ';');
            if( next )
                *next++ = 0; //sscanf is unbelievably slow on long strings
            e = sscanf( p, " in:%d ", &frame_number );

            if( frame_number < 0 || frame_number >= rc->num_entries )
            {
                x264_log( h, X264_LOG_ERROR, "bad frame number (%d) at stats line %d\n", frame_number, i );
                return -1;
            }
            rce = &rc->entry[frame_number];
            rce->direct_mode = 0;

            e += sscanf( p, " in:%*d out:%*d type:%c dur:%"SCNd64" cpbdur:%"SCNd64" q:%f tex:%d mv:%d misc:%d imb:%d pmb:%d smb:%d d:%c",
                   &pict_type, &rce->i_duration, &rce->i_cpb_duration, &qp, &rce->tex_bits,
                   &rce->mv_bits, &rce->misc_bits, &rce->i_count, &rce->p_count,
                   &rce->s_count, &rce->direct_mode );
            rce->tex_bits  *= res_factor_bits;
            rce->mv_bits   *= res_factor_bits;
            rce->misc_bits *= res_factor_bits;
            rce->i_count   *= res_factor;
            rce->p_count   *= res_factor;
            rce->s_count   *= res_factor;

            p = strstr( p, "ref:" );
            if( !p )
                goto parse_error;
            p += 4;
            for( ref = 0; ref < 16; ref++ )
            {
                if( sscanf( p, " %d", &rce->refcount[ref] ) != 1 )
                    break;
                p = strchr( p+1, ' ' );
                if( !p )
                    goto parse_error;
            }
            rce->refs = ref;

            /* find weights */
            rce->i_weight_denom[0] = rce->i_weight_denom[1] = -1;
            char *w = strchr( p, 'w' );
            if( w )
            {
                int count = sscanf( w, "w:%hd,%hd,%hd,%hd,%hd,%hd,%hd,%hd",
                                    &rce->i_weight_denom[0], &rce->weight[0][0], &rce->weight[0][1],
                                    &rce->i_weight_denom[1], &rce->weight[1][0], &rce->weight[1][1],
                                    &rce->weight[2][0], &rce->weight[2][1] );
                if( count == 3 )
                    rce->i_weight_denom[1] = -1;
                else if ( count != 8 )
                    rce->i_weight_denom[0] = rce->i_weight_denom[1] = -1;
            }

            if( pict_type != 'b' )
                rce->kept_as_ref = 1;
            switch( pict_type )
            {
                case 'I':
                    rce->frame_type = X264_TYPE_IDR;
                    rce->pict_type  = SLICE_TYPE_I;
                    break;
                case 'i':
                    rce->frame_type = X264_TYPE_I;
                    rce->pict_type  = SLICE_TYPE_I;
                    break;
                case 'P':
                    rce->frame_type = X264_TYPE_P;
                    rce->pict_type  = SLICE_TYPE_P;
                    break;
                case 'B':
                    rce->frame_type = X264_TYPE_BREF;
                    rce->pict_type  = SLICE_TYPE_B;
                    break;
                case 'b':
                    rce->frame_type = X264_TYPE_B;
                    rce->pict_type  = SLICE_TYPE_B;
                    break;
                default:  e = -1; break;
            }
            if( e < 12 )
            {
parse_error:
                x264_log( h, X264_LOG_ERROR, "statistics are damaged at line %d, parser out=%d\n", i, e );
                return -1;
            }
            rce->qscale = qp2qscale( qp );
            p = next;
        }

        x264_free( stats_buf );

        if( h->param.rc.i_rc_method == X264_RC_ABR )
        {
            if( init_pass2( h ) < 0 )
                return -1;
        } /* else we're using constant quant, so no need to run the bitrate allocation */
    }

    /* Open output file */
    /* If input and output files are the same, output to a temp file
     * and move it to the real name only when it's complete */
    if( h->param.rc.b_stat_write )
    {
        char *p;
        rc->psz_stat_file_tmpname = x264_strcat_filename( h->param.rc.psz_stat_out, ".temp" );
        if( !rc->psz_stat_file_tmpname )
            return -1;

        rc->p_stat_file_out = fopen( rc->psz_stat_file_tmpname, "wb" );
        if( rc->p_stat_file_out == NULL )
        {
            x264_log( h, X264_LOG_ERROR, "ratecontrol_init: can't open stats file\n" );
            return -1;
        }

        p = x264_param2string( &h->param, 1 );
        if( p )
            fprintf( rc->p_stat_file_out, "#options: %s\n", p );
        x264_free( p );
        if( h->param.rc.b_mb_tree && !h->param.rc.b_stat_read )
        {
            rc->psz_mbtree_stat_file_tmpname = x264_strcat_filename( h->param.rc.psz_stat_out, ".mbtree.temp" );
            rc->psz_mbtree_stat_file_name = x264_strcat_filename( h->param.rc.psz_stat_out, ".mbtree" );
            if( !rc->psz_mbtree_stat_file_tmpname || !rc->psz_mbtree_stat_file_name )
                return -1;

            rc->p_mbtree_stat_file_out = fopen( rc->psz_mbtree_stat_file_tmpname, "wb" );
            if( rc->p_mbtree_stat_file_out == NULL )
            {
                x264_log( h, X264_LOG_ERROR, "ratecontrol_init: can't open mbtree stats file\n" );
                return -1;
            }
        }
    }

    if( h->param.rc.b_mb_tree && (h->param.rc.b_stat_read || h->param.rc.b_stat_write) )
    {
        CHECKED_MALLOC( rc->qp_buffer[0], h->mb.i_mb_count * sizeof(uint16_t) );
        if( h->param.i_bframe_pyramid && h->param.rc.b_stat_read )
            CHECKED_MALLOC( rc->qp_buffer[1], h->mb.i_mb_count * sizeof(uint16_t) );
        rc->qpbuf_pos = -1;
    }

    for( int i = 0; i<h->param.i_threads; i++ )
    {
        h->thread[i]->rc = rc+i;
        if( i )
        {
            rc[i] = rc[0];
            h->thread[i]->param = h->param;
            h->thread[i]->mb.b_variable_qp = h->mb.b_variable_qp;
            h->thread[i]->mb.ip_offset = h->mb.ip_offset;
        }
    }

    return 0;
fail:
    return -1;
}

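/* Zone syntax, as parsed below: "start,end,q=<qp>" forces a constant QP over
 * that frame range, "start,end,b=<factor>" scales the bitrate factor, and
 * either form may be followed by further ",option=value" pairs handed to
 * x264_param_parse(). Multiple zones are separated by '/' (see parse_zones). */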
static int parse_zone( x264_t *h, x264_zone_t *z, char *p )
{
    int len = 0;
    char *tok, UNUSED *saveptr=NULL;
    z->param = NULL;
    z->f_bitrate_factor = 1;
    if( 3 <= sscanf(p, "%d,%d,q=%d%n", &z->i_start, &z->i_end, &z->i_qp, &len) )
        z->b_force_qp = 1;
    else if( 3 <= sscanf(p, "%d,%d,b=%f%n", &z->i_start, &z->i_end, &z->f_bitrate_factor, &len) )
        z->b_force_qp = 0;
    else if( 2 <= sscanf(p, "%d,%d%n", &z->i_start, &z->i_end, &len) )
        z->b_force_qp = 0;
    else
    {
        x264_log( h, X264_LOG_ERROR, "invalid zone: \"%s\"\n", p );
        return -1;
    }
    p += len;
    if( !*p )
        return 0;
    CHECKED_MALLOC( z->param, sizeof(x264_param_t) );
    memcpy( z->param, &h->param, sizeof(x264_param_t) );
    z->param->param_free = x264_free;
    while( (tok = strtok_r( p, ",", &saveptr )) )
    {
        char *val = strchr( tok, '=' );
        if( val )
        {
            *val = '\0';
            val++;
        }
        if( x264_param_parse( z->param, tok, val ) )
        {
            x264_log( h, X264_LOG_ERROR, "invalid zone param: %s = %s\n", tok, val );
            return -1;
        }
        p = NULL;
    }
    return 0;
fail:
    return -1;
}

static int parse_zones( x264_t *h )
{
    x264_ratecontrol_t *rc = h->rc;
    if( h->param.rc.psz_zones && !h->param.rc.i_zones )
    {
        char *psz_zones, *p;
        CHECKED_MALLOC( psz_zones, strlen( h->param.rc.psz_zones )+1 );
        strcpy( psz_zones, h->param.rc.psz_zones );
        h->param.rc.i_zones = 1;
        for( p = psz_zones; *p; p++ )
            h->param.rc.i_zones += (*p == '/');
        CHECKED_MALLOC( h->param.rc.zones, h->param.rc.i_zones * sizeof(x264_zone_t) );
        p = psz_zones;
        for( int i = 0; i < h->param.rc.i_zones; i++ )
        {
            int i_tok = strcspn( p, "/" );
            p[i_tok] = 0;
            if( parse_zone( h, &h->param.rc.zones[i], p ) )
                return -1;
            p += i_tok + 1;
        }
        x264_free( psz_zones );
    }

    if( h->param.rc.i_zones > 0 )
    {
        for( int i = 0; i < h->param.rc.i_zones; i++ )
        {
            x264_zone_t z = h->param.rc.zones[i];
            if( z.i_start < 0 || z.i_start > z.i_end )
            {
                x264_log( h, X264_LOG_ERROR, "invalid zone: start=%d end=%d\n",
                          z.i_start, z.i_end );
                return -1;
            }
            else if( !z.b_force_qp && z.f_bitrate_factor <= 0 )
            {
                x264_log( h, X264_LOG_ERROR, "invalid zone: bitrate_factor=%f\n",
                          z.f_bitrate_factor );
                return -1;
            }
        }

        rc->i_zones = h->param.rc.i_zones + 1;
        CHECKED_MALLOC( rc->zones, rc->i_zones * sizeof(x264_zone_t) );
        memcpy( rc->zones+1, h->param.rc.zones, (rc->i_zones-1) * sizeof(x264_zone_t) );

        // default zone to fall back to if none of the others match
        rc->zones[0].i_start = 0;
        rc->zones[0].i_end = INT_MAX;
        rc->zones[0].b_force_qp = 0;
        rc->zones[0].f_bitrate_factor = 1;
        CHECKED_MALLOC( rc->zones[0].param, sizeof(x264_param_t) );
        memcpy( rc->zones[0].param, &h->param, sizeof(x264_param_t) );
        for( int i = 1; i < rc->i_zones; i++ )
        {
            if( !rc->zones[i].param )
                rc->zones[i].param = rc->zones[0].param;
        }
    }

    return 0;
fail:
    return -1;
}

static x264_zone_t *get_zone( x264_t *h, int frame_num )
{
    for( int i = h->rc->i_zones - 1; i >= 0; i-- )
    {
        x264_zone_t *z = &h->rc->zones[i];
        if( frame_num >= z->i_start && frame_num <= z->i_end )
            return z;
    }
    return NULL;
}
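
/* Illustration: the zone list is scanned from last to first, so later zones
 * win where ranges overlap, and rc->zones[0] (0..INT_MAX, bitrate factor 1.0,
 * built in parse_zones) catches any frame that no user zone matched. */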

void x264_ratecontrol_summary( x264_t *h )
{
    x264_ratecontrol_t *rc = h->rc;
    if( rc->b_abr && h->param.rc.i_rc_method == X264_RC_ABR && rc->cbr_decay > .9999 )
    {
        double base_cplx = h->mb.i_mb_count * (h->param.i_bframe ? 120 : 80);
        double mbtree_offset = h->param.rc.b_mb_tree ? (1.0-h->param.rc.f_qcompress)*13.5 : 0;
        x264_log( h, X264_LOG_INFO, "final ratefactor: %.2f\n",
                  qscale2qp( pow( base_cplx, 1 - rc->qcompress )
                             * rc->cplxr_sum / rc->wanted_bits_window ) - mbtree_offset - QP_BD_OFFSET );
    }
}

void x264_ratecontrol_delete( x264_t *h )
{
    x264_ratecontrol_t *rc = h->rc;
    int b_regular_file;

    if( rc->p_stat_file_out )
    {
        b_regular_file = x264_is_regular_file( rc->p_stat_file_out );
        fclose( rc->p_stat_file_out );
        if( h->i_frame >= rc->num_entries && b_regular_file )
            if( rename( rc->psz_stat_file_tmpname, h->param.rc.psz_stat_out ) != 0 )
            {
                x264_log( h, X264_LOG_ERROR, "failed to rename \"%s\" to \"%s\"\n",
                          rc->psz_stat_file_tmpname, h->param.rc.psz_stat_out );
            }
        x264_free( rc->psz_stat_file_tmpname );
    }
    if( rc->p_mbtree_stat_file_out )
    {
        b_regular_file = x264_is_regular_file( rc->p_mbtree_stat_file_out );
        fclose( rc->p_mbtree_stat_file_out );
        if( h->i_frame >= rc->num_entries && b_regular_file )
            if( rename( rc->psz_mbtree_stat_file_tmpname, rc->psz_mbtree_stat_file_name ) != 0 )
            {
                x264_log( h, X264_LOG_ERROR, "failed to rename \"%s\" to \"%s\"\n",
                          rc->psz_mbtree_stat_file_tmpname, rc->psz_mbtree_stat_file_name );
            }
        x264_free( rc->psz_mbtree_stat_file_tmpname );
        x264_free( rc->psz_mbtree_stat_file_name );
    }
    if( rc->p_mbtree_stat_file_in )
        fclose( rc->p_mbtree_stat_file_in );
    x264_free( rc->pred );
    x264_free( rc->pred_b_from_p );
    x264_free( rc->entry );
    x264_free( rc->qp_buffer[0] );
    x264_free( rc->qp_buffer[1] );
    if( rc->zones )
    {
        x264_free( rc->zones[0].param );
        for( int i = 1; i < rc->i_zones; i++ )
            if( rc->zones[i].param != rc->zones[0].param && rc->zones[i].param->param_free )
                rc->zones[i].param->param_free( rc->zones[i].param );
        x264_free( rc->zones );
    }
    x264_free( rc );
}

static void accum_p_qp_update( x264_t *h, float qp )
{
    x264_ratecontrol_t *rc = h->rc;
    rc->accum_p_qp   *= .95;
    rc->accum_p_norm *= .95;
    rc->accum_p_norm += 1;
    if( h->sh.i_type == SLICE_TYPE_I )
        rc->accum_p_qp += qp + rc->ip_offset;
    else
        rc->accum_p_qp += qp;
}
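
/* Illustration: accum_p_qp/accum_p_norm is an exponentially weighted mean of
 * recent P-frame QPs (decay 0.95 per frame); I-slices contribute qp+ip_offset
 * so they're measured on the P scale. At steady state the window weight is
 * 1/(1-0.95) == 20, so after a long run at QP 26 a single frame at QP 20
 * only moves the mean to 26 - (26-20)/20 == 25.7. */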

/* Before encoding a frame, choose a QP for it */
void x264_ratecontrol_start( x264_t *h, int i_force_qp, int overhead )
{
    x264_ratecontrol_t *rc = h->rc;
    ratecontrol_entry_t *rce = NULL;
    x264_zone_t *zone = get_zone( h, h->fenc->i_frame );
    float q;

    x264_emms();

    if( zone && (!rc->prev_zone || zone->param != rc->prev_zone->param) )
        x264_encoder_reconfig( h, zone->param );
    rc->prev_zone = zone;

    if( h->param.rc.b_stat_read )
    {
        int frame = h->fenc->i_frame;
        assert( frame >= 0 && frame < rc->num_entries );
        rce = h->rc->rce = &h->rc->entry[frame];

        if( h->sh.i_type == SLICE_TYPE_B
            && h->param.analyse.i_direct_mv_pred == X264_DIRECT_PRED_AUTO )
        {
            h->sh.b_direct_spatial_mv_pred = ( rce->direct_mode == 's' );
            h->mb.b_direct_auto_read = ( rce->direct_mode == 's' || rce->direct_mode == 't' );
        }
    }

    if( rc->b_vbv )
    {
        memset( h->fdec->i_row_bits, 0, h->mb.i_mb_height * sizeof(int) );
        memset( h->fdec->f_row_qp, 0, h->mb.i_mb_height * sizeof(float) );
        memset( h->fdec->f_row_qscale, 0, h->mb.i_mb_height * sizeof(float) );
        rc->row_pred = &rc->row_preds[h->sh.i_type];
        rc->buffer_rate = h->fenc->i_cpb_duration * rc->vbv_max_rate * h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale;
        update_vbv_plan( h, overhead );

        const x264_level_t *l = x264_levels;
        while( l->level_idc != 0 && l->level_idc != h->param.i_level_idc )
            l++;

        int mincr = l->mincr;

        if( h->param.b_bluray_compat )
            mincr = 4;

        /* Profiles above High don't require MinCR, so just set the maximum to a large value. */
        if( h->sps->i_profile_idc > PROFILE_HIGH )
            rc->frame_size_maximum = 1e9;
        else
        {
            /* The spec has a bizarre special case for the first frame. */
            if( h->i_frame == 0 )
            {
                //384 * ( Max( PicSizeInMbs, fR * MaxMBPS ) + MaxMBPS * ( tr( 0 ) - tr,n( 0 ) ) ) / MinCR
                double fr = 1. / 172;
                int pic_size_in_mbs = h->mb.i_mb_width * h->mb.i_mb_height;
                rc->frame_size_maximum = 384 * BIT_DEPTH * X264_MAX( pic_size_in_mbs, fr*l->mbps ) / mincr;
            }
            else
            {
                //384 * MaxMBPS * ( tr( n ) - tr( n - 1 ) ) / MinCR
                rc->frame_size_maximum = 384 * BIT_DEPTH * ((double)h->fenc->i_cpb_duration * h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale) * l->mbps / mincr;
            }
        }
    }

    if( h->sh.i_type != SLICE_TYPE_B )
        rc->bframes = h->fenc->i_bframes;

    if( rc->b_abr )
    {
        q = qscale2qp( rate_estimate_qscale( h ) );
    }
    else if( rc->b_2pass )
    {
        rce->new_qscale = rate_estimate_qscale( h );
        q = qscale2qp( rce->new_qscale );
    }
    else /* CQP */
    {
        if( h->sh.i_type == SLICE_TYPE_B && h->fdec->b_kept_as_ref )
            q = ( rc->qp_constant[ SLICE_TYPE_B ] + rc->qp_constant[ SLICE_TYPE_P ] ) / 2;
        else
            q = rc->qp_constant[ h->sh.i_type ];

        if( zone )
        {
            if( zone->b_force_qp )
                q += zone->i_qp - rc->qp_constant[SLICE_TYPE_P];
            else
                q -= 6*log2f( zone->f_bitrate_factor );
        }
    }
    if( i_force_qp != X264_QP_AUTO )
        q = i_force_qp - 1;

    q = x264_clip3f( q, h->param.rc.i_qp_min, h->param.rc.i_qp_max );

    rc->qpa_rc =
    rc->qpa_aq = 0;
    rc->qp = x264_clip3( q + 0.5f, 0, QP_MAX );
    h->fdec->f_qp_avg_rc =
    h->fdec->f_qp_avg_aq =
    rc->qpm = q;
    if( rce )
        rce->new_qp = rc->qp;

    accum_p_qp_update( h, rc->qpm );

    if( h->sh.i_type != SLICE_TYPE_B )
        rc->last_non_b_pict_type = h->sh.i_type;
}

static float predict_row_size( x264_t *h, int y, float qscale )
{
    /* average between two predictors:
     * absolute SATD, and scaled bit cost of the colocated row in the previous frame */
    x264_ratecontrol_t *rc = h->rc;
    float pred_s = predict_size( rc->row_pred[0], qscale, h->fdec->i_row_satd[y] );
    if( h->sh.i_type == SLICE_TYPE_I || qscale >= h->fref[0][0]->f_row_qscale[y] )
    {
        if( h->sh.i_type == SLICE_TYPE_P
            && h->fref[0][0]->i_type == h->fdec->i_type
            && h->fref[0][0]->f_row_qscale[y] > 0
            && h->fref[0][0]->i_row_satd[y] > 0
            && (abs(h->fref[0][0]->i_row_satd[y] - h->fdec->i_row_satd[y]) < h->fdec->i_row_satd[y]/2))
        {
            float pred_t = h->fref[0][0]->i_row_bits[y] * h->fdec->i_row_satd[y] / h->fref[0][0]->i_row_satd[y]
                         * h->fref[0][0]->f_row_qscale[y] / qscale;
            return (pred_s + pred_t) * 0.5f;
        }
        return pred_s;
    }
    /* Our QP is lower than the reference! */
    else
    {
        float pred_intra = predict_size( rc->row_pred[1], qscale, h->fdec->i_row_satds[0][0][y] );
        /* Sum: better to overestimate than underestimate by using only one of the two predictors. */
        return pred_intra + pred_s;
    }
}
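
/* Illustration of the scaled-bit-cost predictor above: if the colocated row
 * in the reference frame cost 1200 bits at SATD 4000 and qscale 4.0, and this
 * row has SATD 5000 at qscale 6.0, then
 *     pred_t = 1200 * 5000/4000 * 4.0/6.0 = 1000 bits,
 * i.e. bits are taken as proportional to SATD and inversely proportional to
 * qscale, then averaged with the predictor-based estimate pred_s. */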

static int row_bits_so_far( x264_t *h, int y )
{
    int bits = 0;
    for( int i = h->i_threadslice_start; i <= y; i++ )
        bits += h->fdec->i_row_bits[i];
    return bits;
}

static float predict_row_size_sum( x264_t *h, int y, float qp )
{
    float qscale = qp2qscale( qp );
    float bits = row_bits_so_far( h, y );
    for( int i = y+1; i < h->i_threadslice_end; i++ )
        bits += predict_row_size( h, i, qscale );
    return bits;
}

/* TODO:
 *  eliminate all use of qp in row ratecontrol: make it entirely qscale-based.
 *  make this function stop being needlessly O(N^2)
 *  update more often than once per row? */
void x264_ratecontrol_mb( x264_t *h, int bits )
{
    x264_ratecontrol_t *rc = h->rc;
    const int y = h->mb.i_mb_y;

    h->fdec->i_row_bits[y] += bits;
    rc->qpa_aq += h->mb.i_qp;

    if( h->mb.i_mb_x != h->mb.i_mb_width - 1 )
        return;

    x264_emms();
    rc->qpa_rc += rc->qpm * h->mb.i_mb_width;

    if( !rc->b_vbv )
        return;

    float qscale = qp2qscale( rc->qpm );
    h->fdec->f_row_qp[y] = rc->qpm;
    h->fdec->f_row_qscale[y] = qscale;

    update_predictor( rc->row_pred[0], qscale, h->fdec->i_row_satd[y], h->fdec->i_row_bits[y] );
    if( h->sh.i_type == SLICE_TYPE_P && rc->qpm < h->fref[0][0]->f_row_qp[y] )
        update_predictor( rc->row_pred[1], qscale, h->fdec->i_row_satds[0][0][y], h->fdec->i_row_bits[y] );

    /* update ratecontrol per-mbpair in MBAFF */
    if( SLICE_MBAFF && !(y&1) )
        return;

    /* tweak quality based on difference from predicted size */
    if( y < h->i_threadslice_end-1 )
    {
        float prev_row_qp = h->fdec->f_row_qp[y];
        float qp_min = X264_MAX( prev_row_qp - h->param.rc.i_qp_step, h->param.rc.i_qp_min );
        float qp_absolute_max = h->param.rc.i_qp_max;
        if( rc->rate_factor_max_increment )
            qp_absolute_max = X264_MIN( qp_absolute_max, rc->qp_novbv + rc->rate_factor_max_increment );
        float qp_max = X264_MIN( prev_row_qp + h->param.rc.i_qp_step, qp_absolute_max );
        float step_size = 0.5f;

        /* B-frames shouldn't use lower QP than their reference frames. */
        if( h->sh.i_type == SLICE_TYPE_B )
        {
            qp_min = X264_MAX( qp_min, X264_MAX( h->fref[0][0]->f_row_qp[y+1], h->fref[1][0]->f_row_qp[y+1] ) );
            rc->qpm = X264_MAX( rc->qpm, qp_min );
        }

        float buffer_left_planned = rc->buffer_fill - rc->frame_size_planned;
        float slice_size_planned = h->param.b_sliced_threads ? rc->slice_size_planned : rc->frame_size_planned;
        float max_frame_error = X264_MAX( 0.05f, 1.0f / h->mb.i_mb_height );
        float size_of_other_slices = 0;
        if( h->param.b_sliced_threads )
        {
            float size_of_other_slices_planned = 0;
            for( int i = 0; i < h->param.i_threads; i++ )
                if( h != h->thread[i] )
                {
                    size_of_other_slices += h->thread[i]->rc->frame_size_estimated;
                    size_of_other_slices_planned += h->thread[i]->rc->slice_size_planned;
                }
            float weight = rc->slice_size_planned / rc->frame_size_planned;
            size_of_other_slices = (size_of_other_slices - size_of_other_slices_planned) * weight + size_of_other_slices_planned;
        }

        /* More threads means we have to be more cautious in letting ratecontrol use up extra bits. */
        float rc_tol = buffer_left_planned / h->param.i_threads * rc->rate_tolerance;
        float b1 = predict_row_size_sum( h, y, rc->qpm ) + size_of_other_slices;

        /* Don't modify the row QPs until a sufficient amount of the frame's bits
         * have been processed, in case a flat area at the top of the frame was
         * measured inaccurately. */
        if( row_bits_so_far( h, y ) < 0.05f * slice_size_planned )
            return;

        if( h->sh.i_type != SLICE_TYPE_I )
            rc_tol *= 0.5f;

        if( !rc->b_vbv_min_rate )
            qp_min = X264_MAX( qp_min, rc->qp_novbv );
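
        /* Illustration: if frame_size_planned is 40000 bits with rc_tol 4000
         * and the current prediction b1 is 52000, the first loop below raises
         * qpm in 0.5-QP steps, re-predicting after each step, until the frame
         * is expected to fit; the second loop symmetrically lowers qpm to hand
         * bits back when the frame is predicted to undershoot. */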

        while( rc->qpm < qp_max
               && ((b1 > rc->frame_size_planned + rc_tol) ||
                   (rc->buffer_fill - b1 < buffer_left_planned * 0.5f) ||
                   (b1 > rc->frame_size_planned && rc->qpm < rc->qp_novbv)) )
        {
            rc->qpm += step_size;
            b1 = predict_row_size_sum( h, y, rc->qpm ) + size_of_other_slices;
        }

        while( rc->qpm > qp_min
               && (rc->qpm > h->fdec->f_row_qp[0] || rc->single_frame_vbv)
               && ((b1 < rc->frame_size_planned * 0.8f && rc->qpm <= prev_row_qp)
               || b1 < (rc->buffer_fill - rc->buffer_size + rc->buffer_rate) * 1.1f) )
        {
            rc->qpm -= step_size;
            b1 = predict_row_size_sum( h, y, rc->qpm ) + size_of_other_slices;
        }

        /* avoid VBV underflow or MinCR violation */
        while( (rc->qpm < qp_absolute_max)
               && ((rc->buffer_fill - b1 < rc->buffer_rate * max_frame_error) ||
                   (rc->frame_size_maximum - b1 < rc->frame_size_maximum * max_frame_error)))
        {
            rc->qpm += step_size;
            b1 = predict_row_size_sum( h, y, rc->qpm ) + size_of_other_slices;
        }

        h->rc->frame_size_estimated = b1 - size_of_other_slices;
    }
    else
        h->rc->frame_size_estimated = predict_row_size_sum( h, y, rc->qpm );
}

int x264_ratecontrol_qp( x264_t *h )
{
    x264_emms();
    return x264_clip3( h->rc->qpm + 0.5f, h->param.rc.i_qp_min, h->param.rc.i_qp_max );
}

int x264_ratecontrol_mb_qp( x264_t *h )
{
    x264_emms();
    float qp = h->rc->qpm;
    if( h->param.rc.i_aq_mode )
    {
        /* MB-tree currently doesn't adjust quantizers in unreferenced frames. */
        float qp_offset = h->fdec->b_kept_as_ref ? h->fenc->f_qp_offset[h->mb.i_mb_xy] : h->fenc->f_qp_offset_aq[h->mb.i_mb_xy];
        /* Scale AQ's effect towards zero in emergency mode. */
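        /* (Illustration, assuming an 8-bit build where QP_MAX_SPEC == 51 and
         * QP_MAX == 69: at qp 60 the offset is scaled by (69-60)/(69-51) == 0.5,
         * fading AQ out linearly until it vanishes at QP_MAX.) */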
        if( qp > QP_MAX_SPEC )
            qp_offset *= (QP_MAX - qp) / (QP_MAX - QP_MAX_SPEC);
        qp += qp_offset;
    }
    return x264_clip3( qp + 0.5f, h->param.rc.i_qp_min, h->param.rc.i_qp_max );
}

/* In 2pass, force the same frame types as in the 1st pass */
int x264_ratecontrol_slice_type( x264_t *h, int frame_num )
{
    x264_ratecontrol_t *rc = h->rc;
    if( h->param.rc.b_stat_read )
    {
        if( frame_num >= rc->num_entries )
        {
            /* We could try to initialize everything required for ABR and
             * adaptive B-frames, but that would be complicated.
             * So just calculate the average QP used so far. */
            h->param.rc.i_qp_constant = (h->stat.i_frame_count[SLICE_TYPE_P] == 0) ? 24 + QP_BD_OFFSET
                                      : 1 + h->stat.f_frame_qp[SLICE_TYPE_P] / h->stat.i_frame_count[SLICE_TYPE_P];
            rc->qp_constant[SLICE_TYPE_P] = x264_clip3( h->param.rc.i_qp_constant, 0, QP_MAX );
            rc->qp_constant[SLICE_TYPE_I] = x264_clip3( (int)( qscale2qp( qp2qscale( h->param.rc.i_qp_constant ) / fabs( h->param.rc.f_ip_factor )) + 0.5 ), 0, QP_MAX );
            rc->qp_constant[SLICE_TYPE_B] = x264_clip3( (int)( qscale2qp( qp2qscale( h->param.rc.i_qp_constant ) * fabs( h->param.rc.f_pb_factor )) + 0.5 ), 0, QP_MAX );

            x264_log( h, X264_LOG_ERROR, "2nd pass has more frames than 1st pass (%d)\n", rc->num_entries );
            x264_log( h, X264_LOG_ERROR, "continuing anyway, at constant QP=%d\n", h->param.rc.i_qp_constant );
            if( h->param.i_bframe_adaptive )
                x264_log( h, X264_LOG_ERROR, "disabling adaptive B-frames\n" );

            for( int i = 0; i < h->param.i_threads; i++ )
            {
                h->thread[i]->rc->b_abr = 0;
                h->thread[i]->rc->b_2pass = 0;
                h->thread[i]->param.rc.i_rc_method = X264_RC_CQP;
                h->thread[i]->param.rc.b_stat_read = 0;
                h->thread[i]->param.i_bframe_adaptive = 0;
                h->thread[i]->param.i_scenecut_threshold = 0;
                h->thread[i]->param.rc.b_mb_tree = 0;
                if( h->thread[i]->param.i_bframe > 1 )
                    h->thread[i]->param.i_bframe = 1;
            }
            return X264_TYPE_AUTO;
        }
        return rc->entry[frame_num].frame_type;
    }
    else
        return X264_TYPE_AUTO;
}

void x264_ratecontrol_set_weights( x264_t *h, x264_frame_t *frm )
{
    ratecontrol_entry_t *rce = &h->rc->entry[frm->i_frame];
    if( h->param.analyse.i_weighted_pred <= 0 )
        return;

    if( rce->i_weight_denom[0] >= 0 )
        SET_WEIGHT( frm->weight[0][0], 1, rce->weight[0][0], rce->i_weight_denom[0], rce->weight[0][1] );

    if( rce->i_weight_denom[1] >= 0 )
    {
        SET_WEIGHT( frm->weight[0][1], 1, rce->weight[1][0], rce->i_weight_denom[1], rce->weight[1][1] );
        SET_WEIGHT( frm->weight[0][2], 1, rce->weight[2][0], rce->i_weight_denom[1], rce->weight[2][1] );
    }
}

/* After encoding one frame, save stats and update ratecontrol state */
int x264_ratecontrol_end( x264_t *h, int bits, int *filler )
{
    x264_ratecontrol_t *rc = h->rc;
    const int *mbs = h->stat.frame.i_mb_count;

    x264_emms();

    h->stat.frame.i_mb_count_skip = mbs[P_SKIP] + mbs[B_SKIP];
    h->stat.frame.i_mb_count_i = mbs[I_16x16] + mbs[I_8x8] + mbs[I_4x4];
    h->stat.frame.i_mb_count_p = mbs[P_L0] + mbs[P_8x8];
    for( int i = B_DIRECT; i < B_8x8; i++ )
        h->stat.frame.i_mb_count_p += mbs[i];

    h->fdec->f_qp_avg_rc = rc->qpa_rc /= h->mb.i_mb_count;
    h->fdec->f_qp_avg_aq = (float)rc->qpa_aq / h->mb.i_mb_count;

    if( h->param.rc.b_stat_write )
    {
        char c_type = h->sh.i_type==SLICE_TYPE_I ? (h->fenc->i_poc==0 ? 'I' : 'i')
                    : h->sh.i_type==SLICE_TYPE_P ? 'P'
                    : h->fenc->b_kept_as_ref ? 'B' : 'b';
        int dir_frame = h->stat.frame.i_direct_score[1] - h->stat.frame.i_direct_score[0];
        int dir_avg = h->stat.i_direct_score[1] - h->stat.i_direct_score[0];
        char c_direct = h->mb.b_direct_auto_write ?
                        ( dir_frame>0 ? 's' : dir_frame<0 ? 't' :
                          dir_avg>0 ? 's' : dir_avg<0 ? 't' : '-' )
                        : '-';
        if( fprintf( rc->p_stat_file_out,
                 "in:%d out:%d type:%c dur:%"PRId64" cpbdur:%"PRId64" q:%.2f tex:%d mv:%d misc:%d imb:%d pmb:%d smb:%d d:%c ref:",
                 h->fenc->i_frame, h->i_frame,
                 c_type, h->fenc->i_duration,
                 h->fenc->i_cpb_duration, rc->qpa_rc,
                 h->stat.frame.i_tex_bits,
                 h->stat.frame.i_mv_bits,
                 h->stat.frame.i_misc_bits,
                 h->stat.frame.i_mb_count_i,
                 h->stat.frame.i_mb_count_p,
                 h->stat.frame.i_mb_count_skip,
                 c_direct) < 0 )
            goto fail;

        /* Only write information for reference reordering once. */
        int use_old_stats = h->param.rc.b_stat_read && rc->rce->refs > 1;
        for( int i = 0; i < (use_old_stats ? rc->rce->refs : h->i_ref[0]); i++ )
        {
            int refcount = use_old_stats         ? rc->rce->refcount[i]
                         : PARAM_INTERLACED      ? h->stat.frame.i_mb_count_ref[0][i*2]
                                                 + h->stat.frame.i_mb_count_ref[0][i*2+1]
                         :                         h->stat.frame.i_mb_count_ref[0][i];
            if( fprintf( rc->p_stat_file_out, "%d ", refcount ) < 0 )
                goto fail;
        }

        if( h->param.analyse.i_weighted_pred >= X264_WEIGHTP_SIMPLE && h->sh.weight[0][0].weightfn )
        {
            if( fprintf( rc->p_stat_file_out, "w:%d,%d,%d",
                         h->sh.weight[0][0].i_denom, h->sh.weight[0][0].i_scale, h->sh.weight[0][0].i_offset ) < 0 )
                goto fail;
            if( h->sh.weight[0][1].weightfn || h->sh.weight[0][2].weightfn )
            {
                if( fprintf( rc->p_stat_file_out, ",%d,%d,%d,%d,%d ",
                             h->sh.weight[0][1].i_denom, h->sh.weight[0][1].i_scale, h->sh.weight[0][1].i_offset,
                             h->sh.weight[0][2].i_scale, h->sh.weight[0][2].i_offset ) < 0 )
                    goto fail;
            }
            else if( fprintf( rc->p_stat_file_out, " " ) < 0 )
                goto fail;
        }

        if( fprintf( rc->p_stat_file_out, ";\n") < 0 )
            goto fail;

        /* Don't re-write the data in multi-pass mode. */
        if( h->param.rc.b_mb_tree && h->fenc->b_kept_as_ref && !h->param.rc.b_stat_read )
        {
            uint8_t i_type = h->sh.i_type;
            /* Values are stored as big-endian FIX8.8 */
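            /* (e.g. a qp offset of +2.25 becomes 2.25*256 == 576 == 0x0240,
             * written as bytes 0x02 0x40 regardless of host endianness) */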
            for( int i = 0; i < h->mb.i_mb_count; i++ )
                rc->qp_buffer[0][i] = endian_fix16( h->fenc->f_qp_offset[i]*256.0 );
            if( fwrite( &i_type, 1, 1, rc->p_mbtree_stat_file_out ) < 1 )
                goto fail;
            if( fwrite( rc->qp_buffer[0], sizeof(uint16_t), h->mb.i_mb_count, rc->p_mbtree_stat_file_out ) < h->mb.i_mb_count )
                goto fail;
        }
    }

    if( rc->b_abr )
    {
        if( h->sh.i_type != SLICE_TYPE_B )
            rc->cplxr_sum += bits * qp2qscale( rc->qpa_rc ) / rc->last_rceq;
        else
        {
            /* Depends on the fact that a B-frame's QP is an offset from the
             * following P-frame's. Not perfectly accurate with B-refs, but
             * good enough. */
            rc->cplxr_sum += bits * qp2qscale( rc->qpa_rc ) / (rc->last_rceq * fabs( h->param.rc.f_pb_factor ));
        }
        rc->cplxr_sum *= rc->cbr_decay;
        rc->wanted_bits_window += h->fenc->f_duration * rc->bitrate;
        rc->wanted_bits_window *= rc->cbr_decay;
    }

    if( rc->b_2pass )
        rc->expected_bits_sum += qscale2bits( rc->rce, qp2qscale( rc->rce->new_qp ) );

    if( h->mb.b_variable_qp )
    {
        if( h->sh.i_type == SLICE_TYPE_B )
        {
            rc->bframe_bits += bits;
            if( h->fenc->b_last_minigop_bframe )
            {
                update_predictor( rc->pred_b_from_p, qp2qscale( rc->qpa_rc ),
                                  h->fref[1][h->i_ref[1]-1]->i_satd, rc->bframe_bits / rc->bframes );
                rc->bframe_bits = 0;
            }
        }
    }

    *filler = update_vbv( h, bits );
    rc->filler_bits_sum += *filler * 8;

    if( h->sps->vui.b_nal_hrd_parameters_present )
    {
        if( h->fenc->i_frame == 0 )
        {
            // the first access unit initialises the HRD
            h->fenc->hrd_timing.cpb_initial_arrival_time = 0;
            rc->initial_cpb_removal_delay = h->initial_cpb_removal_delay;
            rc->initial_cpb_removal_delay_offset = h->initial_cpb_removal_delay_offset;
            h->fenc->hrd_timing.cpb_removal_time = rc->nrt_first_access_unit = (double)rc->initial_cpb_removal_delay / 90000;
        }
        else
        {
            h->fenc->hrd_timing.cpb_removal_time = rc->nrt_first_access_unit + (double)(h->fenc->i_cpb_delay - h->i_cpb_delay_pir_offset) *
                                                   h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale;

            double cpb_earliest_arrival_time = h->fenc->hrd_timing.cpb_removal_time - (double)rc->initial_cpb_removal_delay / 90000;
            if( h->fenc->b_keyframe )
            {
                rc->nrt_first_access_unit = h->fenc->hrd_timing.cpb_removal_time;
                rc->initial_cpb_removal_delay = h->initial_cpb_removal_delay;
                rc->initial_cpb_removal_delay_offset = h->initial_cpb_removal_delay_offset;
            }
            else
                cpb_earliest_arrival_time -= (double)rc->initial_cpb_removal_delay_offset / 90000;

            if( h->sps->vui.hrd.b_cbr_hrd )
                h->fenc->hrd_timing.cpb_initial_arrival_time = rc->previous_cpb_final_arrival_time;
            else
                h->fenc->hrd_timing.cpb_initial_arrival_time = X264_MAX( rc->previous_cpb_final_arrival_time, cpb_earliest_arrival_time );
        }
        int filler_bits = *filler ? X264_MAX( (FILLER_OVERHEAD - h->param.b_annexb), *filler )*8 : 0;
        // Equation C-6
        h->fenc->hrd_timing.cpb_final_arrival_time = rc->previous_cpb_final_arrival_time = h->fenc->hrd_timing.cpb_initial_arrival_time +
                                                     (double)(bits + filler_bits) / h->sps->vui.hrd.i_bit_rate_unscaled;

        h->fenc->hrd_timing.dpb_output_time = (double)h->fenc->i_dpb_output_delay * h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale +
                                              h->fenc->hrd_timing.cpb_removal_time;
    }

    return 0;
fail:
    x264_log( h, X264_LOG_ERROR, "ratecontrol_end: stats file could not be written to\n" );
    return -1;
}

/****************************************************************************
 * 2 pass functions
 ***************************************************************************/

/**
 * modify the bitrate curve from pass1 for one frame
 */
static double get_qscale(x264_t *h, ratecontrol_entry_t *rce, double rate_factor, int frame_num)
{
    x264_ratecontrol_t *rcc= h->rc;
    x264_zone_t *zone = get_zone( h, frame_num );
    double q;
    if( h->param.rc.b_mb_tree )
    {
        double timescale = (double)h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale;
        q = pow( BASE_FRAME_DURATION / CLIP_DURATION(rce->i_duration * timescale), 1 - h->param.rc.f_qcompress );
    }
    else
        q = pow( rce->blurred_complexity, 1 - rcc->qcompress );

    // avoid NaN's in the rc_eq
    if( !isfinite(q) || rce->tex_bits + rce->mv_bits == 0 )
        q = rcc->last_qscale_for[rce->pict_type];
    else
    {
        rcc->last_rceq = q;
        q /= rate_factor;
        rcc->last_qscale = q;
    }

    if( zone )
    {
        if( zone->b_force_qp )
            q = qp2qscale( zone->i_qp );
        else
            q /= zone->f_bitrate_factor;
    }

    return q;
}
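
/* Illustration of the non-mbtree path above: with qcompress == 0.6 and
 * blurred_complexity == 1000, the pre-scale quantizer is 1000^0.4 ~= 15.9;
 * rate_factor then divides it, so a larger rate factor (more bits available)
 * lowers qscale uniformly. qcompress == 1 would give a flat quantizer
 * (constant quality), while qcompress == 0 makes q track complexity directly
 * (near-constant frame size). */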

static double get_diff_limited_q(x264_t *h, ratecontrol_entry_t *rce, double q, int frame_num)
{
    x264_ratecontrol_t *rcc = h->rc;
    const int pict_type = rce->pict_type;
    x264_zone_t *zone = get_zone( h, frame_num );

    // force I/B quants as a function of P quants
    const double last_p_q    = rcc->last_qscale_for[SLICE_TYPE_P];
    const double last_non_b_q= rcc->last_qscale_for[rcc->last_non_b_pict_type];
    if( pict_type == SLICE_TYPE_I )
    {
        double iq = q;
        double pq = qp2qscale( rcc->accum_p_qp / rcc->accum_p_norm );
        double ip_factor = fabs( h->param.rc.f_ip_factor );
        /* don't apply ip_factor if the following frame is also I */
        if( rcc->accum_p_norm <= 0 )
            q = iq;
        else if( h->param.rc.f_ip_factor < 0 )
            q = iq / ip_factor;
        else if( rcc->accum_p_norm >= 1 )
            q = pq / ip_factor;
        else
            q = rcc->accum_p_norm * pq / ip_factor + (1 - rcc->accum_p_norm) * iq;
    }
    else if( pict_type == SLICE_TYPE_B )
    {
        if( h->param.rc.f_pb_factor > 0 )
            q = last_non_b_q;
        if( !rce->kept_as_ref )
            q *= fabs( h->param.rc.f_pb_factor );
    }
    else if( pict_type == SLICE_TYPE_P
             && rcc->last_non_b_pict_type == SLICE_TYPE_P
             && rce->tex_bits == 0 )
    {
        q = last_p_q;
    }

    /* last qscale / qdiff stuff */
    if( rcc->last_non_b_pict_type == pict_type &&
        (pict_type!=SLICE_TYPE_I || rcc->last_accum_p_norm < 1) )
    {
        double last_q = rcc->last_qscale_for[pict_type];
        double max_qscale = last_q * rcc->lstep;
        double min_qscale = last_q / rcc->lstep;

        if     ( q > max_qscale ) q = max_qscale;
        else if( q < min_qscale ) q = min_qscale;
    }

    rcc->last_qscale_for[pict_type] = q;
    if( pict_type != SLICE_TYPE_B )
        rcc->last_non_b_pict_type = pict_type;
    if( pict_type == SLICE_TYPE_I )
    {
        rcc->last_accum_p_norm = rcc->accum_p_norm;
        rcc->accum_p_norm = 0;
        rcc->accum_p_qp = 0;
    }
    if( pict_type == SLICE_TYPE_P )
    {
        float mask = 1 - pow( (float)rce->i_count / rcc->nmb, 2 );
        rcc->accum_p_qp   = mask * (qscale2qp( q ) + rcc->accum_p_qp);
        rcc->accum_p_norm = mask * (1 + rcc->accum_p_norm);
    }

    if( zone )
    {
        if( zone->b_force_qp )
            q = qp2qscale( zone->i_qp );
        else
            q /= zone->f_bitrate_factor;
    }

    return q;
}

static float predict_size( predictor_t *p, float q, float var )
{
    return (p->coeff*var + p->offset) / (q*p->count);
}
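
/* Illustration: count normalizes the decayed accumulators, so with
 * coeff/count == 0.25 and offset == 0, a row (or frame) with SATD 4000 at
 * qscale 4.0 is predicted to cost 0.25 * 4000 / 4.0 == 250 bits. */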

static void update_predictor( predictor_t *p, float q, float var, float bits )
{
    float range = 1.5;
    if( var < 10 )
        return;
    float old_coeff = p->coeff / p->count;
    float new_coeff = bits*q / var;
    float new_coeff_clipped = x264_clip3f( new_coeff, old_coeff/range, old_coeff*range );
    float new_offset = bits*q - new_coeff_clipped * var;
    if( new_offset >= 0 )
        new_coeff = new_coeff_clipped;
    else
        new_offset = 0;
    p->count  *= p->decay;
    p->coeff  *= p->decay;
    p->offset *= p->decay;
    p->count  ++;
    p->coeff  += new_coeff;
    p->offset += new_offset;
}
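
/* Illustration: each new sample's coefficient bits*q/var is clipped to within
 * 1.5x of the current average coeff/count before being accumulated, and all
 * three accumulators decay by p->decay first, so a single outlier sample can
 * only shift the model gradually. */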

// update VBV after encoding a frame
static int update_vbv( x264_t *h, int bits )
{
    int filler = 0;
    int bitrate = h->sps->vui.hrd.i_bit_rate_unscaled;
    x264_ratecontrol_t *rcc = h->rc;
    x264_ratecontrol_t *rct = h->thread[0]->rc;
    uint64_t buffer_size = (uint64_t)h->sps->vui.hrd.i_cpb_size_unscaled * h->sps->vui.i_time_scale;

    if( rcc->last_satd >= h->mb.i_mb_count )
        update_predictor( &rct->pred[h->sh.i_type], qp2qscale( rcc->qpa_rc ), rcc->last_satd, bits );

    if( !rcc->b_vbv )
        return filler;

    rct->buffer_fill_final -= (uint64_t)bits * h->sps->vui.i_time_scale;

    if( rct->buffer_fill_final < 0 )
        x264_log( h, X264_LOG_WARNING, "VBV underflow (frame %d, %.0f bits)\n", h->i_frame, (double)rct->buffer_fill_final / h->sps->vui.i_time_scale );
    rct->buffer_fill_final = X264_MAX( rct->buffer_fill_final, 0 );
    rct->buffer_fill_final += (uint64_t)bitrate * h->sps->vui.i_num_units_in_tick * h->fenc->i_cpb_duration;

    if( h->sps->vui.hrd.b_cbr_hrd && rct->buffer_fill_final > buffer_size )
    {
        int64_t scale = (int64_t)h->sps->vui.i_time_scale * 8;
        filler = (rct->buffer_fill_final - buffer_size + scale - 1) / scale;
        bits = X264_MAX( (FILLER_OVERHEAD - h->param.b_annexb), filler ) * 8;
        rct->buffer_fill_final -= (uint64_t)bits * h->sps->vui.i_time_scale;
    }
    else
        rct->buffer_fill_final = X264_MIN( rct->buffer_fill_final, buffer_size );

    return filler;
}
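
/* Illustration: buffer_fill_final is kept in units of bits * time_scale so
 * that the per-frame updates above stay exact for rational frame durations;
 * dividing the CBR overflow by scale == time_scale*8 therefore yields whole
 * filler bytes, rounded up. */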

void x264_hrd_fullness( x264_t *h )
{
    x264_ratecontrol_t *rct = h->thread[0]->rc;
    uint64_t denom = (uint64_t)h->sps->vui.hrd.i_bit_rate_unscaled * h->sps->vui.i_time_scale / rct->hrd_multiply_denom;
    uint64_t cpb_state = rct->buffer_fill_final;
    uint64_t cpb_size = (uint64_t)h->sps->vui.hrd.i_cpb_size_unscaled * h->sps->vui.i_time_scale;
    uint64_t multiply_factor = 180000 / rct->hrd_multiply_denom;

    if( rct->buffer_fill_final < 0 || rct->buffer_fill_final > cpb_size )
    {
        x264_log( h, X264_LOG_WARNING, "CPB %s: %.0lf bits in a %.0lf-bit buffer\n",
                  rct->buffer_fill_final < 0 ? "underflow" : "overflow", (float)rct->buffer_fill_final/denom, (float)cpb_size/denom );
    }

    h->initial_cpb_removal_delay = (multiply_factor * cpb_state + denom) / (2*denom);
    h->initial_cpb_removal_delay_offset = (multiply_factor * cpb_size + denom) / (2*denom) - h->initial_cpb_removal_delay;
}

// provisionally update VBV according to the planned size of all frames currently in progress
static void update_vbv_plan( x264_t *h, int overhead )
{
    x264_ratecontrol_t *rcc = h->rc;
    rcc->buffer_fill = h->thread[0]->rc->buffer_fill_final / h->sps->vui.i_time_scale;
    if( h->i_thread_frames > 1 )
    {
        int j = h->rc - h->thread[0]->rc;
        for( int i = 1; i < h->i_thread_frames; i++ )
        {
            x264_t *t = h->thread[ (j+i)%h->i_thread_frames ];
            double bits = t->rc->frame_size_planned;
            if( !t->b_thread_active )
                continue;
            bits = X264_MAX(bits, t->rc->frame_size_estimated);
            rcc->buffer_fill -= bits;
            rcc->buffer_fill = X264_MAX( rcc->buffer_fill, 0 );
            rcc->buffer_fill += t->rc->buffer_rate;
            rcc->buffer_fill = X264_MIN( rcc->buffer_fill, rcc->buffer_size );
        }
    }
    rcc->buffer_fill = X264_MIN( rcc->buffer_fill, rcc->buffer_size );
    rcc->buffer_fill -= overhead;
}

// apply VBV constraints and clip qscale to between lmin and lmax
static double clip_qscale( x264_t *h, int pict_type, double q )
{
    x264_ratecontrol_t *rcc = h->rc;
    double lmin = rcc->lmin[pict_type];
    double lmax = rcc->lmax[pict_type];
    if( rcc->rate_factor_max_increment )
        lmax = X264_MIN( lmax, qp2qscale( rcc->qp_novbv + rcc->rate_factor_max_increment ) );
    double q0 = q;

    /* B-frames are not directly subject to VBV,
     * since they are controlled by the P-frames' QPs. */

    if( rcc->b_vbv && rcc->last_satd > 0 )
    {
        /* Lookahead VBV: raise the quantizer as necessary such that no frames in
         * the lookahead overflow and such that the buffer is in a reasonable state
         * by the end of the lookahead. */
        if( h->param.rc.i_lookahead )
        {
            int terminate = 0;

            /* Avoid an infinite loop. */
            for( int iterations = 0; iterations < 1000 && terminate != 3; iterations++ )
            {
                double frame_q[3];
                double cur_bits = predict_size( &rcc->pred[h->sh.i_type], q, rcc->last_satd );
                double buffer_fill_cur = rcc->buffer_fill - cur_bits;
                double target_fill;
                double total_duration = 0;
                frame_q[0] = h->sh.i_type == SLICE_TYPE_I ? q * h->param.rc.f_ip_factor : q;
                frame_q[1] = frame_q[0] * h->param.rc.f_pb_factor;
                frame_q[2] = frame_q[0] / h->param.rc.f_ip_factor;

                /* Loop over the planned future frames. */
                for( int j = 0; buffer_fill_cur >= 0 && buffer_fill_cur <= rcc->buffer_size; j++ )
                {
                    total_duration += h->fenc->f_planned_cpb_duration[j];
                    buffer_fill_cur += rcc->vbv_max_rate * h->fenc->f_planned_cpb_duration[j];
                    int i_type = h->fenc->i_planned_type[j];
                    int i_satd = h->fenc->i_planned_satd[j];
                    if( i_type == X264_TYPE_AUTO )
                        break;
                    i_type = IS_X264_TYPE_I( i_type ) ? SLICE_TYPE_I : IS_X264_TYPE_B( i_type ) ? SLICE_TYPE_B : SLICE_TYPE_P;
                    cur_bits = predict_size( &rcc->pred[i_type], frame_q[i_type], i_satd );
                    buffer_fill_cur -= cur_bits;
                }
                /* Try to get the buffer at least 50% filled, but don't set an impossible goal. */
                target_fill = X264_MIN( rcc->buffer_fill + total_duration * rcc->vbv_max_rate * 0.5, rcc->buffer_size * 0.5 );
                if( buffer_fill_cur < target_fill )
                {
                    q *= 1.01;
                    terminate |= 1;
                    continue;
                }
                /* Try to get the buffer no more than 80% filled, but don't set an impossible goal. */
                target_fill = x264_clip3f( rcc->buffer_fill - total_duration * rcc->vbv_max_rate * 0.5, rcc->buffer_size * 0.8, rcc->buffer_size );
                if( rcc->b_vbv_min_rate && buffer_fill_cur > target_fill )
                {
                    q /= 1.01;
                    terminate |= 2;
                    continue;
                }
                break;
            }
        }
        /* Fallback to old purely-reactive algorithm: no lookahead. */
        else
        {
            if( ( pict_type == SLICE_TYPE_P ||
                ( pict_type == SLICE_TYPE_I && rcc->last_non_b_pict_type == SLICE_TYPE_I ) ) &&
                rcc->buffer_fill/rcc->buffer_size < 0.5 )
            {
                q /= x264_clip3f( 2.0*rcc->buffer_fill/rcc->buffer_size, 0.5, 1.0 );
            }

            /* Now a hard threshold to make sure the frame fits in VBV.
             * This one is mostly for I-frames. */
            double bits = predict_size( &rcc->pred[h->sh.i_type], q, rcc->last_satd );
            double qf = 1.0;
            /* For small VBVs, allow the frame to use up the entire VBV. */
            double max_fill_factor = h->param.rc.i_vbv_buffer_size >= 5*h->param.rc.i_vbv_max_bitrate / rcc->fps ? 2 : 1;
            /* For single-frame VBVs, request that the frame use up the entire VBV. */
            double min_fill_factor = rcc->single_frame_vbv ? 1 : 2;

            if( bits > rcc->buffer_fill/max_fill_factor )
                qf = x264_clip3f( rcc->buffer_fill/(max_fill_factor*bits), 0.2, 1.0 );
            q /= qf;
            bits *= qf;
            if( bits < rcc->buffer_rate/min_fill_factor )
                q *= bits*min_fill_factor/rcc->buffer_rate;
            q = X264_MAX( q0, q );
        }

        /* Apply MinCR restrictions */
        double bits = predict_size( &rcc->pred[h->sh.i_type], q, rcc->last_satd );
        if( bits > rcc->frame_size_maximum )
            q *= bits / rcc->frame_size_maximum;
        bits = predict_size( &rcc->pred[h->sh.i_type], q, rcc->last_satd );

        /* Check B-frame complexity, and use up any bits that would
         * overflow before the next P-frame. */
        if( h->sh.i_type == SLICE_TYPE_P && !rcc->single_frame_vbv )
        {
            int nb = rcc->bframes;
            double pbbits = bits;
            double bbits = predict_size( rcc->pred_b_from_p, q * h->param.rc.f_pb_factor, rcc->last_satd );
            double space;
            double bframe_cpb_duration = 0;
            double minigop_cpb_duration;
            for( int i = 0; i < nb; i++ )
                bframe_cpb_duration += h->fenc->f_planned_cpb_duration[1+i];

            if( bbits * nb > bframe_cpb_duration * rcc->vbv_max_rate )
                nb = 0;
            pbbits += nb * bbits;

            minigop_cpb_duration = bframe_cpb_duration + h->fenc->f_planned_cpb_duration[0];
            space = rcc->buffer_fill + minigop_cpb_duration*rcc->vbv_max_rate - rcc->buffer_size;
            if( pbbits < space )
            {
                q *= X264_MAX( pbbits / space, bits / (0.5 * rcc->buffer_size) );
            }
            q = X264_MAX( q0/2, q );
        }

        if( !rcc->b_vbv_min_rate )
            q = X264_MAX( q0, q );
    }

    if( lmin==lmax )
        return lmin;
    else if( rcc->b_2pass )
    {
        double min2 = log( lmin );
        double max2 = log( lmax );
        q = (log(q) - min2)/(max2-min2) - 0.5;
        q = 1.0/(1.0 + exp( -4*q ));
        q = q*(max2-min2) + min2;
        return exp( q );
    }
    else
        return x264_clip3f( q, lmin, lmax );
}
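
/* Illustration of the 2-pass soft clip above: in log-qscale space, the
 * logistic maps the geometric mean sqrt(lmin*lmax) to itself and pulls values
 * at the limits smoothly inward (lmin itself lands about 12% of the way into
 * the log range) instead of clamping hard at lmin/lmax. */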

// update qscale for 1 frame based on actual bits used so far
static float rate_estimate_qscale( x264_t *h )
{
    float q;
    x264_ratecontrol_t *rcc = h->rc;
    ratecontrol_entry_t UNINIT(rce);
    int pict_type = h->sh.i_type;
    int64_t total_bits = 8*(h->stat.i_frame_size[SLICE_TYPE_I]
                          + h->stat.i_frame_size[SLICE_TYPE_P]
                          + h->stat.i_frame_size[SLICE_TYPE_B])
                       - rcc->filler_bits_sum;

    if( rcc->b_2pass )
    {
        rce = *rcc->rce;
        if( pict_type != rce.pict_type )
        {
            x264_log( h, X264_LOG_ERROR, "slice=%c but 2pass stats say %c\n",
                      slice_type_to_char[pict_type], slice_type_to_char[rce.pict_type] );
        }
    }

    if( pict_type == SLICE_TYPE_B )
    {
        /* B-frames don't have independent ratecontrol, but rather get the
         * average QP of the two adjacent P-frames + an offset */
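        /* (Illustration: with references 2 and 4 POCs away at QPs 24 and 26,
         * q = (24*4 + 26*2) / (2+4) ~= 24.7 before pb_offset; the temporally
         * closer reference is weighted more heavily.) */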

        int i0 = IS_X264_TYPE_I(h->fref_nearest[0]->i_type);
        int i1 = IS_X264_TYPE_I(h->fref_nearest[1]->i_type);
        int dt0 = abs(h->fenc->i_poc - h->fref_nearest[0]->i_poc);
        int dt1 = abs(h->fenc->i_poc - h->fref_nearest[1]->i_poc);
        float q0 = h->fref_nearest[0]->f_qp_avg_rc;
        float q1 = h->fref_nearest[1]->f_qp_avg_rc;

        if( h->fref_nearest[0]->i_type == X264_TYPE_BREF )
            q0 -= rcc->pb_offset/2;
        if( h->fref_nearest[1]->i_type == X264_TYPE_BREF )
            q1 -= rcc->pb_offset/2;

        if( i0 && i1 )
            q = (q0 + q1) / 2 + rcc->ip_offset;
        else if( i0 )
            q = q1;
        else if( i1 )
            q = q0;
        else
            q = (q0*dt1 + q1*dt0) / (dt0 + dt1);

        if( h->fenc->b_kept_as_ref )
            q += rcc->pb_offset/2;
        else
            q += rcc->pb_offset;

        if( rcc->b_2pass && rcc->b_vbv )
            rcc->frame_size_planned = qscale2bits( &rce, qp2qscale( q ) );
        else
            rcc->frame_size_planned = predict_size( rcc->pred_b_from_p, qp2qscale( q ), h->fref[1][h->i_ref[1]-1]->i_satd );
        /* Limit planned size by MinCR */
        if( rcc->b_vbv )
            rcc->frame_size_planned = X264_MIN( rcc->frame_size_planned, rcc->frame_size_maximum );
        h->rc->frame_size_estimated = rcc->frame_size_planned;

        /* For row SATDs */
        if( rcc->b_vbv )
            rcc->last_satd = x264_rc_analyse_slice( h );
        rcc->qp_novbv = q;
        return qp2qscale( q );
    }
    else
    {
        double abr_buffer = 2 * rcc->rate_tolerance * rcc->bitrate;

        if( rcc->b_2pass )
        {
            double lmin = rcc->lmin[pict_type];
            double lmax = rcc->lmax[pict_type];
            int64_t diff;
            int64_t predicted_bits = total_bits;

            if( rcc->b_vbv )
            {
                if( h->i_thread_frames > 1 )
                {
                    int j = h->rc - h->thread[0]->rc;
                    for( int i = 1; i < h->i_thread_frames; i++ )
                    {
                        x264_t *t = h->thread[ (j+i)%h->i_thread_frames ];
                        double bits = t->rc->frame_size_planned;
                        if( !t->b_thread_active )
                            continue;
                        bits = X264_MAX(bits, t->rc->frame_size_estimated);
                        predicted_bits += (int64_t)bits;
                    }
                }
            }
            else
            {
                if( h->i_frame < h->i_thread_frames )
                    predicted_bits += (int64_t)h->i_frame * rcc->bitrate / rcc->fps;
                else
                    predicted_bits += (int64_t)(h->i_thread_frames - 1) * rcc->bitrate / rcc->fps;
            }

            /* Adjust ABR buffer based on distance to the end of the video. */
            if( rcc->num_entries > h->i_frame )
            {
                double final_bits = rcc->entry[rcc->num_entries-1].expected_bits;
                double video_pos = rce.expected_bits / final_bits;
                double scale_factor = sqrt( (1 - video_pos) * rcc->num_entries );
                abr_buffer *= 0.5 * X264_MAX( scale_factor, 0.5 );
            }

            diff = predicted_bits - (int64_t)rce.expected_bits;
            q = rce.new_qscale;
            q /= x264_clip3f((double)(abr_buffer - diff) / abr_buffer, .5, 2);
            if( ((h->i_frame + 1 - h->i_thread_frames) >= rcc->fps) &&
                (rcc->expected_bits_sum > 0))
            {
                /* Adjust quant based on the difference between
                 * achieved and expected bitrate so far */
                double cur_time = (double)h->i_frame / rcc->num_entries;
                double w = x264_clip3f( cur_time*100, 0.0, 1.0 );
                q *= pow( (double)total_bits / rcc->expected_bits_sum, w );
            }
            if( rcc->b_vbv )
            {
                /* Do not overflow vbv */
                double expected_size = qscale2bits( &rce, q );
                double expected_vbv = rcc->buffer_fill + rcc->buffer_rate - expected_size;
                double expected_fullness = rce.expected_vbv / rcc->buffer_size;
                double qmax = q*(2 - expected_fullness);
                double size_constraint = 1 + expected_fullness;
                qmax = X264_MAX( qmax, rce.new_qscale );
                if( expected_fullness < .05 )
                    qmax = lmax;
                qmax = X264_MIN(qmax, lmax);
                while( ((expected_vbv < rce.expected_vbv/size_constraint) && (q < qmax)) ||
                        ((expected_vbv < 0) && (q < lmax)))
                {
                    q *= 1.05;
                    expected_size = qscale2bits(&rce, q);
                    expected_vbv = rcc->buffer_fill + rcc->buffer_rate - expected_size;
                }
                rcc->last_satd = x264_rc_analyse_slice( h );
            }
            q = x264_clip3f( q, lmin, lmax );
        }
        else /* 1pass ABR */
        {
            /* Calculate the quantizer which would have produced the desired
             * average bitrate if it had been applied to all frames so far.
             * Then modulate that quant based on the current frame's complexity
             * relative to the average complexity so far (using the 2pass RCEQ).
             * Then bias the quant up or down if total size so far was far from
             * the target.
             * Result: Depending on the value of rate_tolerance, there is a
             * tradeoff between quality and bitrate precision. But at large
             * tolerances, the bit distribution approaches that of 2pass. */
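            /* (Worked example: with rate_tolerance 1.0 at 1000 kbit/s, after
             * 10 seconds wanted_bits == 10 Mbit; if 11 Mbit were actually
             * produced, abr_buffer grows to 2*1000k*sqrt(10) ~= 6.3 Mbit and
             * overflow == clip(1 + 1M/6.3M, 0.5, 2) ~= 1.16, raising qscale by
             * about 16% until the stream drifts back toward target.) */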

            double wanted_bits, overflow = 1;

            rcc->last_satd = x264_rc_analyse_slice( h );
            rcc->short_term_cplxsum *= 0.5;
            rcc->short_term_cplxcount *= 0.5;
            rcc->short_term_cplxsum += rcc->last_satd / (CLIP_DURATION(h->fenc->f_duration) / BASE_FRAME_DURATION);
            rcc->short_term_cplxcount ++;

            rce.tex_bits = rcc->last_satd;
            rce.blurred_complexity = rcc->short_term_cplxsum / rcc->short_term_cplxcount;
            rce.mv_bits = 0;
            rce.p_count = rcc->nmb;
            rce.i_count = 0;
            rce.s_count = 0;
            rce.qscale = 1;
            rce.pict_type = pict_type;
            rce.i_duration = h->fenc->i_duration;

            if( h->param.rc.i_rc_method == X264_RC_CRF )
            {
                q = get_qscale( h, &rce, rcc->rate_factor_constant, h->fenc->i_frame );
            }
            else
            {
                q = get_qscale( h, &rce, rcc->wanted_bits_window / rcc->cplxr_sum, h->fenc->i_frame );

                /* ABR code can potentially be counterproductive in CBR, so just don't bother.
                 * Don't run it if the frame complexity is zero either. */
                if( !rcc->b_vbv_min_rate && rcc->last_satd )
                {
                    // FIXME is it simpler to keep track of wanted_bits in ratecontrol_end?
                    int i_frame_done = h->i_frame + 1 - h->i_thread_frames;
                    double time_done = i_frame_done / rcc->fps;
                    if( h->param.b_vfr_input && i_frame_done > 0 )
                        time_done = ((double)(h->fenc->i_reordered_pts - h->i_reordered_pts_delay)) * h->param.i_timebase_num / h->param.i_timebase_den;
                    wanted_bits = time_done * rcc->bitrate;
                    if( wanted_bits > 0 )
                    {
                        abr_buffer *= X264_MAX( 1, sqrt( time_done ) );
                        overflow = x264_clip3f( 1.0 + (total_bits - wanted_bits) / abr_buffer, .5, 2 );
                        q *= overflow;
                    }
                }
            }

            if( pict_type == SLICE_TYPE_I && h->param.i_keyint_max > 1
                /* should test _next_ pict type, but that isn't decided yet */
                && rcc->last_non_b_pict_type != SLICE_TYPE_I )
            {
                q = qp2qscale( rcc->accum_p_qp / rcc->accum_p_norm );
                q /= fabs( h->param.rc.f_ip_factor );
            }
            else if( h->i_frame > 0 )
            {
                if( h->param.rc.i_rc_method != X264_RC_CRF )
                {
                    /* Asymmetric clipping, because symmetric would prevent
                     * overflow control in areas of rapidly oscillating complexity */
                    double lmin = rcc->last_qscale_for[pict_type] / rcc->lstep;
                    double lmax = rcc->last_qscale_for[pict_type] * rcc->lstep;
                    if( overflow > 1.1 && h->i_frame > 3 )
                        lmax *= rcc->lstep;
                    else if( overflow < 0.9 )
                        lmin /= rcc->lstep;

                    q = x264_clip3f(q, lmin, lmax);
                }
            }
            else if( h->param.rc.i_rc_method == X264_RC_CRF && rcc->qcompress != 1 )
            {
                q = qp2qscale( ABR_INIT_QP ) / fabs( h->param.rc.f_ip_factor );
            }
            rcc->qp_novbv = qscale2qp( q );

            //FIXME use get_diff_limited_q() ?
            q = clip_qscale( h, pict_type, q );
        }

        rcc->last_qscale_for[pict_type] =
        rcc->last_qscale = q;

        if( !(rcc->b_2pass && !rcc->b_vbv) && h->fenc->i_frame == 0 )
            rcc->last_qscale_for[SLICE_TYPE_P] = q * fabs( h->param.rc.f_ip_factor );

        if( rcc->b_2pass && rcc->b_vbv )
            rcc->frame_size_planned = qscale2bits(&rce, q);
        else
            rcc->frame_size_planned = predict_size( &rcc->pred[h->sh.i_type], q, rcc->last_satd );

        /* Always use up the whole VBV in this case. */
        if( rcc->single_frame_vbv )
            rcc->frame_size_planned = rcc->buffer_rate;
        /* Limit planned size by MinCR */
        if( rcc->b_vbv )
            rcc->frame_size_planned = X264_MIN( rcc->frame_size_planned, rcc->frame_size_maximum );
        h->rc->frame_size_estimated = rcc->frame_size_planned;
        return q;
    }
}

static void x264_threads_normalize_predictors( x264_t *h )
{
    double totalsize = 0;
    for( int i = 0; i < h->param.i_threads; i++ )
        totalsize += h->thread[i]->rc->slice_size_planned;
    double factor = h->rc->frame_size_planned / totalsize;
    for( int i = 0; i < h->param.i_threads; i++ )
        h->thread[i]->rc->slice_size_planned *= factor;
}

void x264_threads_distribute_ratecontrol( x264_t *h )
{
    int row;
    x264_ratecontrol_t *rc = h->rc;

    /* Initialize row predictors */
    if( h->i_frame == 0 )
        for( int i = 0; i < h->param.i_threads; i++ )
        {
            x264_ratecontrol_t *t = h->thread[i]->rc;
            memcpy( t->row_preds, rc->row_preds, sizeof(rc->row_preds) );
        }

    for( int i = 0; i < h->param.i_threads; i++ )
    {
        x264_t *t = h->thread[i];
        memcpy( t->rc, rc, offsetof(x264_ratecontrol_t, row_pred) );
        t->rc->row_pred = &t->rc->row_preds[h->sh.i_type];
        /* Calculate the planned slice size. */
        if( rc->b_vbv && rc->frame_size_planned )
        {
            int size = 0;
            for( row = t->i_threadslice_start; row < t->i_threadslice_end; row++ )
                size += h->fdec->i_row_satd[row];
            t->rc->slice_size_planned = predict_size( &rc->pred[h->sh.i_type + (i+1)*5], rc->qpm, size );
        }
        else
            t->rc->slice_size_planned = 0;
    }
    if( rc->b_vbv && rc->frame_size_planned )
    {
        x264_threads_normalize_predictors( h );

        if( rc->single_frame_vbv )
        {
            /* Compensate for our max frame error threshold: give more bits (proportionally) to smaller slices. */
            for( int i = 0; i < h->param.i_threads; i++ )
            {
                x264_t *t = h->thread[i];
                float max_frame_error = X264_MAX( 0.05, 1.0 / (t->i_threadslice_end - t->i_threadslice_start) );
                t->rc->slice_size_planned += 2 * max_frame_error * rc->frame_size_planned;
            }
            x264_threads_normalize_predictors( h );
        }

        for( int i = 0; i < h->param.i_threads; i++ )
            h->thread[i]->rc->frame_size_estimated = h->thread[i]->rc->slice_size_planned;
    }
}

void x264_threads_merge_ratecontrol( x264_t *h )
{
    x264_ratecontrol_t *rc = h->rc;
    x264_emms();

    for( int i = 0; i < h->param.i_threads; i++ )
    {
        x264_t *t = h->thread[i];
        x264_ratecontrol_t *rct = h->thread[i]->rc;
        if( h->param.rc.i_vbv_buffer_size )
        {
            int size = 0;
            for( int row = t->i_threadslice_start; row < t->i_threadslice_end; row++ )
                size += h->fdec->i_row_satd[row];
            int bits = t->stat.frame.i_mv_bits + t->stat.frame.i_tex_bits + t->stat.frame.i_misc_bits;
            int mb_count = (t->i_threadslice_end - t->i_threadslice_start) * h->mb.i_mb_width;
            update_predictor( &rc->pred[h->sh.i_type+(i+1)*5], qp2qscale( rct->qpa_rc/mb_count ), size, bits );
        }
        if( !i )
            continue;
        rc->qpa_rc += rct->qpa_rc;
        rc->qpa_aq += rct->qpa_aq;
    }
}

void x264_thread_sync_ratecontrol( x264_t *cur, x264_t *prev, x264_t *next )
{
    if( cur != prev )
    {
#define COPY(var) memcpy(&cur->rc->var, &prev->rc->var, sizeof(cur->rc->var))
        /* these vars are updated in x264_ratecontrol_start()
         * so copy them from the context that most recently started (prev)
         * to the context that's about to start (cur). */
        COPY(accum_p_qp);
        COPY(accum_p_norm);
        COPY(last_satd);
        COPY(last_rceq);
        COPY(last_qscale_for);
        COPY(last_non_b_pict_type);
        COPY(short_term_cplxsum);
        COPY(short_term_cplxcount);
        COPY(bframes);
        COPY(prev_zone);
        COPY(qpbuf_pos);
        /* these vars can be updated by x264_ratecontrol_init_reconfigurable */
        COPY(bitrate);
        COPY(buffer_size);
        COPY(buffer_rate);
        COPY(vbv_max_rate);
        COPY(single_frame_vbv);
        COPY(cbr_decay);
        COPY(rate_factor_constant);
        COPY(rate_factor_max_increment);
#undef COPY
    }
    if( cur != next )
    {
#define COPY(var) next->rc->var = cur->rc->var
        /* these vars are updated in x264_ratecontrol_end()
         * so copy them from the context that most recently ended (cur)
         * to the context that's about to end (next) */
        COPY(cplxr_sum);
        COPY(expected_bits_sum);
        COPY(filler_bits_sum);
        COPY(wanted_bits_window);
        COPY(bframe_bits);
        COPY(initial_cpb_removal_delay);
        COPY(initial_cpb_removal_delay_offset);
        COPY(nrt_first_access_unit);
        COPY(previous_cpb_final_arrival_time);
#undef COPY
    }
    //FIXME row_preds[] (not strictly necessary, but would improve prediction)
    /* the rest of the variables are either constant or thread-local */
}

static int find_underflow( x264_t *h, double *fills, int *t0, int *t1, int over )
{
    /* find an interval ending on an overflow or underflow (depending on whether
     * we're adding or removing bits), and starting on the earliest frame that
     * can influence the buffer fill of that end frame. */
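    /* The simulated fill follows
     *   fill[i] = clip( fill[i-1] + parity * (duration_i * vbv_max_rate - bits_i), 0, buffer_size )
     * with parity = +1 when hunting overflows and -1 for underflows, so one
     * scan handles both directions; the low/high thresholds are currently 10%
     * and 90% of buffer_size in both cases. */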
    x264_ratecontrol_t *rcc = h->rc;
    const double buffer_min = (over ? .1 : .1) * rcc->buffer_size;
    const double buffer_max = .9 * rcc->buffer_size;
    double fill = fills[*t0-1];
    double parity = over ? 1. : -1.;
    int start = -1, end = -1;
    for( int i = *t0; i < rcc->num_entries; i++ )
    {
        fill += (rcc->entry[i].i_cpb_duration * rcc->vbv_max_rate * h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale -
                 qscale2bits( &rcc->entry[i], rcc->entry[i].new_qscale )) * parity;
        fill = x264_clip3f(fill, 0, rcc->buffer_size);
        fills[i] = fill;
        if( fill <= buffer_min || i == 0 )
        {
            if( end >= 0 )
                break;
            start = i;
        }
        else if( fill >= buffer_max && start >= 0 )
            end = i;
    }
    *t0 = start;
    *t1 = end;
    return start >= 0 && end >= 0;
}

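/* Scale the planned qscale of every frame in [t0,t1] by `adjustment` (clamped
 * to [qscale_min,qscale_max]) and report whether anything actually changed, so
 * the caller knows when further passes can't make progress. */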
static int fix_underflow( x264_t *h, int t0, int t1, double adjustment, double qscale_min, double qscale_max)
{
    x264_ratecontrol_t *rcc = h->rc;
    double qscale_orig, qscale_new;
    int adjusted = 0;
    if( t0 > 0 )
        t0++;
    for( int i = t0; i <= t1; i++ )
    {
        qscale_orig = rcc->entry[i].new_qscale;
        qscale_orig = x264_clip3f( qscale_orig, qscale_min, qscale_max );
        qscale_new  = qscale_orig * adjustment;
        qscale_new  = x264_clip3f( qscale_new, qscale_min, qscale_max );
        rcc->entry[i].new_qscale = qscale_new;
        adjusted = adjusted || (qscale_new != qscale_orig);
    }
    return adjusted;
}

static double count_expected_bits( x264_t *h )
{
    x264_ratecontrol_t *rcc = h->rc;
    double expected_bits = 0;
    for( int i = 0; i < rcc->num_entries; i++ )
    {
        ratecontrol_entry_t *rce = &rcc->entry[i];
        rce->expected_bits = expected_bits;
        expected_bits += qscale2bits( rce, rce->new_qscale );
    }
    return expected_bits;
}

static int vbv_pass2( x264_t *h, double all_available_bits )
{
    /* For each interval of buffer_full .. underflow, uniformly increase the qp of all
     * frames in the interval until either the buffer is full at some intermediate frame
     * or the last frame in the interval no longer underflows.  Recompute intervals and
     * repeat.  Then do the converse, putting bits back into overflow areas, until the
     * target size is met. */
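    /* In the loop below, each pass (after the first) first scales qscale down
     * (by x0.9..0.999) over spans heading for overflow to spend the spare bits,
     * then raises it by 0.1% over spans that would underflow, and re-measures
     * expected_bits; iteration stops once the expected size reaches 99.5% of
     * the target or a pass makes no further progress. */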

    x264_ratecontrol_t *rcc = h->rc;
    double *fills;
    double expected_bits = 0;
    double adjustment;
    double prev_bits = 0;
    int t0, t1;
    double qscale_min = qp2qscale( h->param.rc.i_qp_min );
    double qscale_max = qp2qscale( h->param.rc.i_qp_max );
    int iterations = 0;
    int adj_min, adj_max;
    CHECKED_MALLOC( fills, (rcc->num_entries+1)*sizeof(double) );

    fills++;

    /* adjust overall stream size */
    do
    {
        iterations++;
        prev_bits = expected_bits;

        if( expected_bits )
        {   /* not first iteration */
            adjustment = X264_MAX(X264_MIN(expected_bits / all_available_bits, 0.999), 0.9);
            fills[-1] = rcc->buffer_size * h->param.rc.f_vbv_buffer_init;
            t0 = 0;
            /* fix overflows */
            adj_min = 1;
            while(adj_min && find_underflow( h, fills, &t0, &t1, 1 ))
            {
                adj_min = fix_underflow( h, t0, t1, adjustment, qscale_min, qscale_max );
                t0 = t1;
            }
        }

        fills[-1] = rcc->buffer_size * (1. - h->param.rc.f_vbv_buffer_init);
        t0 = 0;
    /* fix underflows -- done after the overflow pass, since undershooting the target size is better than underflowing the VBV */
        adj_max = 1;
        while( adj_max && find_underflow( h, fills, &t0, &t1, 0 ) )
            adj_max = fix_underflow( h, t0, t1, 1.001, qscale_min, qscale_max );

        expected_bits = count_expected_bits( h );
    } while( (expected_bits < .995*all_available_bits) && ((int64_t)(expected_bits+.5) > (int64_t)(prev_bits+.5)) );

    if( !adj_max )
        x264_log( h, X264_LOG_WARNING, "vbv-maxrate issue, qpmax or vbv-maxrate too low\n");

    /* store expected vbv filling values for tracking when encoding */
    for( int i = 0; i < rcc->num_entries; i++ )
        rcc->entry[i].expected_vbv = rcc->buffer_size - fills[i];

    x264_free( fills-1 );
    return 0;
fail:
    return -1;
}

static int init_pass2( x264_t *h )
{
    x264_ratecontrol_t *rcc = h->rc;
    uint64_t all_const_bits = 0;
    double timescale = (double)h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale;
    double duration = 0;
    for( int i = 0; i < rcc->num_entries; i++ )
        duration += rcc->entry[i].i_duration;
    duration *= timescale;
    uint64_t all_available_bits = h->param.rc.i_bitrate * 1000. * duration;
    double rate_factor, step_mult;
    double qblur = h->param.rc.f_qblur;
    double cplxblur = h->param.rc.f_complexity_blur;
    const int filter_size = (int)(qblur*4) | 1;
    double expected_bits;
    double *qscale, *blurred_qscale;
    double base_cplx = h->mb.i_mb_count * (h->param.i_bframe ? 120 : 80);

    /* find total/average complexity & const_bits */
    for( int i = 0; i < rcc->num_entries; i++ )
    {
        ratecontrol_entry_t *rce = &rcc->entry[i];
        all_const_bits += rce->misc_bits;
    }

    if( all_available_bits < all_const_bits)
    {
        x264_log( h, X264_LOG_ERROR, "requested bitrate is too low. estimated minimum is %d kbps\n",
                 (int)(all_const_bits * rcc->fps / (rcc->num_entries * 1000.)) );
        return -1;
    }

    /* Blur complexities, to reduce local fluctuation of QP.
     * We don't blur the QPs directly, because then one very simple frame
     * could drag down the QP of a nearby complex frame and give it more
     * bits than intended. */
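    /* Each blurred complexity is a weighted mean of per-frame complexity
     * (bits at qscale 1 minus header bits, normalized by frame duration).
     * The weights combine a gaussian term exp(-j*j/200) with a damping factor
     * that shrinks with the squared fraction of intra MBs in each intervening
     * frame, so the blur effectively stops at scene changes. */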
    for( int i = 0; i < rcc->num_entries; i++ )
    {
        ratecontrol_entry_t *rce = &rcc->entry[i];
        double weight_sum = 0;
        double cplx_sum = 0;
        double weight = 1.0;
        double gaussian_weight;
        /* weighted average of cplx of future frames */
        for( int j = 1; j < cplxblur*2 && j < rcc->num_entries-i; j++ )
        {
            ratecontrol_entry_t *rcj = &rcc->entry[i+j];
            double frame_duration = CLIP_DURATION(rcj->i_duration * timescale) / BASE_FRAME_DURATION;
            weight *= 1 - pow( (float)rcj->i_count / rcc->nmb, 2 );
            if( weight < .0001 )
                break;
            gaussian_weight = weight * exp( -j*j/200.0 );
            weight_sum += gaussian_weight;
            cplx_sum += gaussian_weight * (qscale2bits( rcj, 1 ) - rcj->misc_bits) / frame_duration;
        }
        /* weighted average of cplx of past frames */
        weight = 1.0;
        for( int j = 0; j <= cplxblur*2 && j <= i; j++ )
        {
            ratecontrol_entry_t *rcj = &rcc->entry[i-j];
            double frame_duration = CLIP_DURATION(rcj->i_duration * timescale) / BASE_FRAME_DURATION;
            gaussian_weight = weight * exp( -j*j/200.0 );
            weight_sum += gaussian_weight;
            cplx_sum += gaussian_weight * (qscale2bits( rcj, 1 ) - rcj->misc_bits) / frame_duration;
            weight *= 1 - pow( (float)rcj->i_count / rcc->nmb, 2 );
            if( weight < .0001 )
                break;
        }
        rce->blurred_complexity = cplx_sum / weight_sum;
    }

    CHECKED_MALLOC( qscale, sizeof(double)*rcc->num_entries );
    if( filter_size > 1 )
        CHECKED_MALLOC( blurred_qscale, sizeof(double)*rcc->num_entries );
    else
        blurred_qscale = qscale;

    /* Search for a factor which, when multiplied by the RCEQ values from
     * each frame, adds up to the desired total size.
     * There is no exact closed-form solution because of VBV constraints and
     * because qscale2bits is not invertible, but we can start with the simple
     * approximation of scaling the 1st pass by the ratio of bitrates.
     * The search range is probably overkill, but speed doesn't matter here. */
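    /* The loop below is effectively a bisection: starting from a step of
     * 1e4*step_mult and halving down to 1e-7*step_mult, a step is kept only if
     * the resulting curve still fits in all_available_bits, converging on the
     * largest rate_factor whose curve meets the target. */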

    expected_bits = 1;
    for( int i = 0; i < rcc->num_entries; i++ )
    {
        double q = get_qscale(h, &rcc->entry[i], 1.0, i);
        expected_bits += qscale2bits(&rcc->entry[i], q);
        rcc->last_qscale_for[rcc->entry[i].pict_type] = q;
    }
    step_mult = all_available_bits / expected_bits;

    rate_factor = 0;
    for( double step = 1E4 * step_mult; step > 1E-7 * step_mult; step *= 0.5)
    {
        expected_bits = 0;
        rate_factor += step;

        rcc->last_non_b_pict_type = -1;
        rcc->last_accum_p_norm = 1;
        rcc->accum_p_norm = 0;

        rcc->last_qscale_for[0] =
        rcc->last_qscale_for[1] =
        rcc->last_qscale_for[2] = pow( base_cplx, 1 - rcc->qcompress ) / rate_factor;

        /* find qscale */
        for( int i = 0; i < rcc->num_entries; i++ )
        {
            qscale[i] = get_qscale( h, &rcc->entry[i], rate_factor, -1 );
            rcc->last_qscale_for[rcc->entry[i].pict_type] = qscale[i];
        }

        /* fixed I/B qscale relative to P */
        for( int i = rcc->num_entries-1; i >= 0; i-- )
        {
            qscale[i] = get_diff_limited_q( h, &rcc->entry[i], qscale[i], i );
            assert(qscale[i] >= 0);
        }

        /* smooth curve */
        if( filter_size > 1 )
        {
            assert( filter_size%2 == 1 );
            for( int i = 0; i < rcc->num_entries; i++ )
            {
                ratecontrol_entry_t *rce = &rcc->entry[i];
                double q = 0.0, sum = 0.0;

                for( int j = 0; j < filter_size; j++ )
                {
                    int idx = i+j-filter_size/2;
                    double d = idx-i;
                    double coeff = qblur==0 ? 1.0 : exp( -d*d/(qblur*qblur) );
                    if( idx < 0 || idx >= rcc->num_entries )
                        continue;
                    if( rce->pict_type != rcc->entry[idx].pict_type )
                        continue;
                    q += qscale[idx] * coeff;
                    sum += coeff;
                }
                blurred_qscale[i] = q/sum;
            }
        }

        /* find expected bits */
        for( int i = 0; i < rcc->num_entries; i++ )
        {
            ratecontrol_entry_t *rce = &rcc->entry[i];
            rce->new_qscale = clip_qscale( h, rce->pict_type, blurred_qscale[i] );
            assert(rce->new_qscale >= 0);
            expected_bits += qscale2bits( rce, rce->new_qscale );
        }

        if( expected_bits > all_available_bits )
            rate_factor -= step;
    }

    x264_free( qscale );
    if( filter_size > 1 )
        x264_free( blurred_qscale );

    if( rcc->b_vbv )
        if( vbv_pass2( h, all_available_bits ) )
            return -1;
    expected_bits = count_expected_bits( h );

    if( fabs( expected_bits/all_available_bits - 1.0 ) > 0.01 )
    {
        double avgq = 0;
        for( int i = 0; i < rcc->num_entries; i++ )
            avgq += rcc->entry[i].new_qscale;
        avgq = qscale2qp( avgq / rcc->num_entries );

        if( expected_bits > all_available_bits || !rcc->b_vbv )
            x264_log( h, X264_LOG_WARNING, "Error: 2pass curve failed to converge\n" );
        x264_log( h, X264_LOG_WARNING, "target: %.2f kbit/s, expected: %.2f kbit/s, avg QP: %.4f\n",
                  (float)h->param.rc.i_bitrate,
                  expected_bits * rcc->fps / (rcc->num_entries * 1000.),
                  avgq );
        if( expected_bits < all_available_bits && avgq < h->param.rc.i_qp_min + 2 )
        {
            if( h->param.rc.i_qp_min > 0 )
                x264_log( h, X264_LOG_WARNING, "try reducing target bitrate or reducing qp_min (currently %d)\n", h->param.rc.i_qp_min );
            else
                x264_log( h, X264_LOG_WARNING, "try reducing target bitrate\n" );
        }
        else if( expected_bits > all_available_bits && avgq > h->param.rc.i_qp_max - 2 )
        {
            if( h->param.rc.i_qp_max < QP_MAX )
                x264_log( h, X264_LOG_WARNING, "try increasing target bitrate or increasing qp_max (currently %d)\n", h->param.rc.i_qp_max );
            else
                x264_log( h, X264_LOG_WARNING, "try increasing target bitrate\n");
        }
        else if( !(rcc->b_2pass && rcc->b_vbv) )
            x264_log( h, X264_LOG_WARNING, "internal error\n" );
    }

    return 0;
fail:
    return -1;
}
x264-snapshot-20120103-2245-stable/encoder/me.h
/*****************************************************************************
 * me.h: motion estimation
 *****************************************************************************
 * Copyright (C) 2003-2011 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *          Laurent Aimar <fenrir@via.ecp.fr>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#ifndef X264_ME_H
#define X264_ME_H

#define COST_MAX (1<<28)
#define COST_MAX64 (1ULL<<60)

typedef struct
{
    /* aligning the first member is a gcc hack to force the struct to be
     * 16 byte aligned, as well as force sizeof(struct) to be a multiple of 16 */
    /* input */
    ALIGNED_16( int i_pixel );   /* PIXEL_WxH */
    uint16_t *p_cost_mv; /* lambda * nbits for each possible mv */
    int      i_ref_cost;
    int      i_ref;
    const x264_weight_t *weight;

    pixel *p_fref[12];
    pixel *p_fref_w;
    pixel *p_fenc[3];
    uint16_t *integral;
    int      i_stride[3];

    ALIGNED_4( int16_t mvp[2] );

    /* output */
    int cost_mv;        /* lambda * nbits for the chosen mv */
    int cost;           /* satd + lambda * nbits */
    ALIGNED_4( int16_t mv[2] );
} ALIGNED_16( x264_me_t );

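/* fullpel motion vector candidate plus its SAD cost; TESA keeps a short list
 * of these to decide which candidates get the more expensive final compare */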
typedef struct
{
    int sad;
    int16_t mv[2];
} mvsad_t;

void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc, int *p_fullpel_thresh );
#define x264_me_search( h, m, mvc, i_mvc )\
    x264_me_search_ref( h, m, mvc, i_mvc, NULL )

void x264_me_refine_qpel( x264_t *h, x264_me_t *m );
void x264_me_refine_qpel_refdupe( x264_t *h, x264_me_t *m, int *p_halfpel_thresh );
void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i4, int i_list );
void x264_me_refine_bidir_rd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight, int i8, int i_lambda2 );
void x264_me_refine_bidir_satd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight );
uint64_t x264_rd_cost_part( x264_t *h, int i_lambda2, int i8, int i_pixel );

extern uint16_t *x264_cost_mv_fpel[QP_MAX+1][4];

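/* conditional-update helpers: if cost y beats the current best x, take y and
 * carry the associated values (MV components, direction indices, ...) along */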
#define COPY1_IF_LT(x,y)\
if((y)<(x))\
    (x)=(y);

#define COPY2_IF_LT(x,y,a,b)\
if((y)<(x))\
{\
    (x)=(y);\
    (a)=(b);\
}

#define COPY3_IF_LT(x,y,a,b,c,d)\
if((y)<(x))\
{\
    (x)=(y);\
    (a)=(b);\
    (c)=(d);\
}

#define COPY4_IF_LT(x,y,a,b,c,d,e,f)\
if((y)<(x))\
{\
    (x)=(y);\
    (a)=(b);\
    (c)=(d);\
    (e)=(f);\
}

#define COPY2_IF_GT(x,y,a,b)\
if((y)>(x))\
{\
    (x)=(y);\
    (a)=(b);\
}

#endif
x264-snapshot-20120103-2245-stable/encoder/me.c
/*****************************************************************************
 * me.c: motion estimation
 *****************************************************************************
 * Copyright (C) 2003-2011 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *          Laurent Aimar <fenrir@via.ecp.fr>
 *          Jason Garrett-Glaser <darkshikari@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "common/common.h"
#include "macroblock.h"
#include "me.h"

/* presets selected from good points on the speed-vs-quality curve of several test videos
 * subpel_iters[i_subpel_refine] = { refine_hpel, refine_qpel, me_hpel, me_qpel }
 * where me_* are the number of EPZS iterations run on all candidate block types,
 * and refine_* are run only on the winner.
 * the subme=8,9 values are much higher because any amount of satd search makes
 * up its time by reducing the number of qpel-rd iterations. */
static const uint8_t subpel_iterations[][4] =
   {{0,0,0,0},
    {1,1,0,0},
    {0,1,1,0},
    {0,2,1,0},
    {0,2,1,1},
    {0,2,1,2},
    {0,0,2,2},
    {0,0,2,2},
    {0,0,4,10},
    {0,0,4,10},
    {0,0,4,10},
    {0,0,4,10}};

/* (x-1)%6 */
static const uint8_t mod6m1[8] = {5,0,1,2,3,4,5,0};
/* radius 2 hexagon. repeated entries are to avoid having to compute mod6 every time. */
static const int8_t hex2[8][2] = {{-1,-2}, {-2,0}, {-1,2}, {1,2}, {2,0}, {1,-2}, {-1,-2}, {-2,0}};
static const int8_t square1[9][2] = {{0,0}, {0,-1}, {0,1}, {-1,0}, {1,0}, {-1,-1}, {-1,1}, {1,-1}, {1,1}};

static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_iters, int *p_halfpel_thresh, int b_refine_qpel );

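/* BITS_MVD is the rate term (lambda * bits) of coding a fullpel MV relative to
 * the predictor; p_cost_mv* are indexed in qpel units, hence the <<2.
 * COST_MV adds the fullpel distortion of a candidate and keeps it if it beats
 * the current best. */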
#define BITS_MVD( mx, my )\
    (p_cost_mvx[(mx)<<2] + p_cost_mvy[(my)<<2])

#define COST_MV( mx, my )\
{\
    int cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE,\
                   &p_fref_w[(my)*stride+(mx)], stride )\
             + BITS_MVD(mx,my);\
    COPY3_IF_LT( bcost, cost, bmx, mx, bmy, my );\
}

#define COST_MV_HPEL( mx, my ) \
{ \
    int stride2 = 16; \
    pixel *src = h->mc.get_ref( pix, &stride2, m->p_fref, stride, mx, my, bw, bh, &m->weight[0] ); \
    int cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, src, stride2 ) \
             + p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \
    COPY3_IF_LT( bpred_cost, cost, bpred_mx, mx, bpred_my, my ); \
}

#define COST_MV_X3_DIR( m0x, m0y, m1x, m1y, m2x, m2y, costs )\
{\
    pixel *pix_base = p_fref_w + bmx + bmy*stride;\
    h->pixf.fpelcmp_x3[i_pixel]( p_fenc,\
        pix_base + (m0x) + (m0y)*stride,\
        pix_base + (m1x) + (m1y)*stride,\
        pix_base + (m2x) + (m2y)*stride,\
        stride, costs );\
    (costs)[0] += BITS_MVD( bmx+(m0x), bmy+(m0y) );\
    (costs)[1] += BITS_MVD( bmx+(m1x), bmy+(m1y) );\
    (costs)[2] += BITS_MVD( bmx+(m2x), bmy+(m2y) );\
}

#define COST_MV_X4_DIR( m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y, costs )\
{\
    pixel *pix_base = p_fref_w + bmx + bmy*stride;\
    h->pixf.fpelcmp_x4[i_pixel]( p_fenc,\
        pix_base + (m0x) + (m0y)*stride,\
        pix_base + (m1x) + (m1y)*stride,\
        pix_base + (m2x) + (m2y)*stride,\
        pix_base + (m3x) + (m3y)*stride,\
        stride, costs );\
    (costs)[0] += BITS_MVD( bmx+(m0x), bmy+(m0y) );\
    (costs)[1] += BITS_MVD( bmx+(m1x), bmy+(m1y) );\
    (costs)[2] += BITS_MVD( bmx+(m2x), bmy+(m2y) );\
    (costs)[3] += BITS_MVD( bmx+(m3x), bmy+(m3y) );\
}

#define COST_MV_X4( m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y )\
{\
    pixel *pix_base = p_fref_w + omx + omy*stride;\
    h->pixf.fpelcmp_x4[i_pixel]( p_fenc,\
        pix_base + (m0x) + (m0y)*stride,\
        pix_base + (m1x) + (m1y)*stride,\
        pix_base + (m2x) + (m2y)*stride,\
        pix_base + (m3x) + (m3y)*stride,\
        stride, costs );\
    costs[0] += BITS_MVD( omx+(m0x), omy+(m0y) );\
    costs[1] += BITS_MVD( omx+(m1x), omy+(m1y) );\
    costs[2] += BITS_MVD( omx+(m2x), omy+(m2y) );\
    costs[3] += BITS_MVD( omx+(m3x), omy+(m3y) );\
    COPY3_IF_LT( bcost, costs[0], bmx, omx+(m0x), bmy, omy+(m0y) );\
    COPY3_IF_LT( bcost, costs[1], bmx, omx+(m1x), bmy, omy+(m1y) );\
    COPY3_IF_LT( bcost, costs[2], bmx, omx+(m2x), bmy, omy+(m2y) );\
    COPY3_IF_LT( bcost, costs[3], bmx, omx+(m3x), bmy, omy+(m3y) );\
}

#define COST_MV_X3_ABS( m0x, m0y, m1x, m1y, m2x, m2y )\
{\
    h->pixf.fpelcmp_x3[i_pixel]( p_fenc,\
        p_fref_w + (m0x) + (m0y)*stride,\
        p_fref_w + (m1x) + (m1y)*stride,\
        p_fref_w + (m2x) + (m2y)*stride,\
        stride, costs );\
    costs[0] += p_cost_mvx[(m0x)<<2]; /* no cost_mvy */\
    costs[1] += p_cost_mvx[(m1x)<<2];\
    costs[2] += p_cost_mvx[(m2x)<<2];\
    COPY3_IF_LT( bcost, costs[0], bmx, m0x, bmy, m0y );\
    COPY3_IF_LT( bcost, costs[1], bmx, m1x, bmy, m1y );\
    COPY3_IF_LT( bcost, costs[2], bmx, m2x, bmy, m2y );\
}

/*  1  */
/* 101 */
/*  1  */
#define DIA1_ITER( mx, my )\
{\
    omx = mx; omy = my;\
    COST_MV_X4( 0,-1, 0,1, -1,0, 1,0 );\
}

#define CROSS( start, x_max, y_max )\
{\
    int i = start;\
    if( (x_max) <= X264_MIN(mv_x_max-omx, omx-mv_x_min) )\
        for( ; i < (x_max)-2; i+=4 )\
            COST_MV_X4( i,0, -i,0, i+2,0, -i-2,0 );\
    for( ; i < (x_max); i+=2 )\
    {\
        if( omx+i <= mv_x_max )\
            COST_MV( omx+i, omy );\
        if( omx-i >= mv_x_min )\
            COST_MV( omx-i, omy );\
    }\
    i = start;\
    if( (y_max) <= X264_MIN(mv_y_max-omy, omy-mv_y_min) )\
        for( ; i < (y_max)-2; i+=4 )\
            COST_MV_X4( 0,i, 0,-i, 0,i+2, 0,-i-2 );\
    for( ; i < (y_max); i+=2 )\
    {\
        if( omy+i <= mv_y_max )\
            COST_MV( omx, omy+i );\
        if( omy-i >= mv_y_min )\
            COST_MV( omx, omy-i );\
    }\
}

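/* fullpel motion search: seed from the (clipped) MVP and any extra candidate
 * predictors in mvc[], run the configured pattern (DIA/HEX/UMH/ESA/TESA), then
 * hand off to refine_subpel. On return m->mv is in qpel units and m->cost
 * includes the motion vector rate. */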
void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc, int *p_halfpel_thresh )
{
    const int bw = x264_pixel_size[m->i_pixel].w;
    const int bh = x264_pixel_size[m->i_pixel].h;
    const int i_pixel = m->i_pixel;
    const int stride = m->i_stride[0];
    int i_me_range = h->param.analyse.i_me_range;
    int bmx, bmy, bcost;
    int bpred_mx = 0, bpred_my = 0, bpred_cost = COST_MAX;
    int omx, omy, pmx, pmy;
    pixel *p_fenc = m->p_fenc[0];
    pixel *p_fref_w = m->p_fref_w;
    ALIGNED_ARRAY_16( pixel, pix,[16*16] );

    int costs[16];

    int mv_x_min = h->mb.mv_min_fpel[0];
    int mv_y_min = h->mb.mv_min_fpel[1];
    int mv_x_max = h->mb.mv_max_fpel[0];
    int mv_y_max = h->mb.mv_max_fpel[1];
    int mv_x_min_qpel = mv_x_min << 2;
    int mv_y_min_qpel = mv_y_min << 2;
    int mv_x_max_qpel = mv_x_max << 2;
    int mv_y_max_qpel = mv_y_max << 2;
/* Special version of pack to allow shortcuts in CHECK_MVRANGE */
#define pack16to32_mask2(mx,my) ((mx<<16)|(my&0x7FFF))
    uint32_t mv_min = pack16to32_mask2( -mv_x_min, -mv_y_min );
    uint32_t mv_max = pack16to32_mask2( mv_x_max, mv_y_max )|0x8000;

#define CHECK_MVRANGE(mx,my) (!(((pack16to32_mask2(mx,my) + mv_min) | (mv_max - pack16to32_mask2(mx,my))) & 0x80004000))
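/* range-checks both MV components in one 32-bit test: an out-of-range x flips
 * bit 31 of the packed sum/difference, an out-of-range y borrows through the
 * 0x8000 guard bit and flips bit 14, and 0x80004000 is exactly those flags */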

    const uint16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0];
    const uint16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];

    uint32_t pmv;
    bmx = x264_clip3( m->mvp[0], mv_x_min_qpel, mv_x_max_qpel );
    bmy = x264_clip3( m->mvp[1], mv_y_min_qpel, mv_y_max_qpel );
    pmx = ( bmx + 2 ) >> 2;
    pmy = ( bmy + 2 ) >> 2;
    bcost = COST_MAX;

    /* try extra predictors if provided */
    if( h->mb.i_subpel_refine >= 3 )
    {
        pmv = pack16to32_mask(bmx,bmy);
        if( i_mvc )
            COST_MV_HPEL( bmx, bmy );
        for( int i = 0; i < i_mvc; i++ )
        {
            if( M32( mvc[i] ) && (pmv != M32( mvc[i] )) )
            {
                int mx = x264_clip3( mvc[i][0], mv_x_min_qpel, mv_x_max_qpel );
                int my = x264_clip3( mvc[i][1], mv_y_min_qpel, mv_y_max_qpel );
                COST_MV_HPEL( mx, my );
            }
        }
        bmx = ( bpred_mx + 2 ) >> 2;
        bmy = ( bpred_my + 2 ) >> 2;
        COST_MV( bmx, bmy );
    }
    else
    {
        /* check the MVP */
        bmx = pmx;
        bmy = pmy;
        /* Because we are rounding the predicted motion vector to fullpel, there will be
         * an extra MV cost in 15 out of 16 cases.  However, when the predicted MV is
         * chosen as the best predictor, it is often the case that the subpel search will
         * result in a vector at or next to the predicted motion vector.  Therefore, it is
         * sensible to omit the cost of the MV from the rounded MVP to avoid unfairly
         * biasing against use of the predicted motion vector. */
        bcost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, &p_fref_w[bmy*stride+bmx], stride );
        pmv = pack16to32_mask( bmx, bmy );
        if( i_mvc > 0 )
        {
            ALIGNED_ARRAY_8( int16_t, mvc_fpel,[16],[2] );
            x264_predictor_roundclip( mvc_fpel, mvc, i_mvc, mv_x_min, mv_x_max, mv_y_min, mv_y_max );
            bcost <<= 4;
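            /* pack each candidate's 1-based index into the low 4 bits of its
             * cost so COPY1_IF_LT tracks best cost and best candidate at once */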
            for( int i = 1; i <= i_mvc; i++ )
            {
                if( M32( mvc_fpel[i-1] ) && (pmv != M32( mvc[i-1] )) )
                {
                    int mx = mvc_fpel[i-1][0];
                    int my = mvc_fpel[i-1][1];
                    int cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, &p_fref_w[my*stride+mx], stride ) + BITS_MVD( mx, my );
                    cost = (cost << 4) + i;
                    COPY1_IF_LT( bcost, cost );
                }
            }
            if( bcost&15 )
            {
                bmx = mvc_fpel[(bcost&15)-1][0];
                bmy = mvc_fpel[(bcost&15)-1][1];
            }
            bcost >>= 4;
        }
    }

    if( pmv )
        COST_MV( 0, 0 );

    switch( h->mb.i_me_method )
    {
        case X264_ME_DIA:
        {
            /* diamond search, radius 1 */
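            /* the low 4 bits of bcost carry a direction code (1,3,4,12), so a
             * single COPY1_IF_LT tracks best cost and winning direction at
             * once; the two arithmetic shifts below sign-extend that code back
             * into -1/0/+1 steps for bmx and bmy */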
            bcost <<= 4;
            int i = i_me_range;
            do
            {
                COST_MV_X4_DIR( 0,-1, 0,1, -1,0, 1,0, costs );
                COPY1_IF_LT( bcost, (costs[0]<<4)+1 );
                COPY1_IF_LT( bcost, (costs[1]<<4)+3 );
                COPY1_IF_LT( bcost, (costs[2]<<4)+4 );
                COPY1_IF_LT( bcost, (costs[3]<<4)+12 );
                if( !(bcost&15) )
                    break;
                bmx -= (bcost<<28)>>30;
                bmy -= (bcost<<30)>>30;
                bcost &= ~15;
            } while( --i && CHECK_MVRANGE(bmx, bmy) );
            bcost >>= 4;
            break;
        }

        case X264_ME_HEX:
        {
    me_hex2:
            /* hexagon search, radius 2 */
    #if 0
            for( int i = 0; i < i_me_range/2; i++ )
            {
                omx = bmx; omy = bmy;
                COST_MV( omx-2, omy   );
                COST_MV( omx-1, omy+2 );
                COST_MV( omx+1, omy+2 );
                COST_MV( omx+2, omy   );
                COST_MV( omx+1, omy-2 );
                COST_MV( omx-1, omy-2 );
                if( bmx == omx && bmy == omy )
                    break;
                if( !CHECK_MVRANGE(bmx, bmy) )
                    break;
            }
    #else
            /* equivalent to the above, but eliminates duplicate candidates */

            /* hexagon */
            COST_MV_X3_DIR( -2,0, -1, 2,  1, 2, costs   );
            COST_MV_X3_DIR(  2,0,  1,-2, -1,-2, costs+3 );
            bcost <<= 3;
            COPY1_IF_LT( bcost, (costs[0]<<3)+2 );
            COPY1_IF_LT( bcost, (costs[1]<<3)+3 );
            COPY1_IF_LT( bcost, (costs[2]<<3)+4 );
            COPY1_IF_LT( bcost, (costs[3]<<3)+5 );
            COPY1_IF_LT( bcost, (costs[4]<<3)+6 );
            COPY1_IF_LT( bcost, (costs[5]<<3)+7 );

            if( bcost&7 )
            {
                int dir = (bcost&7)-2;
                bmx += hex2[dir+1][0];
                bmy += hex2[dir+1][1];

                /* half hexagon, not overlapping the previous iteration */
                for( int i = (i_me_range>>1) - 1; i > 0 && CHECK_MVRANGE(bmx, bmy); i-- )
                {
                    COST_MV_X3_DIR( hex2[dir+0][0], hex2[dir+0][1],
                                    hex2[dir+1][0], hex2[dir+1][1],
                                    hex2[dir+2][0], hex2[dir+2][1],
                                    costs );
                    bcost &= ~7;
                    COPY1_IF_LT( bcost, (costs[0]<<3)+1 );
                    COPY1_IF_LT( bcost, (costs[1]<<3)+2 );
                    COPY1_IF_LT( bcost, (costs[2]<<3)+3 );
                    if( !(bcost&7) )
                        break;
                    dir += (bcost&7)-2;
                    dir = mod6m1[dir+1];
                    bmx += hex2[dir+1][0];
                    bmy += hex2[dir+1][1];
                }
            }
            bcost >>= 3;
    #endif
            /* square refine */
            int dir = 0;
            COST_MV_X4_DIR(  0,-1,  0,1, -1,0, 1,0, costs );
            COPY2_IF_LT( bcost, costs[0], dir, 1 );
            COPY2_IF_LT( bcost, costs[1], dir, 2 );
            COPY2_IF_LT( bcost, costs[2], dir, 3 );
            COPY2_IF_LT( bcost, costs[3], dir, 4 );
            COST_MV_X4_DIR( -1,-1, -1,1, 1,-1, 1,1, costs );
            COPY2_IF_LT( bcost, costs[0], dir, 5 );
            COPY2_IF_LT( bcost, costs[1], dir, 6 );
            COPY2_IF_LT( bcost, costs[2], dir, 7 );
            COPY2_IF_LT( bcost, costs[3], dir, 8 );
            bmx += square1[dir][0];
            bmy += square1[dir][1];
            break;
        }

        case X264_ME_UMH:
        {
            /* Uneven-cross Multi-Hexagon-grid Search
             * as in JM, except with different early termination */

            static const uint8_t x264_pixel_size_shift[7] = { 0, 1, 1, 2, 3, 3, 4 };

            int ucost1, ucost2;
            int cross_start = 1;

            /* refine predictors */
            ucost1 = bcost;
            DIA1_ITER( pmx, pmy );
            if( pmx | pmy )
                DIA1_ITER( 0, 0 );

            if( i_pixel == PIXEL_4x4 )
                goto me_hex2;

            ucost2 = bcost;
            if( (bmx | bmy) && ((bmx-pmx) | (bmy-pmy)) )
                DIA1_ITER( bmx, bmy );
            if( bcost == ucost2 )
                cross_start = 3;
            omx = bmx; omy = bmy;

            /* early termination */
#define SAD_THRESH(v) ( bcost < ( v >> x264_pixel_size_shift[i_pixel] ) )
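            /* thresholds are specified for 16x16 SADs and scaled down for
             * smaller partitions via x264_pixel_size_shift */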
            if( bcost == ucost2 && SAD_THRESH(2000) )
            {
                COST_MV_X4( 0,-2, -1,-1, 1,-1, -2,0 );
                COST_MV_X4( 2, 0, -1, 1, 1, 1,  0,2 );
                if( bcost == ucost1 && SAD_THRESH(500) )
                    break;
                if( bcost == ucost2 )
                {
                    int range = (i_me_range>>1) | 1;
                    CROSS( 3, range, range );
                    COST_MV_X4( -1,-2, 1,-2, -2,-1, 2,-1 );
                    COST_MV_X4( -2, 1, 2, 1, -1, 2, 1, 2 );
                    if( bcost == ucost2 )
                        break;
                    cross_start = range + 2;
                }
            }

            /* adaptive search range */
            if( i_mvc )
            {
                /* range multipliers based on casual inspection of some statistics of
                 * average distance between current predictor and final mv found by ESA.
                 * these have not been tuned much by actual encoding. */
                static const uint8_t range_mul[4][4] =
                {
                    { 3, 3, 4, 4 },
                    { 3, 4, 4, 4 },
                    { 4, 4, 4, 5 },
                    { 4, 4, 5, 6 },
                };
                int mvd;
                int sad_ctx, mvd_ctx;
                int denom = 1;

                if( i_mvc == 1 )
                {
                    if( i_pixel == PIXEL_16x16 )
                        /* mvc is probably the same as mvp, so the difference isn't meaningful.
                         * but prediction usually isn't too bad, so just use medium range */
                        mvd = 25;
                    else
                        mvd = abs( m->mvp[0] - mvc[0][0] )
                            + abs( m->mvp[1] - mvc[0][1] );
                }
                else
                {
                    /* calculate the degree of agreement between predictors. */
                    /* in 16x16, mvc includes all the neighbors used to make mvp,
                     * so don't count mvp separately. */
                    denom = i_mvc - 1;
                    mvd = 0;
                    if( i_pixel != PIXEL_16x16 )
                    {
                        mvd = abs( m->mvp[0] - mvc[0][0] )
                            + abs( m->mvp[1] - mvc[0][1] );
                        denom++;
                    }
                    mvd += x264_predictor_difference( mvc, i_mvc );
                }

                sad_ctx = SAD_THRESH(1000) ? 0
                        : SAD_THRESH(2000) ? 1
                        : SAD_THRESH(4000) ? 2 : 3;
                mvd_ctx = mvd < 10*denom ? 0
                        : mvd < 20*denom ? 1
                        : mvd < 40*denom ? 2 : 3;

                i_me_range = i_me_range * range_mul[mvd_ctx][sad_ctx] >> 2;
            }

            /* FIXME if the above DIA2/OCT2/CROSS found a new mv, it has not updated omx/omy.
             * we are still centered on the same place as the DIA2. is this desirable? */
            CROSS( cross_start, i_me_range, i_me_range>>1 );

            COST_MV_X4( -2,-2, -2,2, 2,-2, 2,2 );

            /* hexagon grid */
            omx = bmx; omy = bmy;
            const uint16_t *p_cost_omvx = p_cost_mvx + omx*4;
            const uint16_t *p_cost_omvy = p_cost_mvy + omy*4;
            int i = 1;
            do
            {
                static const int8_t hex4[16][2] = {
                    { 0,-4}, { 0, 4}, {-2,-3}, { 2,-3},
                    {-4,-2}, { 4,-2}, {-4,-1}, { 4,-1},
                    {-4, 0}, { 4, 0}, {-4, 1}, { 4, 1},
                    {-4, 2}, { 4, 2}, {-2, 3}, { 2, 3},
                };

                if( 4*i > X264_MIN4( mv_x_max-omx, omx-mv_x_min,
                                     mv_y_max-omy, omy-mv_y_min ) )
                {
                    for( int j = 0; j < 16; j++ )
                    {
                        int mx = omx + hex4[j][0]*i;
                        int my = omy + hex4[j][1]*i;
                        if( CHECK_MVRANGE(mx, my) )
                            COST_MV( mx, my );
                    }
                }
                else
                {
                    int dir = 0;
                    pixel *pix_base = p_fref_w + omx + (omy-4*i)*stride;
                    int dy = i*stride;
#define SADS(k,x0,y0,x1,y1,x2,y2,x3,y3)\
                    h->pixf.fpelcmp_x4[i_pixel]( p_fenc,\
                            pix_base x0*i+(y0-2*k+4)*dy,\
                            pix_base x1*i+(y1-2*k+4)*dy,\
                            pix_base x2*i+(y2-2*k+4)*dy,\
                            pix_base x3*i+(y3-2*k+4)*dy,\
                            stride, costs+4*k );\
                    pix_base += 2*dy;
#define ADD_MVCOST(k,x,y) costs[k] += p_cost_omvx[x*4*i] + p_cost_omvy[y*4*i]
#define MIN_MV(k,x,y)     COPY2_IF_LT( bcost, costs[k], dir, x*16+(y&15) )
                    SADS( 0, +0,-4, +0,+4, -2,-3, +2,-3 );
                    SADS( 1, -4,-2, +4,-2, -4,-1, +4,-1 );
                    SADS( 2, -4,+0, +4,+0, -4,+1, +4,+1 );
                    SADS( 3, -4,+2, +4,+2, -2,+3, +2,+3 );
                    ADD_MVCOST(  0, 0,-4 );
                    ADD_MVCOST(  1, 0, 4 );
                    ADD_MVCOST(  2,-2,-3 );
                    ADD_MVCOST(  3, 2,-3 );
                    ADD_MVCOST(  4,-4,-2 );
                    ADD_MVCOST(  5, 4,-2 );
                    ADD_MVCOST(  6,-4,-1 );
                    ADD_MVCOST(  7, 4,-1 );
                    ADD_MVCOST(  8,-4, 0 );
                    ADD_MVCOST(  9, 4, 0 );
                    ADD_MVCOST( 10,-4, 1 );
                    ADD_MVCOST( 11, 4, 1 );
                    ADD_MVCOST( 12,-4, 2 );
                    ADD_MVCOST( 13, 4, 2 );
                    ADD_MVCOST( 14,-2, 3 );
                    ADD_MVCOST( 15, 2, 3 );
                    MIN_MV(  0, 0,-4 );
                    MIN_MV(  1, 0, 4 );
                    MIN_MV(  2,-2,-3 );
                    MIN_MV(  3, 2,-3 );
                    MIN_MV(  4,-4,-2 );
                    MIN_MV(  5, 4,-2 );
                    MIN_MV(  6,-4,-1 );
                    MIN_MV(  7, 4,-1 );
                    MIN_MV(  8,-4, 0 );
                    MIN_MV(  9, 4, 0 );
                    MIN_MV( 10,-4, 1 );
                    MIN_MV( 11, 4, 1 );
                    MIN_MV( 12,-4, 2 );
                    MIN_MV( 13, 4, 2 );
                    MIN_MV( 14,-2, 3 );
                    MIN_MV( 15, 2, 3 );
#undef SADS
#undef ADD_MVCOST
#undef MIN_MV
                    if(dir)
                    {
                        bmx = omx + i*(dir>>4);
                        bmy = omy + i*((dir<<28)>>28);
                    }
                }
            } while( ++i <= i_me_range>>2 );
            if( bmy <= mv_y_max && bmy >= mv_y_min && bmx <= mv_x_max && bmx >= mv_x_min )
                goto me_hex2;
            break;
        }

        case X264_ME_ESA:
        case X264_ME_TESA:
        {
            const int min_x = X264_MAX( bmx - i_me_range, mv_x_min );
            const int min_y = X264_MAX( bmy - i_me_range, mv_y_min );
            const int max_x = X264_MIN( bmx + i_me_range, mv_x_max );
            const int max_y = X264_MIN( bmy + i_me_range, mv_y_max );
            /* SEA is fastest in multiples of 4 */
            const int width = (max_x - min_x + 3) & ~3;
#if 0
            /* plain old exhaustive search */
            for( int my = min_y; my <= max_y; my++ )
                for( int mx = min_x; mx < min_x + width; mx++ )
                    COST_MV( mx, my );
#else
            /* successive elimination by comparing DC before a full SAD,
             * because sum(abs(diff)) >= abs(diff(sum)). */
            uint16_t *sums_base = m->integral;
            ALIGNED_16( static pixel zero[8*FENC_STRIDE] ) = {0};
            ALIGNED_ARRAY_16( int, enc_dc,[4] );
            int sad_size = i_pixel <= PIXEL_8x8 ? PIXEL_8x8 : PIXEL_4x4;
            int delta = x264_pixel_size[sad_size].w;
            int16_t *xs = h->scratch_buffer;
            int xn;
            uint16_t *cost_fpel_mvx = h->cost_mv_fpel[h->mb.i_qp][-m->mvp[0]&3] + (-m->mvp[0]>>2);

            h->pixf.sad_x4[sad_size]( zero, p_fenc, p_fenc+delta,
                p_fenc+delta*FENC_STRIDE, p_fenc+delta+delta*FENC_STRIDE,
                FENC_STRIDE, enc_dc );
            if( delta == 4 )
                sums_base += stride * (h->fenc->i_lines[0] + PADV*2);
            if( i_pixel == PIXEL_16x16 || i_pixel == PIXEL_8x16 || i_pixel == PIXEL_4x8 )
                delta *= stride;
            if( i_pixel == PIXEL_8x16 || i_pixel == PIXEL_4x8 )
                enc_dc[1] = enc_dc[2];

            if( h->mb.i_me_method == X264_ME_TESA )
            {
                // ADS threshold, then SAD threshold, then keep the best few SADs, then SATD
                mvsad_t *mvsads = (mvsad_t *)(xs + ((width+15)&~15) + 4);
                int nmvsad = 0, limit;
                int sad_thresh = i_me_range <= 16 ? 10 : i_me_range <= 24 ? 11 : 12;
                int bsad = h->pixf.sad[i_pixel]( p_fenc, FENC_STRIDE, p_fref_w+bmy*stride+bmx, stride )
                         + BITS_MVD( bmx, bmy );
                for( int my = min_y; my <= max_y; my++ )
                {
                    int i;
                    int ycost = p_cost_mvy[my<<2];
                    if( bsad <= ycost )
                        continue;
                    bsad -= ycost;
                    xn = h->pixf.ads[i_pixel]( enc_dc, sums_base + min_x + my * stride, delta,
                                               cost_fpel_mvx+min_x, xs, width, bsad * 17 >> 4 );
                    for( i = 0; i < xn-2; i += 3 )
                    {
                        pixel *ref = p_fref_w+min_x+my*stride;
                        int sads[3];
                        h->pixf.sad_x3[i_pixel]( p_fenc, ref+xs[i], ref+xs[i+1], ref+xs[i+2], stride, sads );
                        for( int j = 0; j < 3; j++ )
                        {
                            int sad = sads[j] + cost_fpel_mvx[xs[i+j]];
                            if( sad < bsad*sad_thresh>>3 )
                            {
                                COPY1_IF_LT( bsad, sad );
                                mvsads[nmvsad].sad = sad + ycost;
                                mvsads[nmvsad].mv[0] = min_x+xs[i+j];
                                mvsads[nmvsad].mv[1] = my;
                                nmvsad++;
                            }
                        }
                    }
                    for( ; i < xn; i++ )
                    {
                        int mx = min_x+xs[i];
                        int sad = h->pixf.sad[i_pixel]( p_fenc, FENC_STRIDE, p_fref_w+mx+my*stride, stride )
                                + cost_fpel_mvx[xs[i]];
                        if( sad < bsad*sad_thresh>>3 )
                        {
                            COPY1_IF_LT( bsad, sad );
                            mvsads[nmvsad].sad = sad + ycost;
                            mvsads[nmvsad].mv[0] = mx;
                            mvsads[nmvsad].mv[1] = my;
                            nmvsad++;
                        }
                    }
                    bsad += ycost;
                }

                limit = i_me_range >> 1;
                sad_thresh = bsad*sad_thresh>>3;
                while( nmvsad > limit*2 && sad_thresh > bsad )
                {
                    int i;
                    // halve the range if the domain is too large... eh, close enough
                    sad_thresh = (sad_thresh + bsad) >> 1;
                    for( i = 0; i < nmvsad && mvsads[i].sad <= sad_thresh; i++ );
                    for( int j = i; j < nmvsad; j++ )
                    {
                        uint32_t sad;
                        if( WORD_SIZE == 8 && sizeof(mvsad_t) == 8 )
                        {
                            uint64_t mvsad = M64( &mvsads[i] ) = M64( &mvsads[j] );
#if WORDS_BIGENDIAN
                            mvsad >>= 32;
#endif
                            sad = mvsad;
                        }
                        else
                        {
                            sad = mvsads[j].sad;
                            CP32( mvsads[i].mv, mvsads[j].mv );
                            mvsads[i].sad = sad;
                        }
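                        /* branchless compaction: (sad - (sad_thresh+1)) >> 31
                         * is 1 only when sad <= sad_thresh, so the write index
                         * advances past survivors and losers get overwritten */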
                        i += (sad - (sad_thresh+1)) >> 31;
                    }
                    nmvsad = i;
                }
                while( nmvsad > limit )
                {
                    int bi = 0;
                    for( int i = 1; i < nmvsad; i++ )
                        if( mvsads[i].sad > mvsads[bi].sad )
                            bi = i;
                    nmvsad--;
                    if( sizeof( mvsad_t ) == sizeof( uint64_t ) )
                        CP64( &mvsads[bi], &mvsads[nmvsad] );
                    else
                        mvsads[bi] = mvsads[nmvsad];
                }
                for( int i = 0; i < nmvsad; i++ )
                    COST_MV( mvsads[i].mv[0], mvsads[i].mv[1] );
            }
            else
            {
                // just ADS and SAD
                for( int my = min_y; my <= max_y; my++ )
                {
                    int i;
                    int ycost = p_cost_mvy[my<<2];
                    if( bcost <= ycost )
                        continue;
                    bcost -= ycost;
                    xn = h->pixf.ads[i_pixel]( enc_dc, sums_base + min_x + my * stride, delta,
                                               cost_fpel_mvx+min_x, xs, width, bcost );
                    for( i = 0; i < xn-2; i += 3 )
                        COST_MV_X3_ABS( min_x+xs[i],my, min_x+xs[i+1],my, min_x+xs[i+2],my );
                    bcost += ycost;
                    for( ; i < xn; i++ )
                        COST_MV( min_x+xs[i], my );
                }
            }
#endif
        }
        break;
    }

    /* -> qpel mv */
    if( bpred_cost < bcost )
    {
        m->mv[0] = bpred_mx;
        m->mv[1] = bpred_my;
        m->cost = bpred_cost;
    }
    else
    {
        m->mv[0] = bmx << 2;
        m->mv[1] = bmy << 2;
        m->cost = bcost;
    }

    /* compute the real cost */
    m->cost_mv = p_cost_mvx[ m->mv[0] ] + p_cost_mvy[ m->mv[1] ];
    if( bmx == pmx && bmy == pmy && h->mb.i_subpel_refine < 3 )
        m->cost += m->cost_mv;

    /* subpel refine */
    if( h->mb.i_subpel_refine >= 2 )
    {
        int hpel = subpel_iterations[h->mb.i_subpel_refine][2];
        int qpel = subpel_iterations[h->mb.i_subpel_refine][3];
        refine_subpel( h, m, hpel, qpel, p_halfpel_thresh, 0 );
    }
}
#undef COST_MV

void x264_me_refine_qpel( x264_t *h, x264_me_t *m )
{
    int hpel = subpel_iterations[h->mb.i_subpel_refine][0];
    int qpel = subpel_iterations[h->mb.i_subpel_refine][1];

    if( m->i_pixel <= PIXEL_8x8 )
        m->cost -= m->i_ref_cost;

    refine_subpel( h, m, hpel, qpel, NULL, 1 );
}

void x264_me_refine_qpel_refdupe( x264_t *h, x264_me_t *m, int *p_halfpel_thresh )
{
    refine_subpel( h, m, 0, X264_MIN( 2, subpel_iterations[h->mb.i_subpel_refine][3] ), p_halfpel_thresh, 0 );
}

#define COST_MV_SAD( mx, my ) \
{ \
    int stride = 16; \
    pixel *src = h->mc.get_ref( pix, &stride, m->p_fref, m->i_stride[0], mx, my, bw, bh, &m->weight[0] ); \
    int cost = h->pixf.fpelcmp[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
             + p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \
    COPY3_IF_LT( bcost, cost, bmx, mx, bmy, my ); \
}

#define COST_MV_SATD( mx, my, dir ) \
if( b_refine_qpel || (dir^1) != odir ) \
{ \
    int stride = 16; \
    pixel *src = h->mc.get_ref( pix, &stride, &m->p_fref[0], m->i_stride[0], mx, my, bw, bh, &m->weight[0] ); \
    int cost = h->pixf.mbcmp_unaligned[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
             + p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \
    if( b_chroma_me && cost < bcost ) \
    { \
        if( CHROMA444 ) \
        { \
            stride = 16; \
            src = h->mc.get_ref( pix, &stride, &m->p_fref[4], m->i_stride[1], mx, my, bw, bh, &m->weight[1] ); \
            cost += h->pixf.mbcmp_unaligned[i_pixel]( m->p_fenc[1], FENC_STRIDE, src, stride ); \
            if( cost < bcost ) \
            { \
                stride = 16; \
                src = h->mc.get_ref( pix, &stride, &m->p_fref[8], m->i_stride[2], mx, my, bw, bh, &m->weight[2] ); \
                cost += h->pixf.mbcmp_unaligned[i_pixel]( m->p_fenc[2], FENC_STRIDE, src, stride ); \
            } \
        } \
        else \
        { \
            h->mc.mc_chroma( pix, pix+8, 16, m->p_fref[4], m->i_stride[1], \
                             mx, 2*(my+mvy_offset)>>chroma_v_shift, bw>>1, bh>>chroma_v_shift ); \
            if( m->weight[1].weightfn ) \
                m->weight[1].weightfn[bw>>3]( pix, 16, pix, 16, &m->weight[1], bh>>chroma_v_shift ); \
            cost += h->pixf.mbcmp[chromapix]( m->p_fenc[1], FENC_STRIDE, pix, 16 ); \
            if( cost < bcost ) \
            { \
                if( m->weight[2].weightfn ) \
                    m->weight[2].weightfn[bw>>3]( pix+8, 16, pix+8, 16, &m->weight[2], bh>>chroma_v_shift ); \
                cost += h->pixf.mbcmp[chromapix]( m->p_fenc[2], FENC_STRIDE, pix+8, 16 ); \
            } \
        } \
    } \
    COPY4_IF_LT( bcost, cost, bmx, mx, bmy, my, bdir, dir ); \
}

static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_iters, int *p_halfpel_thresh, int b_refine_qpel )
{
    const int bw = x264_pixel_size[m->i_pixel].w;
    const int bh = x264_pixel_size[m->i_pixel].h;
    const uint16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0];
    const uint16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];
    const int i_pixel = m->i_pixel;
    const int b_chroma_me = h->mb.b_chroma_me && (i_pixel <= PIXEL_8x8 || CHROMA444);
    int chromapix = h->luma2chroma_pixel[i_pixel];
    int chroma_v_shift = CHROMA_V_SHIFT;
    int mvy_offset = chroma_v_shift & MB_INTERLACED & m->i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;

    ALIGNED_ARRAY_16( pixel, pix,[64*18] ); // really 17x17x2, but round up for alignment

    int bmx = m->mv[0];
    int bmy = m->mv[1];
    int bcost = m->cost;
    int odir = -1, bdir;

    /* try the subpel component of the predicted mv */
    if( hpel_iters && h->mb.i_subpel_refine < 3 )
    {
        int mx = x264_clip3( m->mvp[0], h->mb.mv_min_spel[0]+2, h->mb.mv_max_spel[0]-2 );
        int my = x264_clip3( m->mvp[1], h->mb.mv_min_spel[1]+2, h->mb.mv_max_spel[1]-2 );
        if( (mx-bmx)|(my-bmy) )
            COST_MV_SAD( mx, my );
    }

    /* halfpel diamond search */
    for( int i = hpel_iters; i > 0; i-- )
    {
        int omx = bmx, omy = bmy;
        int costs[4];
        int stride = 64; // candidates are either all hpel or all qpel, so one stride is enough
        pixel *src0, *src1, *src2, *src3;
        src0 = h->mc.get_ref( pix,    &stride, m->p_fref, m->i_stride[0], omx, omy-2, bw, bh+1, &m->weight[0] );
        src2 = h->mc.get_ref( pix+32, &stride, m->p_fref, m->i_stride[0], omx-2, omy, bw+4, bh, &m->weight[0] );
        src1 = src0 + stride;
        src3 = src2 + 1;
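        /* one call covers both candidates per axis: the fetch is one row
         * taller (bh+1) or a few columns wider (bw+4), so src1/src3 are just
         * src0/src2 offset by one full pel, i.e. the opposite hpel position */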
        h->pixf.fpelcmp_x4[i_pixel]( m->p_fenc[0], src0, src1, src2, src3, stride, costs );
        COPY2_IF_LT( bcost, costs[0] + p_cost_mvx[omx  ] + p_cost_mvy[omy-2], bmy, omy-2 );
        COPY2_IF_LT( bcost, costs[1] + p_cost_mvx[omx  ] + p_cost_mvy[omy+2], bmy, omy+2 );
        COPY3_IF_LT( bcost, costs[2] + p_cost_mvx[omx-2] + p_cost_mvy[omy  ], bmx, omx-2, bmy, omy );
        COPY3_IF_LT( bcost, costs[3] + p_cost_mvx[omx+2] + p_cost_mvy[omy  ], bmx, omx+2, bmy, omy );
        if( (bmx == omx) & (bmy == omy) )
            break;
    }

    if( !b_refine_qpel && (h->pixf.mbcmp_unaligned[0] != h->pixf.fpelcmp[0] || b_chroma_me) )
    {
        bcost = COST_MAX;
        COST_MV_SATD( bmx, bmy, -1 );
    }

    /* early termination when examining multiple reference frames */
    if( p_halfpel_thresh )
    {
        if( (bcost*7)>>3 > *p_halfpel_thresh )
        {
            m->cost = bcost;
            m->mv[0] = bmx;
            m->mv[1] = bmy;
            // don't need cost_mv
            return;
        }
        else if( bcost < *p_halfpel_thresh )
            *p_halfpel_thresh = bcost;
    }

    /* quarterpel diamond search */
    if( h->mb.i_subpel_refine != 1 )
    {
        bdir = -1;
        for( int i = qpel_iters; i > 0; i-- )
        {
            if( bmy <= h->mb.mv_min_spel[1] || bmy >= h->mb.mv_max_spel[1] || bmx <= h->mb.mv_min_spel[0] || bmx >= h->mb.mv_max_spel[0] )
                break;
            odir = bdir;
            int omx = bmx, omy = bmy;
            COST_MV_SATD( omx, omy - 1, 0 );
            COST_MV_SATD( omx, omy + 1, 1 );
            COST_MV_SATD( omx - 1, omy, 2 );
            COST_MV_SATD( omx + 1, omy, 3 );
            if( (bmx == omx) & (bmy == omy) )
                break;
        }
    }
    /* Special simplified case for subme=1 */
    else if( bmy > h->mb.mv_min_spel[1] && bmy < h->mb.mv_max_spel[1] && bmx > h->mb.mv_min_spel[0] && bmx < h->mb.mv_max_spel[0] )
    {
        int costs[4];
        int omx = bmx, omy = bmy;
        /* We have to use mc_luma because all strides must be the same to use fpelcmp_x4 */
        h->mc.mc_luma( pix   , 64, m->p_fref, m->i_stride[0], omx, omy-1, bw, bh, &m->weight[0] );
        h->mc.mc_luma( pix+16, 64, m->p_fref, m->i_stride[0], omx, omy+1, bw, bh, &m->weight[0] );
        h->mc.mc_luma( pix+32, 64, m->p_fref, m->i_stride[0], omx-1, omy, bw, bh, &m->weight[0] );
        h->mc.mc_luma( pix+48, 64, m->p_fref, m->i_stride[0], omx+1, omy, bw, bh, &m->weight[0] );
        h->pixf.fpelcmp_x4[i_pixel]( m->p_fenc[0], pix, pix+16, pix+32, pix+48, 64, costs );
        COPY2_IF_LT( bcost, costs[0] + p_cost_mvx[omx  ] + p_cost_mvy[omy-1], bmy, omy-1 );
        COPY2_IF_LT( bcost, costs[1] + p_cost_mvx[omx  ] + p_cost_mvy[omy+1], bmy, omy+1 );
        COPY3_IF_LT( bcost, costs[2] + p_cost_mvx[omx-1] + p_cost_mvy[omy  ], bmx, omx-1, bmy, omy );
        COPY3_IF_LT( bcost, costs[3] + p_cost_mvx[omx+1] + p_cost_mvy[omy  ], bmx, omx+1, bmy, omy );
    }

    m->cost = bcost;
    m->mv[0] = bmx;
    m->mv[1] = bmy;
    m->cost_mv = p_cost_mvx[bmx] + p_cost_mvy[bmy];
}

#define BIME_CACHE( dx, dy, list )\
{\
    x264_me_t *m = m##list;\
    int i = 4 + 3*dx + dy;\
    int mvx = bm##list##x+dx;\
    int mvy = bm##list##y+dy;\
    stride[0][list][i] = bw;\
    src[0][list][i] = h->mc.get_ref( pixy_buf[list][i], &stride[0][list][i], &m->p_fref[0],\
                                     m->i_stride[0], mvx, mvy, bw, bh, x264_weight_none );\
    if( rd )\
    {\
        if( CHROMA444 )\
        {\
            stride[1][list][i] = bw;\
            src[1][list][i] = h->mc.get_ref( pixu_buf[list][i], &stride[1][list][i], &m->p_fref[4],\
                                             m->i_stride[1], mvx, mvy, bw, bh, x264_weight_none );\
            stride[2][list][i] = bw;\
            src[2][list][i] = h->mc.get_ref( pixv_buf[list][i], &stride[2][list][i], &m->p_fref[8],\
                                             m->i_stride[2], mvx, mvy, bw, bh, x264_weight_none );\
        }\
        else\
            h->mc.mc_chroma( pixu_buf[list][i], pixv_buf[list][i], 8, m->p_fref[4], m->i_stride[1],\
                             mvx, 2*(mvy+mv##list##y_offset)>>chroma_v_shift, bw>>1, bh>>chroma_v_shift );\
    }\
}

#define SATD_THRESH(cost) (cost+(cost>>4))

/* Don't unroll the BIME_CACHE loop. I couldn't find any way to force this
 * other than making its iteration count not a compile-time constant. */
int x264_iter_kludge = 0;

static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight, int i8, int i_lambda2, int rd )
{
    int x = i8&1;
    int y = i8>>1;
    int s8 = X264_SCAN8_0 + 2*x + 16*y;
    int16_t *cache0_mv = h->mb.cache.mv[0][s8];
    int16_t *cache1_mv = h->mb.cache.mv[1][s8];
    const int i_pixel = m0->i_pixel;
    const int bw = x264_pixel_size[i_pixel].w;
    const int bh = x264_pixel_size[i_pixel].h;
    ALIGNED_ARRAY_16( pixel, pixy_buf,[2],[9][16*16] );
    ALIGNED_ARRAY_16( pixel, pixu_buf,[2],[9][16*16] );
    ALIGNED_ARRAY_16( pixel, pixv_buf,[2],[9][16*16] );
    pixel *src[3][2][9];
    int chromapix = h->luma2chroma_pixel[i_pixel];
    int chroma_v_shift = CHROMA_V_SHIFT;
    int chroma_x = (8 >> CHROMA_H_SHIFT) * x;
    int chroma_y = (8 >> chroma_v_shift) * y;
    pixel *pix  = &h->mb.pic.p_fdec[0][8*x + 8*y*FDEC_STRIDE];
    pixel *pixu = &h->mb.pic.p_fdec[1][chroma_x + chroma_y*FDEC_STRIDE];
    pixel *pixv = &h->mb.pic.p_fdec[2][chroma_x + chroma_y*FDEC_STRIDE];
    int ref0 = h->mb.cache.ref[0][s8];
    int ref1 = h->mb.cache.ref[1][s8];
    const int mv0y_offset = chroma_v_shift & MB_INTERLACED & ref0 ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
    const int mv1y_offset = chroma_v_shift & MB_INTERLACED & ref1 ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
    int stride[3][2][9];
    int bm0x = m0->mv[0];
    int bm0y = m0->mv[1];
    int bm1x = m1->mv[0];
    int bm1y = m1->mv[1];
    int bcost = COST_MAX;
    int mc_list0 = 1, mc_list1 = 1;
    uint64_t bcostrd = COST_MAX64;
    uint16_t amvd;
    /* each byte of visited represents 8 possible m1y positions, so a 4D array isn't needed */
    ALIGNED_ARRAY_16( uint8_t, visited,[8],[8][8] );
    /* all permutations of an offset in up to 2 of the dimensions */
    ALIGNED_4( static const int8_t dia4d[33][4] ) =
    {
        {0,0,0,0},
        {0,0,0,1}, {0,0,0,-1}, {0,0,1,0}, {0,0,-1,0},
        {0,1,0,0}, {0,-1,0,0}, {1,0,0,0}, {-1,0,0,0},
        {0,0,1,1}, {0,0,-1,-1},{0,1,1,0}, {0,-1,-1,0},
        {1,1,0,0}, {-1,-1,0,0},{1,0,0,1}, {-1,0,0,-1},
        {0,1,0,1}, {0,-1,0,-1},{1,0,1,0}, {-1,0,-1,0},
        {0,0,-1,1},{0,0,1,-1}, {0,-1,1,0},{0,1,-1,0},
        {-1,1,0,0},{1,-1,0,0}, {1,0,0,-1},{-1,0,0,1},
        {0,-1,0,1},{0,1,0,-1}, {-1,0,1,0},{1,0,-1,0},
    };
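
    /* Note: the 33 entries above are the zero vector, the 8 offsets changing
     * exactly one of (m0x,m0y,m1x,m1y) by +-1, and the 24 offsets changing
     * exactly two of them (C(4,2) = 6 dimension pairs x 4 sign combinations),
     * i.e. "all permutations of an offset in up to 2 of the dimensions". */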

    if( bm0y < h->mb.mv_min_spel[1] + 8 || bm1y < h->mb.mv_min_spel[1] + 8 ||
        bm0y > h->mb.mv_max_spel[1] - 8 || bm1y > h->mb.mv_max_spel[1] - 8 ||
        bm0x < h->mb.mv_min_spel[0] + 8 || bm1x < h->mb.mv_min_spel[0] + 8 ||
        bm0x > h->mb.mv_max_spel[0] - 8 || bm1x > h->mb.mv_max_spel[0] - 8 )
        return;

    if( rd && m0->i_pixel != PIXEL_16x16 && i8 != 0 )
    {
        x264_mb_predict_mv( h, 0, i8<<2, bw>>2, m0->mvp );
        x264_mb_predict_mv( h, 1, i8<<2, bw>>2, m1->mvp );
    }

    const uint16_t *p_cost_m0x = m0->p_cost_mv - m0->mvp[0];
    const uint16_t *p_cost_m0y = m0->p_cost_mv - m0->mvp[1];
    const uint16_t *p_cost_m1x = m1->p_cost_mv - m1->mvp[0];
    const uint16_t *p_cost_m1y = m1->p_cost_mv - m1->mvp[1];

    h->mc.memzero_aligned( visited, sizeof(uint8_t[8][8][8]) );

    for( int pass = 0; pass < 8; pass++ )
    {
        int bestj = 0;
        /* Check all mv pairs that differ in at most 2 components from the current mvs. */
        /* Doesn't do chroma ME. This probably doesn't matter, as the gains
         * from bidir ME are the same with and without chroma ME. */

        if( mc_list0 )
            for( int j = x264_iter_kludge; j < 9; j++ )
                BIME_CACHE( square1[j][0], square1[j][1], 0 );

        if( mc_list1 )
            for( int j = x264_iter_kludge; j < 9; j++ )
                BIME_CACHE( square1[j][0], square1[j][1], 1 );

        for( int j = !!pass; j < 33; j++ )
        {
            int m0x = dia4d[j][0] + bm0x;
            int m0y = dia4d[j][1] + bm0y;
            int m1x = dia4d[j][2] + bm1x;
            int m1y = dia4d[j][3] + bm1y;
            if( !pass || !((visited[(m0x)&7][(m0y)&7][(m1x)&7] & (1<<((m1y)&7)))) )
            {
                int i0 = 4 + 3*dia4d[j][0] + dia4d[j][1];
                int i1 = 4 + 3*dia4d[j][2] + dia4d[j][3];
                visited[(m0x)&7][(m0y)&7][(m1x)&7] |= (1<<((m1y)&7));
                h->mc.avg[i_pixel]( pix, FDEC_STRIDE, src[0][0][i0], stride[0][0][i0], src[0][1][i1], stride[0][1][i1], i_weight );
                int cost = h->pixf.mbcmp[i_pixel]( m0->p_fenc[0], FENC_STRIDE, pix, FDEC_STRIDE )
                         + p_cost_m0x[m0x] + p_cost_m0y[m0y] + p_cost_m1x[m1x] + p_cost_m1y[m1y];
                if( rd )
                {
                    if( cost < SATD_THRESH(bcost) )
                    {
                        bcost = X264_MIN( cost, bcost );
                        M32( cache0_mv ) = pack16to32_mask(m0x,m0y);
                        M32( cache1_mv ) = pack16to32_mask(m1x,m1y);
                        if( CHROMA444 )
                        {
                            h->mc.avg[i_pixel]( pixu, FDEC_STRIDE, src[1][0][i0], stride[1][0][i0], src[1][1][i1], stride[1][1][i1], i_weight );
                            h->mc.avg[i_pixel]( pixv, FDEC_STRIDE, src[2][0][i0], stride[2][0][i0], src[2][1][i1], stride[2][1][i1], i_weight );
                        }
                        else
                        {
                            h->mc.avg[chromapix]( pixu, FDEC_STRIDE, pixu_buf[0][i0], 8, pixu_buf[1][i1], 8, i_weight );
                            h->mc.avg[chromapix]( pixv, FDEC_STRIDE, pixv_buf[0][i0], 8, pixv_buf[1][i1], 8, i_weight );
                        }
                        uint64_t costrd = x264_rd_cost_part( h, i_lambda2, i8*4, m0->i_pixel );
                        COPY2_IF_LT( bcostrd, costrd, bestj, j );
                    }
                }
                else
                    COPY2_IF_LT( bcost, cost, bestj, j );
            }
        }

        if( !bestj )
            break;

        bm0x += dia4d[bestj][0];
        bm0y += dia4d[bestj][1];
        bm1x += dia4d[bestj][2];
        bm1y += dia4d[bestj][3];

        mc_list0 = M16( &dia4d[bestj][0] );
        mc_list1 = M16( &dia4d[bestj][2] );
    }

    if( rd )
    {
        x264_macroblock_cache_mv ( h, 2*x, 2*y, bw>>2, bh>>2, 0, pack16to32_mask(bm0x, bm0y) );
        amvd = pack8to16( X264_MIN(abs(bm0x - m0->mvp[0]),33), X264_MIN(abs(bm0y - m0->mvp[1]),33) );
        x264_macroblock_cache_mvd( h, 2*x, 2*y, bw>>2, bh>>2, 0, amvd );

        x264_macroblock_cache_mv ( h, 2*x, 2*y, bw>>2, bh>>2, 1, pack16to32_mask(bm1x, bm1y) );
        amvd = pack8to16( X264_MIN(abs(bm1x - m1->mvp[0]),33), X264_MIN(abs(bm1y - m1->mvp[1]),33) );
        x264_macroblock_cache_mvd( h, 2*x, 2*y, bw>>2, bh>>2, 1, amvd );
    }

    m0->mv[0] = bm0x;
    m0->mv[1] = bm0y;
    m1->mv[0] = bm1x;
    m1->mv[1] = bm1y;
}

void x264_me_refine_bidir_satd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight )
{
    x264_me_refine_bidir( h, m0, m1, i_weight, 0, 0, 0 );
}

void x264_me_refine_bidir_rd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight, int i8, int i_lambda2 )
{
    /* Motion compensation is done as part of bidir_rd; don't repeat
     * it in encoding. */
    h->mb.b_skip_mc = 1;
    x264_me_refine_bidir( h, m0, m1, i_weight, i8, i_lambda2, 1 );
    h->mb.b_skip_mc = 0;
}

#undef COST_MV_SATD
#define COST_MV_SATD( mx, my, dst, avoid_mvp ) \
{ \
    if( !avoid_mvp || !(mx == pmx && my == pmy) ) \
    { \
        h->mc.mc_luma( pix, FDEC_STRIDE, m->p_fref, m->i_stride[0], mx, my, bw, bh, &m->weight[0] ); \
        dst = h->pixf.mbcmp[i_pixel]( m->p_fenc[0], FENC_STRIDE, pix, FDEC_STRIDE ) \
            + p_cost_mvx[mx] + p_cost_mvy[my]; \
        COPY1_IF_LT( bsatd, dst ); \
    } \
    else \
        dst = COST_MAX; \
}

#define COST_MV_RD( mx, my, satd, do_dir, mdir ) \
{ \
    if( satd <= SATD_THRESH(bsatd) ) \
    { \
        uint64_t cost; \
        M32( cache_mv ) = pack16to32_mask(mx,my); \
        if( CHROMA444 ) \
        { \
            h->mc.mc_luma( pixu, FDEC_STRIDE, &m->p_fref[4], m->i_stride[1], mx, my, bw, bh, &m->weight[1] ); \
            h->mc.mc_luma( pixv, FDEC_STRIDE, &m->p_fref[8], m->i_stride[2], mx, my, bw, bh, &m->weight[2] ); \
        } \
        else if( m->i_pixel <= PIXEL_8x8 ) \
        { \
            h->mc.mc_chroma( pixu, pixv, FDEC_STRIDE, m->p_fref[4], m->i_stride[1], \
                             mx, 2*(my+mvy_offset)>>chroma_v_shift, bw>>1, bh>>chroma_v_shift ); \
            if( m->weight[1].weightfn ) \
                m->weight[1].weightfn[bw>>3]( pixu, FDEC_STRIDE, pixu, FDEC_STRIDE, &m->weight[1], bh>>chroma_v_shift ); \
            if( m->weight[2].weightfn ) \
                m->weight[2].weightfn[bw>>3]( pixv, FDEC_STRIDE, pixv, FDEC_STRIDE, &m->weight[2], bh>>chroma_v_shift ); \
        } \
        cost = x264_rd_cost_part( h, i_lambda2, i4, m->i_pixel ); \
        COPY4_IF_LT( bcost, cost, bmx, mx, bmy, my, dir, do_dir?mdir:dir ); \
    } \
}
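
/* Note: the two macros above form a two-stage filter: COST_MV_SATD computes
 * the cheap SATD + mv-bits cost and tracks the minimum in bsatd, while
 * COST_MV_RD runs the expensive x264_rd_cost_part() only for candidates whose
 * SATD cost lies within SATD_THRESH() of bsatd. */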

void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i4, int i_list )
{
    int16_t *cache_mv = h->mb.cache.mv[i_list][x264_scan8[i4]];
    const uint16_t *p_cost_mvx, *p_cost_mvy;
    const int bw = x264_pixel_size[m->i_pixel].w;
    const int bh = x264_pixel_size[m->i_pixel].h;
    const int i_pixel = m->i_pixel;
    int chroma_v_shift = CHROMA_V_SHIFT;
    int mvy_offset = chroma_v_shift & MB_INTERLACED & m->i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;

    uint64_t bcost = COST_MAX64;
    int bmx = m->mv[0];
    int bmy = m->mv[1];
    int omx, omy, pmx, pmy;
    int satd, bsatd;
    int dir = -2;
    int i8 = i4>>2;
    uint16_t amvd;

    pixel *pix  = &h->mb.pic.p_fdec[0][block_idx_xy_fdec[i4]];
    pixel *pixu, *pixv;
    if( CHROMA444 )
    {
        pixu = &h->mb.pic.p_fdec[1][block_idx_xy_fdec[i4]];
        pixv = &h->mb.pic.p_fdec[2][block_idx_xy_fdec[i4]];
    }
    else
    {
        pixu = &h->mb.pic.p_fdec[1][(i8>>1)*(8*FDEC_STRIDE>>chroma_v_shift)+(i8&1)*4];
        pixv = &h->mb.pic.p_fdec[2][(i8>>1)*(8*FDEC_STRIDE>>chroma_v_shift)+(i8&1)*4];
    }

    h->mb.b_skip_mc = 1;

    if( m->i_pixel != PIXEL_16x16 && i4 != 0 )
        x264_mb_predict_mv( h, i_list, i4, bw>>2, m->mvp );
    pmx = m->mvp[0];
    pmy = m->mvp[1];
    p_cost_mvx = m->p_cost_mv - pmx;
    p_cost_mvy = m->p_cost_mv - pmy;
    COST_MV_SATD( bmx, bmy, bsatd, 0 );
    if( m->i_pixel != PIXEL_16x16 )
        COST_MV_RD( bmx, bmy, 0, 0, 0 )
    else
        bcost = m->cost;

    /* check the predicted mv */
    if( (bmx != pmx || bmy != pmy)
        && pmx >= h->mb.mv_min_spel[0] && pmx <= h->mb.mv_max_spel[0]
        && pmy >= h->mb.mv_min_spel[1] && pmy <= h->mb.mv_max_spel[1] )
    {
        COST_MV_SATD( pmx, pmy, satd, 0 );
        COST_MV_RD  ( pmx, pmy, satd, 0, 0 );
        /* The hex motion search is guaranteed to not repeat the center candidate,
         * so if pmv is chosen, set the "MV to avoid checking" to bmv instead. */
        if( bmx == pmx && bmy == pmy )
        {
            pmx = m->mv[0];
            pmy = m->mv[1];
        }
    }

    if( bmy < h->mb.mv_min_spel[1] + 3 || bmy > h->mb.mv_max_spel[1] - 3 ||
        bmx < h->mb.mv_min_spel[0] + 3 || bmx > h->mb.mv_max_spel[0] - 3 )
    {
        h->mb.b_skip_mc = 0;
        return;
    }

    /* subpel hex search, same pattern as ME HEX. */
    dir = -2;
    omx = bmx;
    omy = bmy;
    for( int j = 0; j < 6; j++ )
    {
        COST_MV_SATD( omx + hex2[j+1][0], omy + hex2[j+1][1], satd, 1 );
        COST_MV_RD  ( omx + hex2[j+1][0], omy + hex2[j+1][1], satd, 1, j );
    }

    if( dir != -2 )
    {
        /* half hexagon, not overlapping the previous iteration */
        for( int i = 1; i < 10; i++ )
        {
            const int odir = mod6m1[dir+1];
            if( bmy < h->mb.mv_min_spel[1] + 3 ||
                bmy > h->mb.mv_max_spel[1] - 3 )
                break;
            dir = -2;
            omx = bmx;
            omy = bmy;
            for( int j = 0; j < 3; j++ )
            {
                COST_MV_SATD( omx + hex2[odir+j][0], omy + hex2[odir+j][1], satd, 1 );
                COST_MV_RD  ( omx + hex2[odir+j][0], omy + hex2[odir+j][1], satd, 1, odir-1+j );
            }
            if( dir == -2 )
                break;
        }
    }

    /* square refine, same pattern as ME HEX. */
    omx = bmx;
    omy = bmy;
    for( int i = 0; i < 8; i++ )
    {
        COST_MV_SATD( omx + square1[i+1][0], omy + square1[i+1][1], satd, 1 );
        COST_MV_RD  ( omx + square1[i+1][0], omy + square1[i+1][1], satd, 0, 0 );
    }

    m->cost = bcost;
    m->mv[0] = bmx;
    m->mv[1] = bmy;
    x264_macroblock_cache_mv ( h, block_idx_x[i4], block_idx_y[i4], bw>>2, bh>>2, i_list, pack16to32_mask(bmx, bmy) );
    amvd = pack8to16( X264_MIN(abs(bmx - m->mvp[0]),66), X264_MIN(abs(bmy - m->mvp[1]),66) );
    x264_macroblock_cache_mvd( h, block_idx_x[i4], block_idx_y[i4], bw>>2, bh>>2, i_list, amvd );
    h->mb.b_skip_mc = 0;
}
x264-snapshot-20120103-2245-stable/encoder/macroblock.h
/*****************************************************************************
 * macroblock.h: macroblock encoding
 *****************************************************************************
 * Copyright (C) 2003-2011 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *          Laurent Aimar <fenrir@via.ecp.fr>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#ifndef X264_ENCODER_MACROBLOCK_H
#define X264_ENCODER_MACROBLOCK_H

#include "common/macroblock.h"

extern const int x264_lambda2_tab[QP_MAX_MAX+1];
extern const uint16_t x264_lambda_tab[QP_MAX_MAX+1];

void x264_rdo_init( void );

int x264_macroblock_probe_skip( x264_t *h, int b_bidir );

#define x264_macroblock_probe_pskip( h )\
    x264_macroblock_probe_skip( h, 0 )
#define x264_macroblock_probe_bskip( h )\
    x264_macroblock_probe_skip( h, 1 )

void x264_predict_lossless_4x4( x264_t *h, pixel *p_dst, int p, int idx, int i_mode );
void x264_predict_lossless_8x8( x264_t *h, pixel *p_dst, int p, int idx, int i_mode, pixel edge[36] );
void x264_predict_lossless_16x16( x264_t *h, int p, int i_mode );
void x264_predict_lossless_chroma( x264_t *h, int i_mode );

void x264_macroblock_encode      ( x264_t *h );
void x264_macroblock_write_cabac ( x264_t *h, x264_cabac_t *cb );
void x264_macroblock_write_cavlc ( x264_t *h );

void x264_macroblock_encode_p8x8( x264_t *h, int i8 );
void x264_macroblock_encode_p4x4( x264_t *h, int i4 );
void x264_mb_encode_chroma( x264_t *h, int b_inter, int i_qp );

void x264_cabac_mb_skip( x264_t *h, int b_skip );

int x264_quant_luma_dc_trellis( x264_t *h, dctcoef *dct, int i_quant_cat, int i_qp,
                                int ctx_block_cat, int b_intra, int idx );
int x264_quant_chroma_dc_trellis( x264_t *h, dctcoef *dct, int i_qp, int b_intra, int idx );
int x264_quant_4x4_trellis( x264_t *h, dctcoef *dct, int i_quant_cat,
                             int i_qp, int ctx_block_cat, int b_intra, int b_chroma, int idx );
int x264_quant_8x8_trellis( x264_t *h, dctcoef *dct, int i_quant_cat,
                             int i_qp, int ctx_block_cat, int b_intra, int b_chroma, int idx );

void x264_noise_reduction_update( x264_t *h );

static ALWAYS_INLINE int x264_quant_4x4( x264_t *h, dctcoef dct[16], int i_qp, int ctx_block_cat, int b_intra, int p, int idx )
{
    int i_quant_cat = b_intra ? (p?CQM_4IC:CQM_4IY) : (p?CQM_4PC:CQM_4PY);
    if( h->mb.b_noise_reduction )
        h->quantf.denoise_dct( dct, h->nr_residual_sum[0+!!p*2], h->nr_offset[0+!!p*2], 16 );
    if( h->mb.b_trellis )
        return x264_quant_4x4_trellis( h, dct, i_quant_cat, i_qp, ctx_block_cat, b_intra, !!p, idx+p*16 );
    else
        return h->quantf.quant_4x4( dct, h->quant4_mf[i_quant_cat][i_qp], h->quant4_bias[i_quant_cat][i_qp] );
}

static ALWAYS_INLINE int x264_quant_8x8( x264_t *h, dctcoef dct[64], int i_qp, int ctx_block_cat, int b_intra, int p, int idx )
{
    int i_quant_cat = b_intra ? (p?CQM_8IC:CQM_8IY) : (p?CQM_8PC:CQM_8PY);
    if( h->mb.b_noise_reduction )
        h->quantf.denoise_dct( dct, h->nr_residual_sum[1+!!p*2], h->nr_offset[1+!!p*2], 64 );
    if( h->mb.b_trellis )
        return x264_quant_8x8_trellis( h, dct, i_quant_cat, i_qp, ctx_block_cat, b_intra, !!p, idx+p*4 );
    else
        return h->quantf.quant_8x8( dct, h->quant8_mf[i_quant_cat][i_qp], h->quant8_bias[i_quant_cat][i_qp] );
}

#define STORE_8x8_NNZ( p, idx, nz )\
do\
{\
    M16( &h->mb.cache.non_zero_count[x264_scan8[p*16+idx*4]+0] ) = (nz) * 0x0101;\
    M16( &h->mb.cache.non_zero_count[x264_scan8[p*16+idx*4]+8] ) = (nz) * 0x0101;\
} while(0)

#define CLEAR_16x16_NNZ( p ) \
do\
{\
    M32( &h->mb.cache.non_zero_count[x264_scan8[16*p+ 0]] ) = 0;\
    M32( &h->mb.cache.non_zero_count[x264_scan8[16*p+ 2]] ) = 0;\
    M32( &h->mb.cache.non_zero_count[x264_scan8[16*p+ 8]] ) = 0;\
    M32( &h->mb.cache.non_zero_count[x264_scan8[16*p+10]] ) = 0;\
} while(0)
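
/* Note: both macros above rely on the x264_scan8 layout keeping the NNZ bytes
 * of horizontally adjacent 4x4 blocks contiguous: STORE_8x8_NNZ fills an 8x8
 * block's four NNZ entries with two 16-bit stores (one per row, 8 bytes
 * apart), and CLEAR_16x16_NNZ zeroes all 16 luma entries of plane p with four
 * 32-bit stores. */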

static ALWAYS_INLINE void x264_mb_encode_i4x4( x264_t *h, int p, int idx, int i_qp, int i_mode, int b_predict )
{
    int nz;
    pixel *p_src = &h->mb.pic.p_fenc[p][block_idx_xy_fenc[idx]];
    pixel *p_dst = &h->mb.pic.p_fdec[p][block_idx_xy_fdec[idx]];
    ALIGNED_ARRAY_16( dctcoef, dct4x4,[16] );

    if( b_predict )
    {
        if( h->mb.b_lossless )
            x264_predict_lossless_4x4( h, p_dst, p, idx, i_mode );
        else
            h->predict_4x4[i_mode]( p_dst );
    }

    if( h->mb.b_lossless )
    {
        nz = h->zigzagf.sub_4x4( h->dct.luma4x4[p*16+idx], p_src, p_dst );
        h->mb.cache.non_zero_count[x264_scan8[p*16+idx]] = nz;
        h->mb.i_cbp_luma |= nz<<(idx>>2);
        return;
    }

    h->dctf.sub4x4_dct( dct4x4, p_src, p_dst );

    nz = x264_quant_4x4( h, dct4x4, i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 1, p, idx );
    h->mb.cache.non_zero_count[x264_scan8[p*16+idx]] = nz;
    if( nz )
    {
        h->mb.i_cbp_luma |= 1<<(idx>>2);
        h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+idx], dct4x4 );
        h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[p?CQM_4IC:CQM_4IY], i_qp );
        h->dctf.add4x4_idct( p_dst, dct4x4 );
    }
}

static ALWAYS_INLINE void x264_mb_encode_i8x8( x264_t *h, int p, int idx, int i_qp, int i_mode, pixel *edge, int b_predict )
{
    int x = idx&1;
    int y = idx>>1;
    int nz;
    pixel *p_src = &h->mb.pic.p_fenc[p][8*x + 8*y*FENC_STRIDE];
    pixel *p_dst = &h->mb.pic.p_fdec[p][8*x + 8*y*FDEC_STRIDE];
    ALIGNED_ARRAY_16( dctcoef, dct8x8,[64] );
    ALIGNED_ARRAY_32( pixel, edge_buf,[36] );

    if( b_predict )
    {
        if( !edge )
        {
            h->predict_8x8_filter( p_dst, edge_buf, h->mb.i_neighbour8[idx], x264_pred_i4x4_neighbors[i_mode] );
            edge = edge_buf;
        }

        if( h->mb.b_lossless )
            x264_predict_lossless_8x8( h, p_dst, p, idx, i_mode, edge );
        else
            h->predict_8x8[i_mode]( p_dst, edge );
    }

    if( h->mb.b_lossless )
    {
        nz = h->zigzagf.sub_8x8( h->dct.luma8x8[p*4+idx], p_src, p_dst );
        STORE_8x8_NNZ( p, idx, nz );
        h->mb.i_cbp_luma |= nz<<idx;
        return;
    }

    h->dctf.sub8x8_dct8( dct8x8, p_src, p_dst );

    nz = x264_quant_8x8( h, dct8x8, i_qp, ctx_cat_plane[DCT_LUMA_8x8][p], 1, p, idx );
    if( nz )
    {
        h->mb.i_cbp_luma |= 1<<idx;
        h->zigzagf.scan_8x8( h->dct.luma8x8[p*4+idx], dct8x8 );
        h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[p?CQM_8IC:CQM_8IY], i_qp );
        h->dctf.add8x8_idct8( p_dst, dct8x8 );
        STORE_8x8_NNZ( p, idx, 1 );
    }
    else
        STORE_8x8_NNZ( p, idx, 0 );
}

#endif

x264-snapshot-20120103-2245-stable/encoder/macroblock.c
/*****************************************************************************
 * macroblock.c: macroblock encoding
 *****************************************************************************
 * Copyright (C) 2003-2011 x264 project
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
 *          Jason Garrett-Glaser <darkshikari@gmail.com>
 *          Henrik Gramner <hengar-6@student.ltu.se>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "common/common.h"
#include "macroblock.h"

/* These chroma DC functions don't have assembly versions and are only used here. */

#define ZIG(i,y,x) level[i] = dct[x*2+y];
static inline void zigzag_scan_2x2_dc( dctcoef level[4], dctcoef dct[4] )
{
    ZIG(0,0,0)
    ZIG(1,0,1)
    ZIG(2,1,0)
    ZIG(3,1,1)
}
#undef ZIG

static inline void zigzag_scan_2x4_dc( dctcoef level[8], dctcoef dct[8] )
{
    level[0] = dct[0];
    level[1] = dct[2];
    level[2] = dct[1];
    level[3] = dct[4];
    level[4] = dct[6];
    level[5] = dct[3];
    level[6] = dct[5];
    level[7] = dct[7];
}
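
/* Note: as the assignments above spell out, the 2x4 DC scan visits dct[] in
 * the order 0,2,1,4,6,3,5,7; the 2x2 variant reads dct[x*2+y] via ZIG, i.e.
 * level[] = { dct[0], dct[2], dct[1], dct[3] }. */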

#define IDCT_DEQUANT_2X2_START \
    int d0 = dct[0] + dct[1]; \
    int d1 = dct[2] + dct[3]; \
    int d2 = dct[0] - dct[1]; \
    int d3 = dct[2] - dct[3]; \
    int dmf = dequant_mf[i_qp%6][0] << i_qp/6;

static inline void idct_dequant_2x2_dc( dctcoef dct[4], dctcoef dct4x4[4][16], int dequant_mf[6][16], int i_qp )
{
    IDCT_DEQUANT_2X2_START
    dct4x4[0][0] = (d0 + d1) * dmf >> 5;
    dct4x4[1][0] = (d0 - d1) * dmf >> 5;
    dct4x4[2][0] = (d2 + d3) * dmf >> 5;
    dct4x4[3][0] = (d2 - d3) * dmf >> 5;
}

static inline void idct_dequant_2x2_dconly( dctcoef dct[4], int dequant_mf[6][16], int i_qp )
{
    IDCT_DEQUANT_2X2_START
    dct[0] = (d0 + d1) * dmf >> 5;
    dct[1] = (d0 - d1) * dmf >> 5;
    dct[2] = (d2 + d3) * dmf >> 5;
    dct[3] = (d2 - d3) * dmf >> 5;
}
#undef IDCT_DEQUANT_2X2_START

static inline void dct2x2dc( dctcoef d[4], dctcoef dct4x4[4][16] )
{
    int d0 = dct4x4[0][0] + dct4x4[1][0];
    int d1 = dct4x4[2][0] + dct4x4[3][0];
    int d2 = dct4x4[0][0] - dct4x4[1][0];
    int d3 = dct4x4[2][0] - dct4x4[3][0];
    d[0] = d0 + d1;
    d[2] = d2 + d3;
    d[1] = d0 - d1;
    d[3] = d2 - d3;
    dct4x4[0][0] = 0;
    dct4x4[1][0] = 0;
    dct4x4[2][0] = 0;
    dct4x4[3][0] = 0;
}
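
/* Note: dct2x2dc() is a 2x2 Hadamard butterfly over the four 4x4 DC
 * coefficients, which are then zeroed in place. E.g. DCs (1,2,3,4) for blocks
 * 0..3 give d[] = { 10, -4, -2, 0 }: the sum, then the sign patterns
 * (+,+,-,-), (+,-,+,-) and (+,-,-,+). */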

static ALWAYS_INLINE int array_non_zero( dctcoef *v, int i_count )
{
    if( WORD_SIZE == 8 )
    {
        for( int i = 0; i < i_count; i += 8/sizeof(dctcoef) )
            if( M64( &v[i] ) )
                return 1;
    }
    else
    {
        for( int i = 0; i < i_count; i += 4/sizeof(dctcoef) )
            if( M32( &v[i] ) )
                return 1;
    }
    return 0;
}
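
/* Note: array_non_zero() tests whole machine words rather than individual
 * coefficients; with 16-bit dctcoef and WORD_SIZE == 8, each M64 load covers
 * four coefficients at once. The 4-, 8- and 16-coefficient DC arrays passed
 * in below are aligned and sized so that this never over-reads. */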

/* All encoding functions must output the correct CBP and NNZ values.
 * The entropy coding functions will check CBP first, then NNZ, before
 * actually reading the DCT coefficients.  NNZ still must be correct even
 * if CBP is zero because of the use of NNZ values for context selection.
 * "NNZ" need only be 0 or 1 rather than the exact coefficient count because
 * that is only needed in CAVLC, and will be calculated by CAVLC's residual
 * coding and stored as necessary. */

/* This means that decimation can be done merely by adjusting the CBP and NNZ
 * rather than memsetting the coefficients. */

static void x264_mb_encode_i16x16( x264_t *h, int p, int i_qp )
{
    pixel *p_src = h->mb.pic.p_fenc[p];
    pixel *p_dst = h->mb.pic.p_fdec[p];

    ALIGNED_ARRAY_16( dctcoef, dct4x4,[16],[16] );
    ALIGNED_ARRAY_16( dctcoef, dct_dc4x4,[16] );

    int nz, block_cbp = 0;
    int decimate_score = h->mb.b_dct_decimate ? 0 : 9;
    int i_quant_cat = p ? CQM_4IC : CQM_4IY;
    int i_mode = h->mb.i_intra16x16_pred_mode;

    if( h->mb.b_lossless )
        x264_predict_lossless_16x16( h, p, i_mode );
    else
        h->predict_16x16[i_mode]( h->mb.pic.p_fdec[p] );

    if( h->mb.b_lossless )
    {
        for( int i = 0; i < 16; i++ )
        {
            int oe = block_idx_xy_fenc[i];
            int od = block_idx_xy_fdec[i];
            nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16*p+i], p_src+oe, p_dst+od, &dct_dc4x4[block_idx_yx_1d[i]] );
            h->mb.cache.non_zero_count[x264_scan8[16*p+i]] = nz;
            block_cbp |= nz;
        }
        h->mb.i_cbp_luma |= block_cbp * 0xf;
        h->mb.cache.non_zero_count[x264_scan8[LUMA_DC+p]] = array_non_zero( dct_dc4x4, 16 );
        h->zigzagf.scan_4x4( h->dct.luma16x16_dc[p], dct_dc4x4 );
        return;
    }

    h->dctf.sub16x16_dct( dct4x4, p_src, p_dst );

    for( int i = 0; i < 16; i++ )
    {
        /* copy dc coeff */
        if( h->mb.b_noise_reduction )
            h->quantf.denoise_dct( dct4x4[i], h->nr_residual_sum[0], h->nr_offset[0], 16 );
        dct_dc4x4[block_idx_xy_1d[i]] = dct4x4[i][0];
        dct4x4[i][0] = 0;

        /* quant/scan/dequant */
        if( h->mb.b_trellis )
            nz = x264_quant_4x4_trellis( h, dct4x4[i], i_quant_cat, i_qp, ctx_cat_plane[DCT_LUMA_AC][p], 1, !!p, i );
        else
            nz = h->quantf.quant_4x4( dct4x4[i], h->quant4_mf[i_quant_cat][i_qp], h->quant4_bias[i_quant_cat][i_qp] );
        h->mb.cache.non_zero_count[x264_scan8[16*p+i]] = nz;
        if( nz )
        {
            h->zigzagf.scan_4x4( h->dct.luma4x4[16*p+i], dct4x4[i] );
            h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[i_quant_cat], i_qp );
            if( decimate_score < 6 ) decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16*p+i] );
            block_cbp = 0xf;
        }
    }

    /* Writing the 16 CBFs in an i16x16 block is quite costly, so decimation can save many bits. */
    /* More useful with CAVLC, but still useful with CABAC. */
    if( decimate_score < 6 )
    {
        CLEAR_16x16_NNZ( p );
        block_cbp = 0;
    }
    else
        h->mb.i_cbp_luma |= block_cbp;

    h->dctf.dct4x4dc( dct_dc4x4 );
    if( h->mb.b_trellis )
        nz = x264_quant_luma_dc_trellis( h, dct_dc4x4, i_quant_cat, i_qp, ctx_cat_plane[DCT_LUMA_DC][p], 1, LUMA_DC+p );
    else
        nz = h->quantf.quant_4x4_dc( dct_dc4x4, h->quant4_mf[i_quant_cat][i_qp][0]>>1, h->quant4_bias[i_quant_cat][i_qp][0]<<1 );

    h->mb.cache.non_zero_count[x264_scan8[LUMA_DC+p]] = nz;
    if( nz )
    {
        h->zigzagf.scan_4x4( h->dct.luma16x16_dc[p], dct_dc4x4 );

        /* output samples to fdec */
        h->dctf.idct4x4dc( dct_dc4x4 );
        h->quantf.dequant_4x4_dc( dct_dc4x4, h->dequant4_mf[i_quant_cat], i_qp );  /* XXX not inverted */
        if( block_cbp )
            for( int i = 0; i < 16; i++ )
                dct4x4[i][0] = dct_dc4x4[block_idx_xy_1d[i]];
    }

    /* put pixels to fdec */
    if( block_cbp )
        h->dctf.add16x16_idct( p_dst, dct4x4 );
    else if( nz )
        h->dctf.add16x16_idct_dc( p_dst, dct_dc4x4 );
}

/* Round down coefficients losslessly in DC-only chroma blocks.
 * Unlike luma blocks, this can't be done with a lookup table or
 * other shortcut technique because of the interdependencies
 * between the coefficients due to the chroma DC transform. */
static ALWAYS_INLINE int x264_mb_optimize_chroma_dc( x264_t *h, dctcoef *dct_dc, int dequant_mf[6][16], int i_qp, int chroma422 )
{
    int dmf = dequant_mf[i_qp%6][0] << i_qp/6;

    /* If the QP is too high, there's no benefit to rounding optimization. */
    if( dmf > 32*64 )
        return 1;

    if( chroma422 )
        return h->quantf.optimize_chroma_2x4_dc( dct_dc, dmf );
    else
        return h->quantf.optimize_chroma_2x2_dc( dct_dc, dmf );
}
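
/* Note: dmf is the DC dequant scale, which doubles every 6 QP steps (the
 * << i_qp/6 above); past the 32*64 cutoff the function returns 1 immediately,
 * keeping the block as-is rather than attempting to round coefficients down
 * to cheaper values. */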

static ALWAYS_INLINE void x264_mb_encode_chroma_internal( x264_t *h, int b_inter, int i_qp, int chroma422 )
{
    int nz, nz_dc;
    int b_decimate = b_inter && h->mb.b_dct_decimate;
    int (*dequant_mf)[16] = h->dequant4_mf[CQM_4IC + b_inter];
    ALIGNED_ARRAY_16( dctcoef, dct_dc,[8] );
    h->mb.i_cbp_chroma = 0;
    h->nr_count[2] += h->mb.b_noise_reduction * 4;

    /* Early termination: check variance of chroma residual before encoding.
     * Don't bother trying early termination at low QPs.
     * Values are experimentally derived. */
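    /* Note: the gate below is two-level: var2 scores of both chroma planes
     * are summed against thresh*4 to decide whether the full encode can be
     * skipped outright, and if so each plane's SSD is still compared against
     * thresh to decide whether a DC-only encode is worthwhile. */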
    if( b_decimate && i_qp >= (h->mb.b_trellis ? 12 : 18) && !h->mb.b_noise_reduction )
    {
        int thresh = chroma422 ? (x264_lambda2_tab[i_qp] + 16) >> 5 : (x264_lambda2_tab[i_qp] + 32) >> 6;
        int ssd[2];
        int chromapix = chroma422 ? PIXEL_8x16 : PIXEL_8x8;

        int score  = h->pixf.var2[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, &ssd[0] );
        if( score < thresh*4 )
            score += h->pixf.var2[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, &ssd[1] );
        if( score < thresh*4 )
        {
            M16( &h->mb.cache.non_zero_count[x264_scan8[16]] ) = 0;
            M16( &h->mb.cache.non_zero_count[x264_scan8[18]] ) = 0;
            M16( &h->mb.cache.non_zero_count[x264_scan8[32]] ) = 0;
            M16( &h->mb.cache.non_zero_count[x264_scan8[34]] ) = 0;
            if( chroma422 )
            {
                M16( &h->mb.cache.non_zero_count[x264_scan8[24]] ) = 0;
                M16( &h->mb.cache.non_zero_count[x264_scan8[26]] ) = 0;
                M16( &h->mb.cache.non_zero_count[x264_scan8[40]] ) = 0;
                M16( &h->mb.cache.non_zero_count[x264_scan8[42]] ) = 0;
            }
            h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+0]] = 0;
            h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] = 0;

            for( int ch = 0; ch < 2; ch++ )
            {
                if( ssd[ch] > thresh )
                {
                    pixel *p_src = h->mb.pic.p_fenc[1+ch];
                    pixel *p_dst = h->mb.pic.p_fdec[1+ch];

                    if( chroma422 )
                        /* Cannot be replaced by two calls to sub8x8_dct_dc since the Hadamard transform is different */
                        h->dctf.sub8x16_dct_dc( dct_dc, p_src, p_dst );
                    else
                        h->dctf.sub8x8_dct_dc( dct_dc, p_src, p_dst );

                    if( h->mb.b_trellis )
                        nz_dc = x264_quant_chroma_dc_trellis( h, dct_dc, i_qp+3*chroma422, !b_inter, CHROMA_DC+ch );
                    else
                    {
                        nz_dc = 0;
                        for( int i = 0; i <= chroma422; i++ )
                            nz_dc |= h->quantf.quant_2x2_dc( &dct_dc[4*i], h->quant4_mf[CQM_4IC+b_inter][i_qp+3*chroma422][0] >> 1,
                                                             h->quant4_bias[CQM_4IC+b_inter][i_qp+3*chroma422][0] << 1 );
                    }

                    if( nz_dc )
                    {
                        if( !x264_mb_optimize_chroma_dc( h, dct_dc, dequant_mf, i_qp+3*chroma422, chroma422 ) )
                            continue;
                        h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = 1;
                        if( chroma422 )
                        {
                            zigzag_scan_2x4_dc( h->dct.chroma_dc[ch], dct_dc );
                            h->quantf.idct_dequant_2x4_dconly( dct_dc, dequant_mf, i_qp+3 );
                        }
                        else
                        {
                            zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct_dc );
                            idct_dequant_2x2_dconly( dct_dc, dequant_mf, i_qp );
                        }

                        for( int i = 0; i <= chroma422; i++ )
                            h->dctf.add8x8_idct_dc( p_dst + 8*i*FDEC_STRIDE, &dct_dc[4*i] );
                        h->mb.i_cbp_chroma = 1;
                    }
                }
            }
            return;
        }
    }

    for( int ch = 0; ch < 2; ch++ )
    {
        pixel *p_src = h->mb.pic.p_fenc[1+ch];
        pixel *p_dst = h->mb.pic.p_fdec[1+ch];
        int i_decimate_score = 0;
        int nz_ac = 0;

        ALIGNED_ARRAY_16( dctcoef, dct4x4,[8],[16] );

        if( h->mb.b_lossless )
        {
            static const uint8_t chroma422_scan[8] = { 0, 2, 1, 5, 3, 6, 4, 7 };

            for( int i = 0; i < (chroma422?8:4); i++ )
            {
                int oe = 4*(i&1) + 4*(i>>1)*FENC_STRIDE;
                int od = 4*(i&1) + 4*(i>>1)*FDEC_STRIDE;
                nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16+i+(chroma422?i&4:0)+ch*16], p_src+oe, p_dst+od,
                                           &h->dct.chroma_dc[ch][chroma422?chroma422_scan[i]:i] );
                h->mb.cache.non_zero_count[x264_scan8[16+i+(chroma422?i&4:0)+ch*16]] = nz;
                h->mb.i_cbp_chroma |= nz;
            }
            h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = array_non_zero( h->dct.chroma_dc[ch], chroma422?8:4 );
            continue;
        }

        for( int i = 0; i <= chroma422; i++ )
            h->dctf.sub8x8_dct( &dct4x4[4*i], p_src + 8*i*FENC_STRIDE, p_dst + 8*i*FDEC_STRIDE );

        if( h->mb.b_noise_reduction )
            for( int i = 0; i < (chroma422?8:4); i++ )
                h->quantf.denoise_dct( dct4x4[i], h->nr_residual_sum[2], h->nr_offset[2], 16 );

        if( chroma422 )
            h->dctf.dct2x4dc( dct_dc, dct4x4 );
        else
            dct2x2dc( dct_dc, dct4x4 );

        /* calculate dct coeffs */
        for( int i = 0; i < (chroma422?8:4); i++ )
        {
            if( h->mb.b_trellis )
                nz = x264_quant_4x4_trellis( h, dct4x4[i], CQM_4IC+b_inter, i_qp, DCT_CHROMA_AC, !b_inter, 1, 0 );
            else
                nz = h->quantf.quant_4x4( dct4x4[i], h->quant4_mf[CQM_4IC+b_inter][i_qp], h->quant4_bias[CQM_4IC+b_inter][i_qp] );
            h->mb.cache.non_zero_count[x264_scan8[16+i+(chroma422?i&4:0)+ch*16]] = nz;
            if( nz )
            {
                nz_ac = 1;
                h->zigzagf.scan_4x4( h->dct.luma4x4[16+i+(chroma422?i&4:0)+ch*16], dct4x4[i] );
                h->quantf.dequant_4x4( dct4x4[i], dequant_mf, i_qp );
                if( b_decimate )
                    i_decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16+i+(chroma422?i&4:0)+ch*16] );
            }
        }

        if( h->mb.b_trellis )
            nz_dc = x264_quant_chroma_dc_trellis( h, dct_dc, i_qp+3*chroma422, !b_inter, CHROMA_DC+ch );
        else
        {
            nz_dc = 0;
            for( int i = 0; i <= chroma422; i++ )
                nz_dc |= h->quantf.quant_2x2_dc( &dct_dc[4*i], h->quant4_mf[CQM_4IC+b_inter][i_qp+3*chroma422][0] >> 1,
                                                 h->quant4_bias[CQM_4IC+b_inter][i_qp+3*chroma422][0] << 1 );
        }

        h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = nz_dc;

        if( (b_decimate && i_decimate_score < 7) || !nz_ac )
        {
            /* Decimate the block */
            M16( &h->mb.cache.non_zero_count[x264_scan8[16+16*ch]] ) = 0;
            M16( &h->mb.cache.non_zero_count[x264_scan8[18+16*ch]] ) = 0;
            if( chroma422 )
            {
                M16( &h->mb.cache.non_zero_count[x264_scan8[24+16*ch]] ) = 0;
                M16( &h->mb.cache.non_zero_count[x264_scan8[26+16*ch]] ) = 0;
            }

            if( !nz_dc ) /* Whole block is empty */
                continue;
            if( !x264_mb_optimize_chroma_dc( h, dct_dc, dequant_mf, i_qp+3*chroma422, chroma422 ) )
            {
                h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = 0;
                continue;
            }
            /* DC-only */
            if( chroma422 )
            {
                zigzag_scan_2x4_dc( h->dct.chroma_dc[ch], dct_dc );
                h->quantf.idct_dequant_2x4_dconly( dct_dc, dequant_mf, i_qp+3 );
            }
            else
            {
                zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct_dc );
                idct_dequant_2x2_dconly( dct_dc, dequant_mf, i_qp );
            }

            for( int i = 0; i <= chroma422; i++ )
                h->dctf.add8x8_idct_dc( p_dst + 8*i*FDEC_STRIDE, &dct_dc[4*i] );
        }
        else
        {
            h->mb.i_cbp_chroma = 1;

            if( nz_dc )
            {
                if( chroma422 )
                {
                    zigzag_scan_2x4_dc( h->dct.chroma_dc[ch], dct_dc );
                    h->quantf.idct_dequant_2x4_dc( dct_dc, dct4x4, dequant_mf, i_qp+3 );
                }
                else
                {
                    zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct_dc );
                    idct_dequant_2x2_dc( dct_dc, dct4x4, dequant_mf, i_qp );
                }
            }

            for( int i = 0; i <= chroma422; i++ )
                h->dctf.add8x8_idct( p_dst + 8*i*FDEC_STRIDE, &dct4x4[4*i] );
        }
    }

    /* 0 = none, 1 = DC only, 2 = DC+AC */
    h->mb.i_cbp_chroma += (h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+0]] |
                           h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] | h->mb.i_cbp_chroma);
}

void x264_mb_encode_chroma( x264_t *h, int b_inter, int i_qp )
{
    if( CHROMA_FORMAT == CHROMA_420 )
        x264_mb_encode_chroma_internal( h, b_inter, i_qp, 0 );
    else
        x264_mb_encode_chroma_internal( h, b_inter, i_qp, 1 );
}
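
/* Note: x264_mb_encode_chroma_internal() is ALWAYS_INLINE and chroma422 is a
 * literal constant at both call sites above, so every chroma422 branch folds
 * away at compile time, producing specialized 4:2:0 and 4:2:2 paths from a
 * single source function. */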

static void x264_macroblock_encode_skip( x264_t *h )
{
    M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) = 0;
    M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ) = 0;
    M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) = 0;
    M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ) = 0;
    M32( &h->mb.cache.non_zero_count[x264_scan8[16+ 0]] ) = 0;
    M32( &h->mb.cache.non_zero_count[x264_scan8[16+ 2]] ) = 0;
    M32( &h->mb.cache.non_zero_count[x264_scan8[32+ 0]] ) = 0;
    M32( &h->mb.cache.non_zero_count[x264_scan8[32+ 2]] ) = 0;
    if( CHROMA_FORMAT >= CHROMA_422 )
    {
        M32( &h->mb.cache.non_zero_count[x264_scan8[16+ 8]] ) = 0;
        M32( &h->mb.cache.non_zero_count[x264_scan8[16+10]] ) = 0;
        M32( &h->mb.cache.non_zero_count[x264_scan8[32+ 8]] ) = 0;
        M32( &h->mb.cache.non_zero_count[x264_scan8[32+10]] ) = 0;
    }
    h->mb.i_cbp_luma = 0;
    h->mb.i_cbp_chroma = 0;
    h->mb.cbp[h->mb.i_mb_xy] = 0;
}

/*****************************************************************************
 * Intra prediction for predictive lossless mode.
 *****************************************************************************/

void x264_predict_lossless_chroma( x264_t *h, int i_mode )
{
    int height = 16 >> CHROMA_V_SHIFT;
    if( i_mode == I_PRED_CHROMA_V )
    {
        h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1]-FENC_STRIDE, FENC_STRIDE, height );
        h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2]-FENC_STRIDE, FENC_STRIDE, height );
        memcpy( h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[1]-FDEC_STRIDE, 8*sizeof(pixel) );
        memcpy( h->mb.pic.p_fdec[2], h->mb.pic.p_fdec[2]-FDEC_STRIDE, 8*sizeof(pixel) );
    }
    else if( i_mode == I_PRED_CHROMA_H )
    {
        h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1]-1, FENC_STRIDE, height );
        h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2]-1, FENC_STRIDE, height );
        x264_copy_column8( h->mb.pic.p_fdec[1]+4*FDEC_STRIDE, h->mb.pic.p_fdec[1]+4*FDEC_STRIDE-1 );
        x264_copy_column8( h->mb.pic.p_fdec[2]+4*FDEC_STRIDE, h->mb.pic.p_fdec[2]+4*FDEC_STRIDE-1 );
        if( CHROMA_FORMAT == CHROMA_422 )
        {
            x264_copy_column8( h->mb.pic.p_fdec[1]+12*FDEC_STRIDE, h->mb.pic.p_fdec[1]+12*FDEC_STRIDE-1 );
            x264_copy_column8( h->mb.pic.p_fdec[2]+12*FDEC_STRIDE, h->mb.pic.p_fdec[2]+12*FDEC_STRIDE-1 );
        }
    }
    else
    {
        h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] );
        h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] );
    }
}

void x264_predict_lossless_4x4( x264_t *h, pixel *p_dst, int p, int idx, int i_mode )
{
    int stride = h->fenc->i_stride[p] << MB_INTERLACED;
    pixel *p_src = h->mb.pic.p_fenc_plane[p] + block_idx_x[idx]*4 + block_idx_y[idx]*4 * stride;

    if( i_mode == I_PRED_4x4_V )
        h->mc.copy[PIXEL_4x4]( p_dst, FDEC_STRIDE, p_src-stride, stride, 4 );
    else if( i_mode == I_PRED_4x4_H )
        h->mc.copy[PIXEL_4x4]( p_dst, FDEC_STRIDE, p_src-1, stride, 4 );
    else
        h->predict_4x4[i_mode]( p_dst );
}

void x264_predict_lossless_8x8( x264_t *h, pixel *p_dst, int p, int idx, int i_mode, pixel edge[36] )
{
    int stride = h->fenc->i_stride[p] << MB_INTERLACED;
    pixel *p_src = h->mb.pic.p_fenc_plane[p] + (idx&1)*8 + (idx>>1)*8*stride;

    if( i_mode == I_PRED_8x8_V )
        h->mc.copy[PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src-stride, stride, 8 );
    else if( i_mode == I_PRED_8x8_H )
        h->mc.copy[PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src-1, stride, 8 );
    else
        h->predict_8x8[i_mode]( p_dst, edge );
}

void x264_predict_lossless_16x16( x264_t *h, int p, int i_mode )
{
    int stride = h->fenc->i_stride[p] << MB_INTERLACED;
    if( i_mode == I_PRED_16x16_V )
        h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[p], FDEC_STRIDE, h->mb.pic.p_fenc_plane[p]-stride, stride, 16 );
    else if( i_mode == I_PRED_16x16_H )
        h->mc.copy_16x16_unaligned( h->mb.pic.p_fdec[p], FDEC_STRIDE, h->mb.pic.p_fenc_plane[p]-1, stride, 16 );
    else
        h->predict_16x16[i_mode]( h->mb.pic.p_fdec[p] );
}

/*****************************************************************************
 * x264_macroblock_encode:
 *****************************************************************************/
static ALWAYS_INLINE void x264_macroblock_encode_internal( x264_t *h, int plane_count, int chroma )
{
    int i_qp = h->mb.i_qp;
    int b_decimate = h->mb.b_dct_decimate;
    int b_force_no_skip = 0;
    int nz;
    h->mb.i_cbp_luma = 0;
    for( int p = 0; p < plane_count; p++ )
        h->mb.cache.non_zero_count[x264_scan8[LUMA_DC+p]] = 0;

    if( h->mb.i_type == I_PCM )
    {
        /* if PCM is chosen, we need to store reconstructed frame data */
        for( int p = 0; p < plane_count; p++ )
            h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[p], FDEC_STRIDE, h->mb.pic.p_fenc[p], FENC_STRIDE, 16 );
        if( chroma )
        {
            int height = 16 >> CHROMA_V_SHIFT;
            h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE, height );
            h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE, height );
        }
        return;
    }

    if( !h->mb.b_allow_skip )
    {
        b_force_no_skip = 1;
        if( IS_SKIP(h->mb.i_type) )
        {
            if( h->mb.i_type == P_SKIP )
                h->mb.i_type = P_L0;
            else if( h->mb.i_type == B_SKIP )
                h->mb.i_type = B_DIRECT;
        }
    }

    if( h->mb.i_type == P_SKIP )
    {
        /* don't do pskip motion compensation if it was already done in macroblock_analyse */
        if( !h->mb.b_skip_mc )
        {
            int mvx = x264_clip3( h->mb.cache.mv[0][x264_scan8[0]][0],
                                  h->mb.mv_min[0], h->mb.mv_max[0] );
            int mvy = x264_clip3( h->mb.cache.mv[0][x264_scan8[0]][1],
                                  h->mb.mv_min[1], h->mb.mv_max[1] );

            for( int p = 0; p < plane_count; p++ )
                h->mc.mc_luma( h->mb.pic.p_fdec[p], FDEC_STRIDE,
                               &h->mb.pic.p_fref[0][0][p*4], h->mb.pic.i_stride[p],
                               mvx, mvy, 16, 16, &h->sh.weight[0][p] );

            if( chroma )
            {
                int v_shift = CHROMA_V_SHIFT;
                int height = 16 >> v_shift;

                /* Special case for mv0, which is (of course) very common in P-skip mode. */
                if( mvx | mvy )
                    h->mc.mc_chroma( h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2], FDEC_STRIDE,
                                     h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1],
                                     mvx, 2*mvy>>v_shift, 8, height );
                else
                    h->mc.load_deinterleave_chroma_fdec( h->mb.pic.p_fdec[1], h->mb.pic.p_fref[0][0][4],
                                                         h->mb.pic.i_stride[1], height );

                if( h->sh.weight[0][1].weightfn )
                    h->sh.weight[0][1].weightfn[8>>2]( h->mb.pic.p_fdec[1], FDEC_STRIDE,
                                                       h->mb.pic.p_fdec[1], FDEC_STRIDE,
                                                       &h->sh.weight[0][1], height );
                if( h->sh.weight[0][2].weightfn )
                    h->sh.weight[0][2].weightfn[8>>2]( h->mb.pic.p_fdec[2], FDEC_STRIDE,
                                                       h->mb.pic.p_fdec[2], FDEC_STRIDE,
                                                       &h->sh.weight[0][2], height );
            }
        }

        x264_macroblock_encode_skip( h );
        return;
    }
    if( h->mb.i_type == B_SKIP )
    {
        /* don't do bskip motion compensation if it was already done in macroblock_analyse */
        if( !h->mb.b_skip_mc )
            x264_mb_mc( h );
        x264_macroblock_encode_skip( h );
        return;
    }

    if( h->mb.i_type == I_16x16 )
    {
        h->mb.b_transform_8x8 = 0;

        for( int p = 0; p < plane_count; p++ )
        {
            x264_mb_encode_i16x16( h, p, i_qp );
            i_qp = h->mb.i_chroma_qp;
        }
    }
    else if( h->mb.i_type == I_8x8 )
    {
        h->mb.b_transform_8x8 = 1;
        /* If we already encoded 3 of the 4 i8x8 blocks, we don't have to do them again. */
        if( h->mb.i_skip_intra )
        {
            h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.i8x8_fdec_buf, 16, 16 );
            M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) = h->mb.pic.i8x8_nnz_buf[0];
            M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ) = h->mb.pic.i8x8_nnz_buf[1];
            M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) = h->mb.pic.i8x8_nnz_buf[2];
            M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ) = h->mb.pic.i8x8_nnz_buf[3];
            h->mb.i_cbp_luma = h->mb.pic.i8x8_cbp;
            /* In RD mode, restore the now-overwritten DCT data. */
            if( h->mb.i_skip_intra == 2 )
                h->mc.memcpy_aligned( h->dct.luma8x8, h->mb.pic.i8x8_dct_buf, sizeof(h->mb.pic.i8x8_dct_buf) );
        }
        for( int p = 0; p < plane_count; p++ )
        {
            for( int i = (p == 0 && h->mb.i_skip_intra) ? 3 : 0 ; i < 4; i++ )
            {
                int i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[4*i]];
                x264_mb_encode_i8x8( h, p, i, i_qp, i_mode, NULL, 1 );
            }
            i_qp = h->mb.i_chroma_qp;
        }
    }
    else if( h->mb.i_type == I_4x4 )
    {
        h->mb.b_transform_8x8 = 0;
        /* If we already encoded 15 of the 16 i4x4 blocks, we don't have to do them again. */
        if( h->mb.i_skip_intra )
        {
            h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.i4x4_fdec_buf, 16, 16 );
            M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) = h->mb.pic.i4x4_nnz_buf[0];
            M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ) = h->mb.pic.i4x4_nnz_buf[1];
            M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) = h->mb.pic.i4x4_nnz_buf[2];
            M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ) = h->mb.pic.i4x4_nnz_buf[3];
            h->mb.i_cbp_luma = h->mb.pic.i4x4_cbp;
            /* In RD mode, restore the now-overwritten DCT data. */
            if( h->mb.i_skip_intra == 2 )
                h->mc.memcpy_aligned( h->dct.luma4x4, h->mb.pic.i4x4_dct_buf, sizeof(h->mb.pic.i4x4_dct_buf) );
        }
        for( int p = 0; p < plane_count; p++ )
        {
            for( int i = (p == 0 && h->mb.i_skip_intra) ? 15 : 0 ; i < 16; i++ )
            {
                pixel *p_dst = &h->mb.pic.p_fdec[p][block_idx_xy_fdec[i]];
                int i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[i]];

                if( (h->mb.i_neighbour4[i] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
                    /* emulate missing topright samples */
                    MPIXEL_X4( &p_dst[4-FDEC_STRIDE] ) = PIXEL_SPLAT_X4( p_dst[3-FDEC_STRIDE] );

                x264_mb_encode_i4x4( h, p, i, i_qp, i_mode, 1 );
            }
            i_qp = h->mb.i_chroma_qp;
        }
    }
    else    /* Inter MB */
    {
        int i_decimate_mb = 0;

        /* Don't repeat motion compensation if it was already done in non-RD transform analysis */
        if( !h->mb.b_skip_mc )
            x264_mb_mc( h );

        if( h->mb.b_lossless )
        {
            if( h->mb.b_transform_8x8 )
                for( int p = 0; p < plane_count; p++ )
                    for( int i8x8 = 0; i8x8 < 4; i8x8++ )
                    {
                        int x = i8x8&1;
                        int y = i8x8>>1;
                        nz = h->zigzagf.sub_8x8( h->dct.luma8x8[p*4+i8x8], h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE,
                                                                           h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE );
                        STORE_8x8_NNZ( p, i8x8, nz );
                        h->mb.i_cbp_luma |= nz << i8x8;
                    }
            else
                for( int p = 0; p < plane_count; p++ )
                    for( int i4x4 = 0; i4x4 < 16; i4x4++ )
                    {
                        nz = h->zigzagf.sub_4x4( h->dct.luma4x4[p*16+i4x4],
                                                 h->mb.pic.p_fenc[p]+block_idx_xy_fenc[i4x4],
                                                 h->mb.pic.p_fdec[p]+block_idx_xy_fdec[i4x4] );
                        h->mb.cache.non_zero_count[x264_scan8[p*16+i4x4]] = nz;
                        h->mb.i_cbp_luma |= nz << (i4x4>>2);
                    }
        }
        else if( h->mb.b_transform_8x8 )
        {
            ALIGNED_ARRAY_16( dctcoef, dct8x8,[4],[64] );
            b_decimate &= !h->mb.b_trellis || !h->param.b_cabac; // 8x8 trellis is inherently optimal decimation for CABAC

            for( int p = 0; p < plane_count; p++ )
            {
                h->dctf.sub16x16_dct8( dct8x8, h->mb.pic.p_fenc[p], h->mb.pic.p_fdec[p] );
                h->nr_count[1+!!p*2] += h->mb.b_noise_reduction * 4;

                int plane_cbp = 0;
                for( int idx = 0; idx < 4; idx++ )
                {
                    nz = x264_quant_8x8( h, dct8x8[idx], i_qp, ctx_cat_plane[DCT_LUMA_8x8][p], 0, p, idx );

                    if( nz )
                    {
                        h->zigzagf.scan_8x8( h->dct.luma8x8[p*4+idx], dct8x8[idx] );
                        if( b_decimate )
                        {
                            int i_decimate_8x8 = h->quantf.decimate_score64( h->dct.luma8x8[p*4+idx] );
                            i_decimate_mb += i_decimate_8x8;
                            if( i_decimate_8x8 >= 4 )
                                plane_cbp |= 1<<idx;
                        }
                        else
                            plane_cbp |= 1<<idx;
                    }
                }

                if( i_decimate_mb < 6 && b_decimate )
                {
                    plane_cbp = 0;
                    CLEAR_16x16_NNZ( p );
                }
                else
                {
                    for( int idx = 0; idx < 4; idx++ )
                    {
                        int x = idx&1;
                        int y = idx>>1;

                        if( plane_cbp&(1<<idx) )
                        {
                            h->quantf.dequant_8x8( dct8x8[idx], h->dequant8_mf[p?CQM_8PC:CQM_8PY], i_qp );
                            h->dctf.add8x8_idct8( &h->mb.pic.p_fdec[p][8*x + 8*y*FDEC_STRIDE], dct8x8[idx] );
                            STORE_8x8_NNZ( p, idx, 1 );
                        }
                        else
                            STORE_8x8_NNZ( p, idx, 0 );
                    }
                }
                h->mb.i_cbp_luma |= plane_cbp;
                i_qp = h->mb.i_chroma_qp;
            }
        }
        else
        {
            ALIGNED_ARRAY_16( dctcoef, dct4x4,[16],[16] );
            for( int p = 0; p < plane_count; p++ )
            {
                h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[p], h->mb.pic.p_fdec[p] );
                h->nr_count[0+!!p*2] += h->mb.b_noise_reduction * 16;

                int plane_cbp = 0;
                for( int i8x8 = 0; i8x8 < 4; i8x8++ )
                {
                    int i_decimate_8x8 = 0;
                    int cbp = 0;

                    /* encode one 4x4 block */
                    for( int i4x4 = 0; i4x4 < 4; i4x4++ )
                    {
                        int idx = i8x8 * 4 + i4x4;

                        nz = x264_quant_4x4( h, dct4x4[idx], i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 0, p, idx );
                        h->mb.cache.non_zero_count[x264_scan8[p*16+idx]] = nz;

                        if( nz )
                        {
                            h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+idx], dct4x4[idx] );
                            h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[p?CQM_4PC:CQM_4PY], i_qp );
                            if( b_decimate && i_decimate_8x8 < 6 )
                                i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[p*16+idx] );
                            cbp = 1;
                        }
                    }

                    int x = i8x8&1;
                    int y = i8x8>>1;

                    /* decimate this 8x8 block */
                    i_decimate_mb += i_decimate_8x8;
                    if( b_decimate )
                    {
                        if( i_decimate_8x8 < 4 )
                            STORE_8x8_NNZ( p, i8x8, 0 );
                        else
                            plane_cbp |= 1<<i8x8;
                    }
                    else if( cbp )
                    {
                        h->dctf.add8x8_idct( &h->mb.pic.p_fdec[p][8*x + 8*y*FDEC_STRIDE], &dct4x4[i8x8*4] );
                        plane_cbp |= 1<<i8x8;
                    }
                }

                if( b_decimate )
                {
                    if( i_decimate_mb < 6 )
                    {
                        plane_cbp = 0;
                        CLEAR_16x16_NNZ( p );
                    }
                    else
                    {
                        for( int i8x8 = 0; i8x8 < 4; i8x8++ )
                            if( plane_cbp&(1<<i8x8) )
                                h->dctf.add8x8_idct( &h->mb.pic.p_fdec[p][(i8x8&1)*8 + (i8x8>>1)*8*FDEC_STRIDE], &dct4x4[i8x8*4] );
                    }
                }
                h->mb.i_cbp_luma |= plane_cbp;
                i_qp = h->mb.i_chroma_qp;
            }
        }
    }

    /* encode chroma */
    if( chroma )
    {
        if( IS_INTRA( h->mb.i_type ) )
        {
            int i_mode = h->mb.i_chroma_pred_mode;
            if( h->mb.b_lossless )
                x264_predict_lossless_chroma( h, i_mode );
            else
            {
                h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] );
                h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] );
            }
        }

        /* encode the 8x8 blocks */
        x264_mb_encode_chroma( h, !IS_INTRA( h->mb.i_type ), h->mb.i_chroma_qp );
    }
    else
        h->mb.i_cbp_chroma = 0;

    /* store cbp */
    int cbp = h->mb.i_cbp_chroma << 4 | h->mb.i_cbp_luma;
    if( h->param.b_cabac )
        cbp |= h->mb.cache.non_zero_count[x264_scan8[LUMA_DC    ]] << 8
            |  h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+0]] << 9
            |  h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] << 10;
    h->mb.cbp[h->mb.i_mb_xy] = cbp;

    /* Check for P_SKIP
     * XXX: in the ME, perhaps we should take x264_mb_predict_mv_pskip into account
     *      (if multiple mv give same result)*/
    if( !b_force_no_skip )
    {
        if( h->mb.i_type == P_L0 && h->mb.i_partition == D_16x16 &&
            !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma) &&
            M32( h->mb.cache.mv[0][x264_scan8[0]] ) == M32( h->mb.cache.pskip_mv )
            && h->mb.cache.ref[0][x264_scan8[0]] == 0 )
        {
            h->mb.i_type = P_SKIP;
        }

        /* Check for B_SKIP */
        if( h->mb.i_type == B_DIRECT && !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma) )
        {
            h->mb.i_type = B_SKIP;
        }
    }
}

void x264_macroblock_encode( x264_t *h )
{
    if( CHROMA444 )
        x264_macroblock_encode_internal( h, 3, 0 );
    else
        x264_macroblock_encode_internal( h, 1, 1 );
}

/*****************************************************************************
 * x264_macroblock_probe_skip:
 *  Check if the current MB could be encoded as a [PB]_SKIP
 *****************************************************************************/
static ALWAYS_INLINE int x264_macroblock_probe_skip_internal( x264_t *h, int b_bidir, int plane_count, int chroma )
{
    ALIGNED_ARRAY_16( dctcoef, dct4x4,[8],[16] );
    ALIGNED_ARRAY_16( dctcoef, dctscan,[16] );
    ALIGNED_4( int16_t mvp[2] );
    int i_qp = h->mb.i_qp;

    for( int p = 0; p < plane_count; p++ )
    {
        int quant_cat = p ? CQM_4PC : CQM_4PY;
        if( !b_bidir )
        {
            /* Get the MV */
            mvp[0] = x264_clip3( h->mb.cache.pskip_mv[0], h->mb.mv_min[0], h->mb.mv_max[0] );
            mvp[1] = x264_clip3( h->mb.cache.pskip_mv[1], h->mb.mv_min[1], h->mb.mv_max[1] );

            /* Motion compensation */
            h->mc.mc_luma( h->mb.pic.p_fdec[p],    FDEC_STRIDE,
                           &h->mb.pic.p_fref[0][0][p*4], h->mb.pic.i_stride[p],
                           mvp[0], mvp[1], 16, 16, &h->sh.weight[0][p] );
        }

        for( int i8x8 = 0, i_decimate_mb = 0; i8x8 < 4; i8x8++ )
        {
            int fenc_offset = (i8x8&1) * 8 + (i8x8>>1) * FENC_STRIDE * 8;
            int fdec_offset = (i8x8&1) * 8 + (i8x8>>1) * FDEC_STRIDE * 8;
            /* get luma diff */
            h->dctf.sub8x8_dct( dct4x4, h->mb.pic.p_fenc[p] + fenc_offset,
                                        h->mb.pic.p_fdec[p] + fdec_offset );
            /* encode one 4x4 block */
            for( int i4x4 = 0; i4x4 < 4; i4x4++ )
            {
                if( h->mb.b_noise_reduction )
                    h->quantf.denoise_dct( dct4x4[i4x4], h->nr_residual_sum[0+!!p*2], h->nr_offset[0+!!p*2], 16 );
                if( !h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[quant_cat][i_qp], h->quant4_bias[quant_cat][i_qp] ) )
                    continue;
                h->zigzagf.scan_4x4( dctscan, dct4x4[i4x4] );
                i_decimate_mb += h->quantf.decimate_score16( dctscan );
                if( i_decimate_mb >= 6 )
                    return 0;
            }
        }
        i_qp = h->mb.i_chroma_qp;
    }

    if( chroma == CHROMA_420 || chroma == CHROMA_422 )
    {
        i_qp = h->mb.i_chroma_qp;
        int chroma422 = chroma == CHROMA_422;
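        /* SSD threshold ~= lambda2/64 per 8x8 chroma block (rounded), doubled to
         * lambda2/32 for 4:2:2 where each plane block covers 8x16 pixels. */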
        int thresh = chroma422 ? (x264_lambda2_tab[i_qp] + 16) >> 5 : (x264_lambda2_tab[i_qp] + 32) >> 6;
        int ssd;
        ALIGNED_ARRAY_16( dctcoef, dct_dc,[8] );

        if( !b_bidir )
        {
            /* Special case for mv0, which is (of course) very common in P-skip mode. */
            if( M32( mvp ) )
                h->mc.mc_chroma( h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2], FDEC_STRIDE,
                                 h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1],
                                 mvp[0], mvp[1]<<chroma422, 8, chroma422?16:8 );
            else
                h->mc.load_deinterleave_chroma_fdec( h->mb.pic.p_fdec[1], h->mb.pic.p_fref[0][0][4],
                                                     h->mb.pic.i_stride[1], chroma422?16:8 );
        }

        for( int ch = 0; ch < 2; ch++ )
        {
            pixel *p_src = h->mb.pic.p_fenc[1+ch];
            pixel *p_dst = h->mb.pic.p_fdec[1+ch];

            if( !b_bidir && h->sh.weight[0][1+ch].weightfn )
                h->sh.weight[0][1+ch].weightfn[8>>2]( h->mb.pic.p_fdec[1+ch], FDEC_STRIDE,
                                                      h->mb.pic.p_fdec[1+ch], FDEC_STRIDE,
                                                      &h->sh.weight[0][1+ch], chroma422?16:8 );

            /* There is almost never a termination during chroma, but we can't avoid
             * the check entirely, so instead we check the SSD and skip the actual
             * check if the score is low enough. */
            ssd = h->pixf.ssd[chroma422?PIXEL_8x16:PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
            if( ssd < thresh )
                continue;

            /* The vast majority of chroma checks will terminate during the DC check or the higher
             * threshold check, so we can save time by doing a DC-only DCT. */
            if( h->mb.b_noise_reduction )
            {
                for( int i = 0; i <= chroma422; i++ )
                    h->dctf.sub8x8_dct( &dct4x4[4*i], p_src + 8*i*FENC_STRIDE, p_dst + 8*i*FDEC_STRIDE );

                for( int i4x4 = 0; i4x4 < (chroma422?8:4); i4x4++ )
                {
                    h->quantf.denoise_dct( dct4x4[i4x4], h->nr_residual_sum[2], h->nr_offset[2], 16 );
                    dct_dc[i4x4] = dct4x4[i4x4][0];
                }
            }
            else
            {
                if( chroma422 )
                    h->dctf.sub8x16_dct_dc( dct_dc, p_src, p_dst );
                else
                    h->dctf.sub8x8_dct_dc( dct_dc, p_src, p_dst );
            }

            for( int i = 0; i <= chroma422; i++ )
                if( h->quantf.quant_2x2_dc( &dct_dc[4*i], h->quant4_mf[CQM_4PC][i_qp+3*chroma422][0] >> 1,
                                            h->quant4_bias[CQM_4PC][i_qp+3*chroma422][0] << 1 ) )
                    return 0;

            /* If there wasn't a termination in DC, we can check against a much higher threshold. */
            if( ssd < thresh*4 )
                continue;

            if( !h->mb.b_noise_reduction )
                for( int i = 0; i <= chroma422; i++ )
                    h->dctf.sub8x8_dct( &dct4x4[4*i], p_src + 8*i*FENC_STRIDE, p_dst + 8*i*FDEC_STRIDE );

            /* calculate dct coeffs */
            for( int i4x4 = 0, i_decimate_mb = 0; i4x4 < (chroma422?8:4); i4x4++ )
            {
                dct4x4[i4x4][0] = 0;
                if( h->mb.b_noise_reduction )
                    h->quantf.denoise_dct( dct4x4[i4x4], h->nr_residual_sum[2], h->nr_offset[2], 16 );
                if( !h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] ) )
                    continue;
                h->zigzagf.scan_4x4( dctscan, dct4x4[i4x4] );
                i_decimate_mb += h->quantf.decimate_score15( dctscan );
                if( i_decimate_mb >= 7 )
                    return 0;
            }
        }
    }

    /* The motion compensation done above used the skip MV, so a subsequent
     * encode of this MB as skip can reuse it. */
    h->mb.b_skip_mc = 1;
    return 1;
}

int x264_macroblock_probe_skip( x264_t *h, int b_bidir )
{
    if( CHROMA_FORMAT == CHROMA_444 )
        return x264_macroblock_probe_skip_internal( h, b_bidir, 3, CHROMA_444 );
    else if( CHROMA_FORMAT == CHROMA_422 )
        return x264_macroblock_probe_skip_internal( h, b_bidir, 1, CHROMA_422 );
    else
        return x264_macroblock_probe_skip_internal( h, b_bidir, 1, CHROMA_420 );
}

/****************************************************************************
 * DCT-domain noise reduction / adaptive deadzone
 * from libavcodec
 ****************************************************************************/

void x264_noise_reduction_update( x264_t *h )
{
    h->nr_offset = h->nr_offset_denoise;
    h->nr_residual_sum = h->nr_residual_sum_buf[0];
    h->nr_count = h->nr_count_buf[0];
    for( int cat = 0; cat < 3 + CHROMA444; cat++ )
    {
        int dct8x8 = cat&1;
        int size = dct8x8 ? 64 : 16;
        const uint16_t *weight = dct8x8 ? x264_dct8_weight2_tab : x264_dct4_weight2_tab;

        if( h->nr_count[cat] > (dct8x8 ? (1<<16) : (1<<18)) )
        {
            for( int i = 0; i < size; i++ )
                h->nr_residual_sum[cat][i] >>= 1;
            h->nr_count[cat] >>= 1;
        }

        /* offset[i] ~= strength / (mean |coeff i| * weight[i]/256): coefficients
         * that are small on average (likely noise) receive a larger deadzone. */
        for( int i = 0; i < size; i++ )
            h->nr_offset[cat][i] =
                ((uint64_t)h->param.analyse.i_noise_reduction * h->nr_count[cat]
                 + h->nr_residual_sum[cat][i]/2)
              / ((uint64_t)h->nr_residual_sum[cat][i] * weight[i]/256 + 1);

        /* Don't denoise DC coefficients */
        h->nr_offset[cat][0] = 0;
    }
}

/*****************************************************************************
 * RD only; 4 calls to this do not add up to one macroblock_encode.
 * Doesn't transform chroma DC.
 *****************************************************************************/
static ALWAYS_INLINE void x264_macroblock_encode_p8x8_internal( x264_t *h, int i8, int plane_count, int chroma )
{
    int b_decimate = h->mb.b_dct_decimate;
    int i_qp = h->mb.i_qp;
    int x = i8&1;
    int y = i8>>1;
    int nz;
    int chroma422 = chroma == CHROMA_422;

    h->mb.i_cbp_chroma = 0;
    h->mb.i_cbp_luma &= ~(1 << i8);

    if( !h->mb.b_skip_mc )
        x264_mb_mc_8x8( h, i8 );

    if( h->mb.b_lossless )
    {
        for( int p = 0; p < plane_count; p++ )
        {
            pixel *p_fenc = h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE;
            pixel *p_fdec = h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE;
            int nnz8x8 = 0;
            if( h->mb.b_transform_8x8 )
            {
                nnz8x8 = h->zigzagf.sub_8x8( h->dct.luma8x8[4*p+i8], p_fenc, p_fdec );
                STORE_8x8_NNZ( p, i8, nnz8x8 );
            }
            else
            {
                for( int i4 = i8*4; i4 < i8*4+4; i4++ )
                {
                    nz = h->zigzagf.sub_4x4( h->dct.luma4x4[16*p+i4],
                                             h->mb.pic.p_fenc[p]+block_idx_xy_fenc[i4],
                                             h->mb.pic.p_fdec[p]+block_idx_xy_fdec[i4] );
                    h->mb.cache.non_zero_count[x264_scan8[16*p+i4]] = nz;
                    nnz8x8 |= nz;
                }
            }
            h->mb.i_cbp_luma |= nnz8x8 << i8;
        }
        if( chroma == CHROMA_420 || chroma == CHROMA_422 )
        {
            for( int ch = 0; ch < 2; ch++ )
            {
                dctcoef dc;
                pixel *p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + (chroma422?8:4)*y*FENC_STRIDE;
                pixel *p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + (chroma422?8:4)*y*FDEC_STRIDE;

                for( int i4x4 = 0; i4x4 <= chroma422; i4x4++ )
                {
                    int offset = chroma422 ? 8*y + 2*i4x4 + x : i8;
                    nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16+offset+ch*16], p_fenc+4*i4x4*FENC_STRIDE, p_fdec+4*i4x4*FDEC_STRIDE, &dc );
                    h->mb.cache.non_zero_count[x264_scan8[16+offset+ch*16]] = nz;
                }
            }
            h->mb.i_cbp_chroma = 0x02;
        }
    }
    else
    {
        if( h->mb.b_transform_8x8 )
        {
            for( int p = 0; p < plane_count; p++ )
            {
                int quant_cat = p ? CQM_8PC : CQM_8PY;
                pixel *p_fenc = h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE;
                pixel *p_fdec = h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE;
                ALIGNED_ARRAY_16( dctcoef, dct8x8,[64] );
                h->dctf.sub8x8_dct8( dct8x8, p_fenc, p_fdec );
                int nnz8x8 = x264_quant_8x8( h, dct8x8, i_qp, ctx_cat_plane[DCT_LUMA_8x8][p], 0, p, i8 );
                if( nnz8x8 )
                {
                    h->zigzagf.scan_8x8( h->dct.luma8x8[4*p+i8], dct8x8 );

                    if( b_decimate && !h->mb.b_trellis )
                        nnz8x8 = 4 <= h->quantf.decimate_score64( h->dct.luma8x8[4*p+i8] );

                    if( nnz8x8 )
                    {
                        h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[quant_cat], i_qp );
                        h->dctf.add8x8_idct8( p_fdec, dct8x8 );
                        STORE_8x8_NNZ( p, i8, 1 );
                    }
                    else
                        STORE_8x8_NNZ( p, i8, 0 );
                }
                else
                    STORE_8x8_NNZ( p, i8, 0 );
                h->mb.i_cbp_luma |= nnz8x8 << i8;
                i_qp = h->mb.i_chroma_qp;
            }
        }
        else
        {
            for( int p = 0; p < plane_count; p++ )
            {
                int quant_cat = p ? CQM_4PC : CQM_4PY;
                pixel *p_fenc = h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE;
                pixel *p_fdec = h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE;
                int i_decimate_8x8 = 0, nnz8x8 = 0;
                ALIGNED_ARRAY_16( dctcoef, dct4x4,[4],[16] );
                h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec );
                for( int i4 = 0; i4 < 4; i4++ )
                {
                    nz = x264_quant_4x4( h, dct4x4[i4], i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 0, p, i8*4+i4 );
                    h->mb.cache.non_zero_count[x264_scan8[p*16+i8*4+i4]] = nz;
                    if( nz )
                    {
                        h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+i8*4+i4], dct4x4[i4] );
                        h->quantf.dequant_4x4( dct4x4[i4], h->dequant4_mf[quant_cat], i_qp );
                        if( b_decimate )
                            i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[p*16+i8*4+i4] );
                        nnz8x8 = 1;
                    }
                }

                if( b_decimate && i_decimate_8x8 < 4 )
                    nnz8x8 = 0;

                if( nnz8x8 )
                    h->dctf.add8x8_idct( p_fdec, dct4x4 );
                else
                    STORE_8x8_NNZ( p, i8, 0 );

                h->mb.i_cbp_luma |= nnz8x8 << i8;
                i_qp = h->mb.i_chroma_qp;
            }
        }

        if( chroma == CHROMA_420 || chroma == CHROMA_422 )
        {
            i_qp = h->mb.i_chroma_qp;
            for( int ch = 0; ch < 2; ch++ )
            {
                ALIGNED_ARRAY_16( dctcoef, dct4x4,[2],[16] );
                pixel *p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + (chroma422?8:4)*y*FENC_STRIDE;
                pixel *p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + (chroma422?8:4)*y*FDEC_STRIDE;

                for( int i4x4 = 0; i4x4 <= chroma422; i4x4++ )
                {
                    h->dctf.sub4x4_dct( dct4x4[i4x4], p_fenc + 4*i4x4*FENC_STRIDE, p_fdec + 4*i4x4*FDEC_STRIDE );

                    if( h->mb.b_noise_reduction )
                        h->quantf.denoise_dct( dct4x4[i4x4], h->nr_residual_sum[2], h->nr_offset[2], 16 );
                    dct4x4[i4x4][0] = 0;

                    if( h->mb.b_trellis )
                        nz = x264_quant_4x4_trellis( h, dct4x4[i4x4], CQM_4PC, i_qp, DCT_CHROMA_AC, 0, 1, 0 );
                    else
                        nz = h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] );

                    /* (5*i8) & 0x09 maps i8 = 0,1,2,3 to 0,1,8,9, i.e. 8*y + x */
                    int offset = chroma422 ? ((5*i8) & 0x09) + 2*i4x4 : i8;
                    h->mb.cache.non_zero_count[x264_scan8[16+offset+ch*16]] = nz;
                    if( nz )
                    {
                        h->zigzagf.scan_4x4( h->dct.luma4x4[16+offset+ch*16], dct4x4[i4x4] );
                        h->quantf.dequant_4x4( dct4x4[i4x4], h->dequant4_mf[CQM_4PC], i_qp );
                        h->dctf.add4x4_idct( p_fdec + 4*i4x4*FDEC_STRIDE, dct4x4[i4x4] );
                    }
                }
            }
            h->mb.i_cbp_chroma = 0x02;
        }
    }
}

void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
{
    if( CHROMA444 )
        x264_macroblock_encode_p8x8_internal( h, i8, 3, CHROMA_444 );
    else if( CHROMA_FORMAT == CHROMA_422 )
        x264_macroblock_encode_p8x8_internal( h, i8, 1, CHROMA_422 );
    else
        x264_macroblock_encode_p8x8_internal( h, i8, 1, CHROMA_420 );
}

/*****************************************************************************
 * RD only, luma only (for 4:2:0)
 *****************************************************************************/
static ALWAYS_INLINE void x264_macroblock_encode_p4x4_internal( x264_t *h, int i4, int plane_count )
{
    int i_qp = h->mb.i_qp;

    for( int p = 0; p < plane_count; p++ )
    {
        int quant_cat = p ? CQM_4PC : CQM_4PY;
        pixel *p_fenc = &h->mb.pic.p_fenc[p][block_idx_xy_fenc[i4]];
        pixel *p_fdec = &h->mb.pic.p_fdec[p][block_idx_xy_fdec[i4]];
        int nz;

        /* Don't need motion compensation as this function is only used in qpel-RD, which caches pixel data. */

        if( h->mb.b_lossless )
        {
            nz = h->zigzagf.sub_4x4( h->dct.luma4x4[p*16+i4], p_fenc, p_fdec );
            h->mb.cache.non_zero_count[x264_scan8[p*16+i4]] = nz;
        }
        else
        {
            ALIGNED_ARRAY_16( dctcoef, dct4x4,[16] );
            h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec );
            nz = x264_quant_4x4( h, dct4x4, i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 0, p, i4 );
            h->mb.cache.non_zero_count[x264_scan8[p*16+i4]] = nz;
            if( nz )
            {
                h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+i4], dct4x4 );
                h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[quant_cat], i_qp );
                h->dctf.add4x4_idct( p_fdec, dct4x4 );
            }
        }
        i_qp = h->mb.i_chroma_qp;
    }
}

void x264_macroblock_encode_p4x4( x264_t *h, int i4 )
{
    if( CHROMA444 )
        x264_macroblock_encode_p4x4_internal( h, i4, 3 );
    else
        x264_macroblock_encode_p4x4_internal( h, i4, 1 );
}
x264-snapshot-20120103-2245-stable/encoder/lookahead.c
/*****************************************************************************
 * lookahead.c: high-level lookahead functions
 *****************************************************************************
 * Copyright (C) 2010-2011 Avail Media and x264 project
 *
 * Authors: Michael Kazmier <mkazmier@availmedia.com>
 *          Alex Giladi <agiladi@availmedia.com>
 *          Steven Walters <kemuri9@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

/* LOOKAHEAD (threaded and non-threaded mode)
 *
 * Lookahead types:
 *     [1] Slice type / scene cut;
 *
 * In non-threaded mode, we run the existing slicetype decision code as it was.
 * In threaded mode, we run in a separate thread that lives between the calls
 * to x264_encoder_open() and x264_encoder_close(), and performs lookahead for
 * the number of frames specified in rc_lookahead.  Recommended setting is
 * # of bframes + # of threads.
 */
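/* Dataflow: x264_lookahead_put_frame() pushes frames into ifbuf; the lookahead
 * thread shifts them into next, runs the slicetype decision once enough frames
 * are buffered, and shifts each decided minigop (one non-B frame plus its
 * B-frames) into ofbuf, from which x264_lookahead_get_frames() feeds the
 * encoder.  Without a lookahead thread, frames go straight into next. */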
#include "common/common.h"
#include "analyse.h"

/* Move `count` frames from src to dst, then wake both queues' waiters. */
static void x264_lookahead_shift( x264_sync_frame_list_t *dst, x264_sync_frame_list_t *src, int count )
{
    int i = count;
    while( i-- )
    {
        assert( dst->i_size < dst->i_max_size );
        assert( src->i_size );
        dst->list[ dst->i_size++ ] = x264_frame_shift( src->list );
        src->i_size--;
    }
    if( count )
    {
        x264_pthread_cond_broadcast( &dst->cv_fill );
        x264_pthread_cond_broadcast( &src->cv_empty );
    }
}

static void x264_lookahead_update_last_nonb( x264_t *h, x264_frame_t *new_nonb )
{
    if( h->lookahead->last_nonb )
        x264_frame_push_unused( h, h->lookahead->last_nonb );
    h->lookahead->last_nonb = new_nonb;
    new_nonb->i_reference_count++;
}

#if HAVE_THREAD
static void x264_lookahead_slicetype_decide( x264_t *h )
{
    x264_stack_align( x264_slicetype_decide, h );

    x264_lookahead_update_last_nonb( h, h->lookahead->next.list[0] );

    x264_pthread_mutex_lock( &h->lookahead->ofbuf.mutex );
    while( h->lookahead->ofbuf.i_size == h->lookahead->ofbuf.i_max_size )
        x264_pthread_cond_wait( &h->lookahead->ofbuf.cv_empty, &h->lookahead->ofbuf.mutex );

    x264_pthread_mutex_lock( &h->lookahead->next.mutex );
    x264_lookahead_shift( &h->lookahead->ofbuf, &h->lookahead->next, h->lookahead->next.list[0]->i_bframes + 1 );
    x264_pthread_mutex_unlock( &h->lookahead->next.mutex );

    /* For MB-tree and VBV lookahead, we have to perform propagation analysis on I-frames too. */
    if( h->lookahead->b_analyse_keyframe && IS_X264_TYPE_I( h->lookahead->last_nonb->i_type ) )
        x264_stack_align( x264_slicetype_analyse, h, 1 );

    x264_pthread_mutex_unlock( &h->lookahead->ofbuf.mutex );
}

static void x264_lookahead_thread( x264_t *h )
{
    int shift;
#if HAVE_MMX
    if( h->param.cpu&X264_CPU_SSE_MISALIGN )
        x264_cpu_mask_misalign_sse();
#endif
    while( !h->lookahead->b_exit_thread )
    {
        x264_pthread_mutex_lock( &h->lookahead->ifbuf.mutex );
        x264_pthread_mutex_lock( &h->lookahead->next.mutex );
        shift = X264_MIN( h->lookahead->next.i_max_size - h->lookahead->next.i_size, h->lookahead->ifbuf.i_size );
        x264_lookahead_shift( &h->lookahead->next, &h->lookahead->ifbuf, shift );
        x264_pthread_mutex_unlock( &h->lookahead->next.mutex );
        if( h->lookahead->next.i_size <= h->lookahead->i_slicetype_length + h->param.b_vfr_input )
        {
            while( !h->lookahead->ifbuf.i_size && !h->lookahead->b_exit_thread )
                x264_pthread_cond_wait( &h->lookahead->ifbuf.cv_fill, &h->lookahead->ifbuf.mutex );
            x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex );
        }
        else
        {
            x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex );
            x264_lookahead_slicetype_decide( h );
        }
    }   /* end of input frames */
    x264_pthread_mutex_lock( &h->lookahead->ifbuf.mutex );
    x264_pthread_mutex_lock( &h->lookahead->next.mutex );
    x264_lookahead_shift( &h->lookahead->next, &h->lookahead->ifbuf, h->lookahead->ifbuf.i_size );
    x264_pthread_mutex_unlock( &h->lookahead->next.mutex );
    x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex );
    while( h->lookahead->next.i_size )
        x264_lookahead_slicetype_decide( h );
    x264_pthread_mutex_lock( &h->lookahead->ofbuf.mutex );
    h->lookahead->b_thread_active = 0;
    x264_pthread_cond_broadcast( &h->lookahead->ofbuf.cv_fill );
    x264_pthread_mutex_unlock( &h->lookahead->ofbuf.mutex );
}
#endif

int x264_lookahead_init( x264_t *h, int i_slicetype_length )
{
    x264_lookahead_t *look;
    CHECKED_MALLOCZERO( look, sizeof(x264_lookahead_t) );
    for( int i = 0; i < h->param.i_threads; i++ )
        h->thread[i]->lookahead = look;

    look->i_last_keyframe = - h->param.i_keyint_max;
    look->b_analyse_keyframe = (h->param.rc.b_mb_tree || (h->param.rc.i_vbv_buffer_size && h->param.rc.i_lookahead))
                               && !h->param.rc.b_stat_read;
    look->i_slicetype_length = i_slicetype_length;

    /* init frame lists */
    if( x264_sync_frame_list_init( &look->ifbuf, h->param.i_sync_lookahead+3 ) ||
        x264_sync_frame_list_init( &look->next, h->frames.i_delay+3 ) ||
        x264_sync_frame_list_init( &look->ofbuf, h->frames.i_delay+3 ) )
        goto fail;

    if( !h->param.i_sync_lookahead )
        return 0;

    x264_t *look_h = h->thread[h->param.i_threads];
    *look_h = *h;
    if( x264_macroblock_cache_allocate( look_h ) )
        goto fail;

    if( x264_macroblock_thread_allocate( look_h, 1 ) < 0 )
        goto fail;

    if( x264_pthread_create( &look->thread_handle, NULL, (void*)x264_lookahead_thread, look_h ) )
        goto fail;
    look->b_thread_active = 1;

    return 0;
fail:
    x264_free( look );
    return -1;
}

void x264_lookahead_delete( x264_t *h )
{
    if( h->param.i_sync_lookahead )
    {
        x264_pthread_mutex_lock( &h->lookahead->ifbuf.mutex );
        h->lookahead->b_exit_thread = 1;
        x264_pthread_cond_broadcast( &h->lookahead->ifbuf.cv_fill );
        x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex );
        x264_pthread_join( h->lookahead->thread_handle, NULL );
        x264_macroblock_cache_free( h->thread[h->param.i_threads] );
        x264_macroblock_thread_free( h->thread[h->param.i_threads], 1 );
        x264_free( h->thread[h->param.i_threads] );
    }
    x264_sync_frame_list_delete( &h->lookahead->ifbuf );
    x264_sync_frame_list_delete( &h->lookahead->next );
    if( h->lookahead->last_nonb )
        x264_frame_push_unused( h, h->lookahead->last_nonb );
    x264_sync_frame_list_delete( &h->lookahead->ofbuf );
    x264_free( h->lookahead );
}

void x264_lookahead_put_frame( x264_t *h, x264_frame_t *frame )
{
    if( h->param.i_sync_lookahead )
        x264_sync_frame_list_push( &h->lookahead->ifbuf, frame );
    else
        x264_sync_frame_list_push( &h->lookahead->next, frame );
}

int x264_lookahead_is_empty( x264_t *h )
{
    x264_pthread_mutex_lock( &h->lookahead->ofbuf.mutex );
    x264_pthread_mutex_lock( &h->lookahead->next.mutex );
    int b_empty = !h->lookahead->next.i_size && !h->lookahead->ofbuf.i_size;
    x264_pthread_mutex_unlock( &h->lookahead->next.mutex );
    x264_pthread_mutex_unlock( &h->lookahead->ofbuf.mutex );
    return b_empty;
}

static void x264_lookahead_encoder_shift( x264_t *h )
{
    if( !h->lookahead->ofbuf.i_size )
        return;
    int i_frames = h->lookahead->ofbuf.list[0]->i_bframes + 1;
    while( i_frames-- )
    {
        x264_frame_push( h->frames.current, x264_frame_shift( h->lookahead->ofbuf.list ) );
        h->lookahead->ofbuf.i_size--;
    }
    x264_pthread_cond_broadcast( &h->lookahead->ofbuf.cv_empty );
}

void x264_lookahead_get_frames( x264_t *h )
{
    if( h->param.i_sync_lookahead )
    {   /* We have a lookahead thread, so get frames from there */
        x264_pthread_mutex_lock( &h->lookahead->ofbuf.mutex );
        while( !h->lookahead->ofbuf.i_size && h->lookahead->b_thread_active )
            x264_pthread_cond_wait( &h->lookahead->ofbuf.cv_fill, &h->lookahead->ofbuf.mutex );
        x264_lookahead_encoder_shift( h );
        x264_pthread_mutex_unlock( &h->lookahead->ofbuf.mutex );
    }
    else
    {   /* We are not running a lookahead thread, so perform all the slicetype decisions on the fly */

        if( h->frames.current[0] || !h->lookahead->next.i_size )
            return;

        x264_stack_align( x264_slicetype_decide, h );
        x264_lookahead_update_last_nonb( h, h->lookahead->next.list[0] );
        x264_lookahead_shift( &h->lookahead->ofbuf, &h->lookahead->next, h->lookahead->next.list[0]->i_bframes + 1 );

        /* For MB-tree and VBV lookahead, we have to perform propagation analysis on I-frames too. */
        if( h->lookahead->b_analyse_keyframe && IS_X264_TYPE_I( h->lookahead->last_nonb->i_type ) )
            x264_stack_align( x264_slicetype_analyse, h, 1 );

        x264_lookahead_encoder_shift( h );
    }
}
x264-snapshot-20120103-2245-stable/encoder/cavlc.c
/*****************************************************************************
 * cavlc.c: cavlc bitstream writing
 *****************************************************************************
 * Copyright (C) 2003-2011 x264 project
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
 *          Jason Garrett-Glaser <darkshikari@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "common/common.h"
#include "macroblock.h"

#ifndef RDO_SKIP_BS
#define RDO_SKIP_BS 0
#endif
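/* When RDO_SKIP_BS is set (this file is also compiled with it defined for RD
 * cost estimation), these functions accumulate bit counts in i_bits_encoded
 * instead of emitting a real bitstream. */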

/* [400,420][inter,intra] */
static const uint8_t cbp_to_golomb[2][2][48] =
{
    {{ 0,  1,  2,  5,  3,  6, 14, 10,  4, 15,  7, 11,  8, 12, 13,  9 },
     { 1, 10, 11,  6, 12,  7, 14,  2, 13, 15,  8,  3,  9,  4,  5,  0 }},
    {{ 0,  2,  3,  7,  4,  8, 17, 13,  5, 18,  9, 14, 10, 15, 16, 11,
       1, 32, 33, 36, 34, 37, 44, 40, 35, 45, 38, 41, 39, 42, 43, 19,
       6, 24, 25, 20, 26, 21, 46, 28, 27, 47, 22, 29, 23, 30, 31, 12 },
     { 3, 29, 30, 17, 31, 18, 37,  8, 32, 38, 19,  9, 20, 10, 11,  2,
      16, 33, 34, 21, 35, 22, 39,  4, 36, 40, 23,  5, 24,  6,  7,  1,
      41, 42, 43, 25, 44, 26, 46, 12, 45, 47, 27, 13, 28, 14, 15,  0 }}
};

static const uint8_t mb_type_b_to_golomb[3][9]=
{
    { 4,  8, 12, 10,  6, 14, 16, 18, 20 }, /* D_16x8 */
    { 5,  9, 13, 11,  7, 15, 17, 19, 21 }, /* D_8x16 */
    { 1, -1, -1, -1,  2, -1, -1, -1,  3 }  /* D_16x16 */
};

static const uint8_t subpartition_p_to_golomb[4]=
{
    3, 1, 2, 0
};

static const uint8_t subpartition_b_to_golomb[13]=
{
    10,  4,  5,  1, 11,  6,  7,  2, 12,  8,  9,  3,  0
};

#define bs_write_vlc(s,v) bs_write( s, (v).i_size, (v).i_bits )

/****************************************************************************
 * x264_cavlc_block_residual:
 ****************************************************************************/
static inline int x264_cavlc_block_residual_escape( x264_t *h, int i_suffix_length, int level )
{
    bs_t *s = &h->out.bs;
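    /* Thresholds at which the suffix length adapts: per CAVLC rules it
     * increments once the absolute level exceeds 3 << (suffix_length - 1). */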
    static const uint16_t next_suffix[7] = { 0, 3, 6, 12, 24, 48, 0xffff };
    int i_level_prefix = 15;
    int mask = level >> 31;
    int abs_level = (level^mask)-mask;
    int i_level_code = abs_level*2-mask-2;
    if( ( i_level_code >> i_suffix_length ) < 15 )
    {
        bs_write( s, (i_level_code >> i_suffix_length) + 1 + i_suffix_length,
                 (1<<i_suffix_length) + (i_level_code & ((1<<i_suffix_length)-1)) );
    }
    else
    {
        i_level_code -= 15 << i_suffix_length;
        if( i_suffix_length == 0 )
            i_level_code -= 15;

        /* If the prefix size exceeds 15, High Profile is required. */
        if( i_level_code >= 1<<12 )
        {
            if( h->sps->i_profile_idc >= PROFILE_HIGH )
            {
                while( i_level_code > 1<<(i_level_prefix-3) )
                {
                    i_level_code -= 1<<(i_level_prefix-3);
                    i_level_prefix++;
                }
            }
            else
            {
#if RDO_SKIP_BS
                /* Weight highly against overflows. */
                s->i_bits_encoded += 2000;
#else
                /* We've had an overflow; note it down and re-encode the MB later. */
                h->mb.b_overflow = 1;
#endif
            }
        }
        bs_write( s, i_level_prefix + 1, 1 );
        bs_write( s, i_level_prefix - 3, i_level_code & ((1<<(i_level_prefix-3))-1) );
    }
    if( i_suffix_length == 0 )
        i_suffix_length++;
    if( abs_level > next_suffix[i_suffix_length] )
        i_suffix_length++;
    return i_suffix_length;
}

static int x264_cavlc_block_residual_internal( x264_t *h, int ctx_block_cat, dctcoef *l, int nC )
{
    bs_t *s = &h->out.bs;
    static const uint8_t ctz_index[8] = {3,0,1,0,2,0,1,0};
    static const uint8_t count_cat[14] = {16, 15, 16, 0, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64};
    x264_run_level_t runlevel;
    int i_total, i_trailing, i_total_zero, i_suffix_length;
    unsigned int i_sign;

    /* level and run and total */
    /* set these to 2 to allow branchless i_trailing calculation */
    runlevel.level[1] = 2;
    runlevel.level[2] = 2;
    i_total = h->quantf.coeff_level_run[ctx_block_cat]( l, &runlevel );
    i_total_zero = runlevel.last + 1 - i_total;

    /* Branchless trailing-ones count (max 3): each masked term is set iff
     * abs(level) > 1; the sentinel levels above keep this valid when fewer
     * than three coefficients are present. */
    i_trailing = ((((runlevel.level[0]+1) | (1-runlevel.level[0])) >> 31) & 1) // abs(runlevel.level[0])>1
               | ((((runlevel.level[1]+1) | (1-runlevel.level[1])) >> 31) & 2)
               | ((((runlevel.level[2]+1) | (1-runlevel.level[2])) >> 31) & 4);
    i_trailing = ctz_index[i_trailing];
    i_sign = ((runlevel.level[2] >> 31) & 1)
           | ((runlevel.level[1] >> 31) & 2)
           | ((runlevel.level[0] >> 31) & 4);
    i_sign >>= 3-i_trailing;

    /* total/trailing */
    bs_write_vlc( s, x264_coeff_token[nC][i_total-1][i_trailing] );

    i_suffix_length = i_total > 10 && i_trailing < 3;
    bs_write( s, i_trailing, i_sign );

    if( i_trailing < i_total )
    {
        int val = runlevel.level[i_trailing];
        int val_original = runlevel.level[i_trailing]+LEVEL_TABLE_SIZE/2;
        val -= ((val>>31)|1) & -(i_trailing < 3); /* as runlevel.level[i] can't be 1 for the first one if i_trailing < 3 */
        val += LEVEL_TABLE_SIZE/2;

        if( (unsigned)val_original < LEVEL_TABLE_SIZE )
        {
            bs_write_vlc( s, x264_level_token[i_suffix_length][val] );
            i_suffix_length = x264_level_token[i_suffix_length][val_original].i_next;
        }
        else
            i_suffix_length = x264_cavlc_block_residual_escape( h, i_suffix_length, val-LEVEL_TABLE_SIZE/2 );
        for( int i = i_trailing+1; i < i_total; i++ )
        {
            val = runlevel.level[i] + LEVEL_TABLE_SIZE/2;
            if( (unsigned)val < LEVEL_TABLE_SIZE )
            {
                bs_write_vlc( s, x264_level_token[i_suffix_length][val] );
                i_suffix_length = x264_level_token[i_suffix_length][val].i_next;
            }
            else
                i_suffix_length = x264_cavlc_block_residual_escape( h, i_suffix_length, val-LEVEL_TABLE_SIZE/2 );
        }
    }

    if( ctx_block_cat == DCT_CHROMA_DC )
    {
        if( i_total < 8>>CHROMA_V_SHIFT )
        {
            vlc_t total_zeros = CHROMA_FORMAT == CHROMA_420 ? x264_total_zeros_2x2_dc[i_total-1][i_total_zero]
                                                            : x264_total_zeros_2x4_dc[i_total-1][i_total_zero];
            bs_write_vlc( s, total_zeros );
        }
    }
    else if( (uint8_t)i_total < count_cat[ctx_block_cat] )
        bs_write_vlc( s, x264_total_zeros[i_total-1][i_total_zero] );

    for( int i = 0; i < i_total-1 && i_total_zero > 0; i++ )
    {
        int i_zl = X264_MIN( i_total_zero, 7 );
        bs_write_vlc( s, x264_run_before[i_zl-1][runlevel.run[i]] );
        i_total_zero -= runlevel.run[i];
    }

    return i_total;
}

/* Map nC (the predicted non-zero count from neighboring blocks) to one of the
 * four coeff_token VLC tables: nC 0-1, 2-3, 4-7 and >=8. */
static const uint8_t ct_index[17] = {0,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,3};

#define x264_cavlc_block_residual(h,cat,idx,l)\
{\
    int nC = cat == DCT_CHROMA_DC ? 5 - CHROMA_V_SHIFT\
                                  : ct_index[x264_mb_predict_non_zero_code( h, cat == DCT_LUMA_DC ? (idx - LUMA_DC)*16 : idx )];\
    uint8_t *nnz = &h->mb.cache.non_zero_count[x264_scan8[idx]];\
    if( !*nnz )\
        bs_write_vlc( &h->out.bs, x264_coeff0_token[nC] );\
    else\
        *nnz = x264_cavlc_block_residual_internal(h,cat,l,nC);\
}

static void x264_cavlc_qp_delta( x264_t *h )
{
    bs_t *s = &h->out.bs;
    int i_dqp = h->mb.i_qp - h->mb.i_last_qp;

    /* Avoid writing a delta quant if we have an empty i16x16 block, e.g. in a completely flat background area */
    if( h->mb.i_type == I_16x16 && !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma)
        && !h->mb.cache.non_zero_count[x264_scan8[LUMA_DC]]
        && !h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+0]]
        && !h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] )
    {
#if !RDO_SKIP_BS
        h->mb.i_qp = h->mb.i_last_qp;
#endif
        i_dqp = 0;
    }

    /* Wrap the delta into [-(QP_MAX_SPEC+1)/2, QP_MAX_SPEC/2] ([-26,25] for the
     * 8-bit QP range), mirroring the decoder's modular reconstruction. */
    if( i_dqp )
    {
        if( i_dqp < -(QP_MAX_SPEC+1)/2 )
            i_dqp += QP_MAX_SPEC+1;
        else if( i_dqp > QP_MAX_SPEC/2 )
            i_dqp -= QP_MAX_SPEC+1;
    }
    bs_write_se( s, i_dqp );
}

static void x264_cavlc_mvd( x264_t *h, int i_list, int idx, int width )
{
    bs_t *s = &h->out.bs;
    ALIGNED_4( int16_t mvp[2] );
    x264_mb_predict_mv( h, i_list, idx, width, mvp );
    bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[idx]][0] - mvp[0] );
    bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[idx]][1] - mvp[1] );
}

static inline void x264_cavlc_8x8_mvd( x264_t *h, int i )
{
    switch( h->mb.i_sub_partition[i] )
    {
        case D_L0_8x8:
            x264_cavlc_mvd( h, 0, 4*i, 2 );
            break;
        case D_L0_8x4:
            x264_cavlc_mvd( h, 0, 4*i+0, 2 );
            x264_cavlc_mvd( h, 0, 4*i+2, 2 );
            break;
        case D_L0_4x8:
            x264_cavlc_mvd( h, 0, 4*i+0, 1 );
            x264_cavlc_mvd( h, 0, 4*i+1, 1 );
            break;
        case D_L0_4x4:
            x264_cavlc_mvd( h, 0, 4*i+0, 1 );
            x264_cavlc_mvd( h, 0, 4*i+1, 1 );
            x264_cavlc_mvd( h, 0, 4*i+2, 1 );
            x264_cavlc_mvd( h, 0, 4*i+3, 1 );
            break;
    }
}

static inline void x264_cavlc_macroblock_luma_residual( x264_t *h, int i8start, int i8end )
{
    if( h->mb.b_transform_8x8 )
    {
        /* shuffle 8x8 dct coeffs into 4x4 lists */
        for( int i8 = i8start; i8 <= i8end; i8++ )
            if( h->mb.cache.non_zero_count[x264_scan8[i8*4]] )
                h->zigzagf.interleave_8x8_cavlc( h->dct.luma4x4[i8*4], h->dct.luma8x8[i8], &h->mb.cache.non_zero_count[x264_scan8[i8*4]] );
    }

    for( int i8 = i8start; i8 <= i8end; i8++ )
        if( h->mb.i_cbp_luma & (1 << (i8&3)) )
            for( int i4 = 0; i4 < 4; i4++ )
                x264_cavlc_block_residual( h, DCT_LUMA_4x4, i4+i8*4, h->dct.luma4x4[i4+i8*4] );
}

static void x264_cavlc_mb_header_i( x264_t *h, int i_mb_type, int i_mb_i_offset, int chroma )
{
    bs_t *s = &h->out.bs;
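    /* For I_16x16 the prediction mode, the chroma CBP (0..2) and a
     * luma-CBP-nonzero flag are jointly coded into the mb_type codeword. */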
    if( i_mb_type == I_16x16 )
    {
        bs_write_ue( s, i_mb_i_offset + 1 + x264_mb_pred_mode16x16_fix[h->mb.i_intra16x16_pred_mode] +
                        h->mb.i_cbp_chroma * 4 + ( h->mb.i_cbp_luma == 0 ? 0 : 12 ) );
    }
    else //if( i_mb_type == I_4x4 || i_mb_type == I_8x8 )
    {
        int di = i_mb_type == I_8x8 ? 4 : 1;
        bs_write_ue( s, i_mb_i_offset + 0 );
        if( h->pps->b_transform_8x8_mode )
            bs_write1( s, h->mb.b_transform_8x8 );

        /* Prediction: Luma */
        for( int i = 0; i < 16; i += di )
        {
            int i_pred = x264_mb_predict_intra4x4_mode( h, i );
            int i_mode = x264_mb_pred_mode4x4_fix( h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] );

            if( i_pred == i_mode )
                bs_write1( s, 1 );  /* b_prev_intra4x4_pred_mode */
            else
                bs_write( s, 4, i_mode - (i_mode > i_pred) );
        }

    }
    if( chroma )
        bs_write_ue( s, x264_mb_chroma_pred_mode_fix[h->mb.i_chroma_pred_mode] );
}

static ALWAYS_INLINE void x264_cavlc_mb_header_p( x264_t *h, int i_mb_type, int chroma )
{
    bs_t *s = &h->out.bs;
    if( i_mb_type == P_L0 )
    {
        if( h->mb.i_partition == D_16x16 )
        {
            bs_write1( s, 1 );

            if( h->mb.pic.i_fref[0] > 1 )
                bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
            x264_cavlc_mvd( h, 0, 0, 4 );
        }
        else if( h->mb.i_partition == D_16x8 )
        {
            bs_write_ue( s, 1 );
            if( h->mb.pic.i_fref[0] > 1 )
            {
                bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
                bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[8]] );
            }
            x264_cavlc_mvd( h, 0, 0, 4 );
            x264_cavlc_mvd( h, 0, 8, 4 );
        }
        else if( h->mb.i_partition == D_8x16 )
        {
            bs_write_ue( s, 2 );
            if( h->mb.pic.i_fref[0] > 1 )
            {
                bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
                bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[4]] );
            }
            x264_cavlc_mvd( h, 0, 0, 2 );
            x264_cavlc_mvd( h, 0, 4, 2 );
        }
    }
    else if( i_mb_type == P_8x8 )
    {
        int b_sub_ref;
        if( (h->mb.cache.ref[0][x264_scan8[0]] | h->mb.cache.ref[0][x264_scan8[ 4]] |
             h->mb.cache.ref[0][x264_scan8[8]] | h->mb.cache.ref[0][x264_scan8[12]]) == 0 )
        {
            bs_write_ue( s, 4 );
            b_sub_ref = 0;
        }
        else
        {
            bs_write_ue( s, 3 );
            b_sub_ref = 1;
        }

        /* sub mb type */
        if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
            for( int i = 0; i < 4; i++ )
                bs_write_ue( s, subpartition_p_to_golomb[ h->mb.i_sub_partition[i] ] );
        else
            bs_write( s, 4, 0xf );

        /* ref0 */
        if( b_sub_ref )
        {
            bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
            bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[4]] );
            bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[8]] );
            bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[12]] );
        }

        for( int i = 0; i < 4; i++ )
            x264_cavlc_8x8_mvd( h, i );
    }
    else //if( IS_INTRA( i_mb_type ) )
        x264_cavlc_mb_header_i( h, i_mb_type, 5, chroma );
}

static ALWAYS_INLINE void x264_cavlc_mb_header_b( x264_t *h, int i_mb_type, int chroma )
{
    bs_t *s = &h->out.bs;
    if( i_mb_type == B_8x8 )
    {
        bs_write_ue( s, 22 );

        /* sub mb type */
        for( int i = 0; i < 4; i++ )
            bs_write_ue( s, subpartition_b_to_golomb[ h->mb.i_sub_partition[i] ] );

        /* ref */
        if( h->mb.pic.i_fref[0] > 1 )
            for( int i = 0; i < 4; i++ )
                if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i] ] )
                    bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[i*4]] );
        if( h->mb.pic.i_fref[1] > 1 )
            for( int i = 0; i < 4; i++ )
                if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i] ] )
                    bs_write_te( s, h->mb.pic.i_fref[1] - 1, h->mb.cache.ref[1][x264_scan8[i*4]] );

        /* mvd */
        for( int i = 0; i < 4; i++ )
            if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i] ] )
                x264_cavlc_mvd( h, 0, 4*i, 2 );
        for( int i = 0; i < 4; i++ )
            if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i] ] )
                x264_cavlc_mvd( h, 1, 4*i, 2 );
    }
    else if( i_mb_type >= B_L0_L0 && i_mb_type <= B_BI_BI )
    {
        /* All B mode */
        /* Motion Vector */
        const uint8_t (*b_list)[2] = x264_mb_type_list_table[i_mb_type];
        const int i_ref0_max = h->mb.pic.i_fref[0] - 1;
        const int i_ref1_max = h->mb.pic.i_fref[1] - 1;

        bs_write_ue( s, mb_type_b_to_golomb[ h->mb.i_partition - D_16x8 ][ i_mb_type - B_L0_L0 ] );
        if( h->mb.i_partition == D_16x16 )
        {
            if( i_ref0_max && b_list[0][0] ) bs_write_te( s, i_ref0_max, h->mb.cache.ref[0][x264_scan8[0]] );
            if( i_ref1_max && b_list[1][0] ) bs_write_te( s, i_ref1_max, h->mb.cache.ref[1][x264_scan8[0]] );
            if( b_list[0][0] ) x264_cavlc_mvd( h, 0, 0, 4 );
            if( b_list[1][0] ) x264_cavlc_mvd( h, 1, 0, 4 );
        }
        else
        {
            if( i_ref0_max && b_list[0][0] ) bs_write_te( s, i_ref0_max, h->mb.cache.ref[0][x264_scan8[ 0]] );
            if( i_ref0_max && b_list[0][1] ) bs_write_te( s, i_ref0_max, h->mb.cache.ref[0][x264_scan8[12]] );
            if( i_ref1_max && b_list[1][0] ) bs_write_te( s, i_ref1_max, h->mb.cache.ref[1][x264_scan8[ 0]] );
            if( i_ref1_max && b_list[1][1] ) bs_write_te( s, i_ref1_max, h->mb.cache.ref[1][x264_scan8[12]] );
            if( h->mb.i_partition == D_16x8 )
            {
                if( b_list[0][0] ) x264_cavlc_mvd( h, 0, 0, 4 );
                if( b_list[0][1] ) x264_cavlc_mvd( h, 0, 8, 4 );
                if( b_list[1][0] ) x264_cavlc_mvd( h, 1, 0, 4 );
                if( b_list[1][1] ) x264_cavlc_mvd( h, 1, 8, 4 );
            }
            else //if( h->mb.i_partition == D_8x16 )
            {
                if( b_list[0][0] ) x264_cavlc_mvd( h, 0, 0, 2 );
                if( b_list[0][1] ) x264_cavlc_mvd( h, 0, 4, 2 );
                if( b_list[1][0] ) x264_cavlc_mvd( h, 1, 0, 2 );
                if( b_list[1][1] ) x264_cavlc_mvd( h, 1, 4, 2 );
            }
        }
    }
    else if( i_mb_type == B_DIRECT )
        bs_write1( s, 1 );
    else //if( IS_INTRA( i_mb_type ) )
        x264_cavlc_mb_header_i( h, i_mb_type, 23, chroma );
}

/*****************************************************************************
 * x264_macroblock_write:
 *****************************************************************************/
void x264_macroblock_write_cavlc( x264_t *h )
{
    bs_t *s = &h->out.bs;
    const int i_mb_type = h->mb.i_type;
    int plane_count = CHROMA444 ? 3 : 1;
    int chroma = !CHROMA444;

#if RDO_SKIP_BS
    s->i_bits_encoded = 0;
#else
    const int i_mb_pos_start = bs_pos( s );
    int       i_mb_pos_tex;
#endif

    /* MBAFF: mb_field_decoding_flag is sent on the top MB of a pair, or on the
     * bottom MB when the top one was skipped. */
    if( SLICE_MBAFF
        && (!(h->mb.i_mb_y & 1) || IS_SKIP(h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride])) )
    {
        bs_write1( s, MB_INTERLACED );
    }

#if !RDO_SKIP_BS
    if( i_mb_type == I_PCM )
    {
        static const uint8_t i_offsets[3] = {5,23,0};
        uint8_t *p_start = s->p_start;
        bs_write_ue( s, i_offsets[h->sh.i_type] + 25 );
        i_mb_pos_tex = bs_pos( s );
        h->stat.frame.i_mv_bits += i_mb_pos_tex - i_mb_pos_start;

        bs_align_0( s );

        for( int p = 0; p < plane_count; p++ )
            for( int i = 0; i < 256; i++ )
                bs_write( s, BIT_DEPTH, h->mb.pic.p_fenc[p][i] );
        if( chroma )
            for( int ch = 1; ch < 3; ch++ )
                for( int i = 0; i < 16>>CHROMA_V_SHIFT; i++ )
                    for( int j = 0; j < 8; j++ )
                        bs_write( s, BIT_DEPTH, h->mb.pic.p_fenc[ch][i*FENC_STRIDE+j] );

        bs_init( s, s->p, s->p_end - s->p );
        s->p_start = p_start;

        h->stat.frame.i_tex_bits += bs_pos(s) - i_mb_pos_tex;
        return;
    }
#endif

    if( h->sh.i_type == SLICE_TYPE_P )
        x264_cavlc_mb_header_p( h, i_mb_type, chroma );
    else if( h->sh.i_type == SLICE_TYPE_B )
        x264_cavlc_mb_header_b( h, i_mb_type, chroma );
    else //if( h->sh.i_type == SLICE_TYPE_I )
        x264_cavlc_mb_header_i( h, i_mb_type, 0, chroma );

#if !RDO_SKIP_BS
    i_mb_pos_tex = bs_pos( s );
    h->stat.frame.i_mv_bits += i_mb_pos_tex - i_mb_pos_start;
#endif

    /* Coded block pattern */
    if( i_mb_type != I_16x16 )
        bs_write_ue( s, cbp_to_golomb[chroma][IS_INTRA(i_mb_type)][(h->mb.i_cbp_chroma << 4)|h->mb.i_cbp_luma] );

    /* transform size 8x8 flag */
    if( x264_mb_transform_8x8_allowed( h ) && h->mb.i_cbp_luma )
        bs_write1( s, h->mb.b_transform_8x8 );

    if( i_mb_type == I_16x16 )
    {
        x264_cavlc_qp_delta( h );

        /* DC Luma */
        for( int p = 0; p < plane_count; p++ )
        {
            x264_cavlc_block_residual( h, DCT_LUMA_DC, LUMA_DC+p, h->dct.luma16x16_dc[p] );

            /* AC Luma */
            if( h->mb.i_cbp_luma )
                for( int i = p*16; i < p*16+16; i++ )
                    x264_cavlc_block_residual( h, DCT_LUMA_AC, i, h->dct.luma4x4[i]+1 );
        }
    }
    else if( h->mb.i_cbp_luma | h->mb.i_cbp_chroma )
    {
        x264_cavlc_qp_delta( h );
        x264_cavlc_macroblock_luma_residual( h, 0, plane_count*4-1 );
    }
    if( h->mb.i_cbp_chroma )
    {
        /* Chroma DC residual present */
        x264_cavlc_block_residual( h, DCT_CHROMA_DC, CHROMA_DC+0, h->dct.chroma_dc[0] );
        x264_cavlc_block_residual( h, DCT_CHROMA_DC, CHROMA_DC+1, h->dct.chroma_dc[1] );
        if( h->mb.i_cbp_chroma == 2 ) /* Chroma AC residual present */
        {
            int step = 8 << CHROMA_V_SHIFT;
            for( int i = 16; i < 3*16; i += step )
                for( int j = i; j < i+4; j++ )
                    x264_cavlc_block_residual( h, DCT_CHROMA_AC, j, h->dct.luma4x4[j]+1 );
        }
    }

#if !RDO_SKIP_BS
    h->stat.frame.i_tex_bits += bs_pos(s) - i_mb_pos_tex;
#endif
}

#if RDO_SKIP_BS
/*****************************************************************************
 * RD only; doesn't generate a valid bitstream
 * doesn't write cbp or chroma dc (I don't know how much this matters)
 * doesn't write ref (never varies between calls, so no point in doing so)
 * only writes subpartition for p8x8, needed for sub-8x8 mode decision RDO
 * works on all partition sizes except 16x16
 *****************************************************************************/
static int x264_partition_size_cavlc( x264_t *h, int i8, int i_pixel )
{
    bs_t *s = &h->out.bs;
    const int i_mb_type = h->mb.i_type;
    int b_8x16 = h->mb.i_partition == D_8x16;
    int plane_count = CHROMA444 ? 3 : 1;
    int j;

    if( i_mb_type == P_8x8 )
    {
        x264_cavlc_8x8_mvd( h, i8 );
        bs_write_ue( s, subpartition_p_to_golomb[ h->mb.i_sub_partition[i8] ] );
    }
    else if( i_mb_type == P_L0 )
        x264_cavlc_mvd( h, 0, 4*i8, 4>>b_8x16 );
    else if( i_mb_type > B_DIRECT && i_mb_type < B_8x8 )
    {
        if( x264_mb_type_list_table[ i_mb_type ][0][!!i8] ) x264_cavlc_mvd( h, 0, 4*i8, 4>>b_8x16 );
        if( x264_mb_type_list_table[ i_mb_type ][1][!!i8] ) x264_cavlc_mvd( h, 1, 4*i8, 4>>b_8x16 );
    }
    else //if( i_mb_type == B_8x8 )
    {
        if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i8] ] )
            x264_cavlc_mvd( h, 0, 4*i8, 2 );
        if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i8] ] )
            x264_cavlc_mvd( h, 1, 4*i8, 2 );
    }

    for( j = (i_pixel < PIXEL_8x8); j >= 0; j-- )
    {
        for( int p = 0; p < plane_count; p++ )
            x264_cavlc_macroblock_luma_residual( h, p*4+i8, p*4+i8 );
        if( h->mb.i_cbp_chroma )
        {
            if( CHROMA_FORMAT == CHROMA_422 )
            {
                int offset = (5*i8) & 0x09;
                x264_cavlc_block_residual( h, DCT_CHROMA_AC, 16+offset, h->dct.luma4x4[16+offset]+1 );
                x264_cavlc_block_residual( h, DCT_CHROMA_AC, 18+offset, h->dct.luma4x4[18+offset]+1 );
                x264_cavlc_block_residual( h, DCT_CHROMA_AC, 32+offset, h->dct.luma4x4[32+offset]+1 );
                x264_cavlc_block_residual( h, DCT_CHROMA_AC, 34+offset, h->dct.luma4x4[34+offset]+1 );
            }
            else
            {
                x264_cavlc_block_residual( h, DCT_CHROMA_AC, 16+i8, h->dct.luma4x4[16+i8]+1 );
                x264_cavlc_block_residual( h, DCT_CHROMA_AC, 32+i8, h->dct.luma4x4[32+i8]+1 );
            }
        }
        i8 += x264_pixel_size[i_pixel].h >> 3;
    }

    return h->out.bs.i_bits_encoded;
}

static int x264_subpartition_size_cavlc( x264_t *h, int i4, int i_pixel )
{
    int plane_count = CHROMA444 ? 3 : 1;
    int b_8x4 = i_pixel == PIXEL_8x4;
    h->out.bs.i_bits_encoded = 0;
    x264_cavlc_mvd( h, 0, i4, 1+b_8x4 );
    for( int p = 0; p < plane_count; p++ )
    {
        x264_cavlc_block_residual( h, DCT_LUMA_4x4, p*16+i4, h->dct.luma4x4[p*16+i4] );
        if( i_pixel != PIXEL_4x4 )
            x264_cavlc_block_residual( h, DCT_LUMA_4x4, p*16+i4+2-b_8x4, h->dct.luma4x4[p*16+i4+2-b_8x4] );
    }

    return h->out.bs.i_bits_encoded;
}

static int x264_cavlc_intra4x4_pred_size( x264_t *h, int i4, int i_mode )
{
    if( x264_mb_predict_intra4x4_mode( h, i4 ) == x264_mb_pred_mode4x4_fix( i_mode ) )
        return 1;
    else
        return 4;
}

static int x264_partition_i8x8_size_cavlc( x264_t *h, int i8, int i_mode )
{
    int plane_count = CHROMA444 ? 3 : 1;
    h->out.bs.i_bits_encoded = x264_cavlc_intra4x4_pred_size( h, 4*i8, i_mode );
    bs_write_ue( &h->out.bs, cbp_to_golomb[!CHROMA444][1][(h->mb.i_cbp_chroma << 4)|h->mb.i_cbp_luma] );
    for( int p = 0; p < plane_count; p++ )
        x264_cavlc_macroblock_luma_residual( h, p*4+i8, p*4+i8 );
    return h->out.bs.i_bits_encoded;
}

static int x264_partition_i4x4_size_cavlc( x264_t *h, int i4, int i_mode )
{
    int plane_count = CHROMA444 ? 3 : 1;
    h->out.bs.i_bits_encoded = x264_cavlc_intra4x4_pred_size( h, i4, i_mode );
    for( int p = 0; p < plane_count; p++ )
        x264_cavlc_block_residual( h, DCT_LUMA_4x4, p*16+i4, h->dct.luma4x4[p*16+i4] );
    return h->out.bs.i_bits_encoded;
}

static int x264_chroma_size_cavlc( x264_t *h )
{
    h->out.bs.i_bits_encoded = bs_size_ue( x264_mb_chroma_pred_mode_fix[h->mb.i_chroma_pred_mode] );
    if( h->mb.i_cbp_chroma )
    {
        x264_cavlc_block_residual( h, DCT_CHROMA_DC, CHROMA_DC+0, h->dct.chroma_dc[0] );
        x264_cavlc_block_residual( h, DCT_CHROMA_DC, CHROMA_DC+1, h->dct.chroma_dc[1] );

        if( h->mb.i_cbp_chroma == 2 )
        {
            int step = 8 << CHROMA_V_SHIFT;
            for( int i = 16; i < 3*16; i += step )
                for( int j = i; j < i+4; j++ )
                    x264_cavlc_block_residual( h, DCT_CHROMA_AC, j, h->dct.luma4x4[j]+1 );
        }
    }
    return h->out.bs.i_bits_encoded;
}
#endif
x264-snapshot-20120103-2245-stable/encoder/analyse.h
/*****************************************************************************
 * analyse.h: macroblock analysis
 *****************************************************************************
 * Copyright (C) 2003-2011 x264 project
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#ifndef X264_ANALYSE_H
#define X264_ANALYSE_H

float *x264_analyse_prepare_costs( x264_t *h );
int x264_analyse_init_costs( x264_t *h, float *logs, int qp );
void x264_analyse_free_costs( x264_t *h );
void x264_analyse_weight_frame( x264_t *h, int end );
void x264_macroblock_analyse( x264_t *h );
void x264_slicetype_decide( x264_t *h );

void x264_slicetype_analyse( x264_t *h, int keyframe );

int x264_weighted_reference_duplicate( x264_t *h, int i_ref, const x264_weight_t *w );

int  x264_lookahead_init( x264_t *h, int i_slicetype_length );
int  x264_lookahead_is_empty( x264_t *h );
void x264_lookahead_put_frame( x264_t *h, x264_frame_t *frame );
void x264_lookahead_get_frames( x264_t *h );
void x264_lookahead_delete( x264_t *h );

#endif
x264-snapshot-20120103-2245-stable/encoder/analyse.c
/*****************************************************************************
 * analyse.c: macroblock analysis
 *****************************************************************************
 * Copyright (C) 2003-2011 x264 project
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
 *          Jason Garrett-Glaser <darkshikari@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#define _ISOC99_SOURCE

#include "common/common.h"
#include "macroblock.h"
#include "me.h"
#include "ratecontrol.h"
#include "analyse.h"
#include "rdo.c"

typedef struct
{
    /* 16x16 */
    int       i_rd16x16;
    x264_me_t me16x16;
    x264_me_t bi16x16;      /* for b16x16 BI mode, since MVs can differ from l0/l1 */

    /* 8x8 */
    int       i_cost8x8;
    /* [ref][0] is 16x16 mv, [ref][1..4] are 8x8 mv from partition [0..3] */
    ALIGNED_4( int16_t mvc[32][5][2] );
    x264_me_t me8x8[4];

    /* Sub 4x4 */
    int       i_cost4x4[4]; /* cost per 8x8 partition */
    x264_me_t me4x4[4][4];

    /* Sub 8x4 */
    int       i_cost8x4[4]; /* cost per 8x8 partition */
    x264_me_t me8x4[4][2];

    /* Sub 4x8 */
    int       i_cost4x8[4]; /* cost per 8x8 partition */
    x264_me_t me4x8[4][2];

    /* 16x8 */
    int       i_cost16x8;
    x264_me_t me16x8[2];

    /* 8x16 */
    int       i_cost8x16;
    x264_me_t me8x16[2];

} x264_mb_analysis_list_t;

typedef struct
{
    /* conduct the analysis using this lambda and QP */
    int i_lambda;
    int i_lambda2;
    int i_qp;
    uint16_t *p_cost_mv;
    uint16_t *p_cost_ref[2];
    int i_mbrd;


    /* I: Intra part */
    /* Take some shortcuts in intra search if intra is deemed unlikely */
    int b_fast_intra;
    int b_force_intra; /* For Periodic Intra Refresh.  Only supported in P-frames. */
    int b_avoid_topright; /* For Periodic Intra Refresh: don't predict from top-right pixels. */
    int b_try_skip;

    /* Luma part */
    int i_satd_i16x16;
    int i_satd_i16x16_dir[7];
    int i_predict16x16;

    int i_satd_i8x8;
    int i_cbp_i8x8_luma;
    ALIGNED_16( uint16_t i_satd_i8x8_dir[4][16] );
    int i_predict8x8[4];

    int i_satd_i4x4;
    int i_predict4x4[16];

    int i_satd_pcm;

    /* Chroma part */
    int i_satd_chroma;
    int i_satd_chroma_dir[7];
    int i_predict8x8chroma;

    /* II: Inter part P/B frame */
    x264_mb_analysis_list_t l0;
    x264_mb_analysis_list_t l1;

    int i_cost16x16bi; /* uses the same ref and mv as l0 and l1 (at least for now) */
    int i_cost16x16direct;
    int i_cost8x8bi;
    int i_cost8x8direct[4];
    int i_satd8x8[3][4]; /* [L0,L1,BI][8x8 0..3] SATD only */
    int i_cost_est16x8[2]; /* Per-partition estimated cost */
    int i_cost_est8x16[2];
    int i_cost16x8bi;
    int i_cost8x16bi;
    int i_rd16x16bi;
    int i_rd16x16direct;
    int i_rd16x8bi;
    int i_rd8x16bi;
    int i_rd8x8bi;

    int i_mb_partition16x8[2]; /* mb_partition_e */
    int i_mb_partition8x16[2];
    int i_mb_type16x8; /* mb_class_e */
    int i_mb_type8x16;

    int b_direct_available;
    int b_early_terminate;

} x264_mb_analysis_t;

/* lambda = pow(2,qp/6-2) */
const uint16_t x264_lambda_tab[QP_MAX_MAX+1] =
{
   1,   1,   1,   1,   1,   1,   1,   1, /*  0- 7 */
   1,   1,   1,   1,   1,   1,   1,   1, /*  8-15 */
   2,   2,   2,   2,   3,   3,   3,   4, /* 16-23 */
   4,   4,   5,   6,   6,   7,   8,   9, /* 24-31 */
  10,  11,  13,  14,  16,  18,  20,  23, /* 32-39 */
  25,  29,  32,  36,  40,  45,  51,  57, /* 40-47 */
  64,  72,  81,  91, 102, 114, 128, 144, /* 48-55 */
 161, 181, 203, 228, 256, 287, 323, 362, /* 56-63 */
 406, 456, 512, 575, 645, 724, 813, 912, /* 64-71 */
1024,1149,1290,1448,1625,1825,2048,2299, /* 72-79 */
2580,2896,                               /* 80-81 */
};
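/* Example: qp=36 gives lambda = 2^(36/6-2) = 2^4 = 16, matching the entry above;
 * each +6 step of QP doubles lambda. */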

/* lambda2 = pow(lambda,2) * .9 * 256 */
/* Capped to avoid overflow */
const int x264_lambda2_tab[QP_MAX_MAX+1] =
{
       14,       18,       22,       28,       36,       45,      57,      72, /*  0- 7 */
       91,      115,      145,      182,      230,      290,     365,     460, /*  8-15 */
      580,      731,      921,     1161,     1462,     1843,    2322,    2925, /* 16-23 */
     3686,     4644,     5851,     7372,     9289,    11703,   14745,   18578, /* 24-31 */
    23407,    29491,    37156,    46814,    58982,    74313,   93628,  117964, /* 32-39 */
   148626,   187257,   235929,   297252,   374514,   471859,  594505,  749029, /* 40-47 */
   943718,  1189010,  1498059,  1887436,  2378021,  2996119, 3774873, 4756042, /* 48-55 */
  5992238,  7549747,  9512085, 11984476, 15099494, 19024170,23968953,30198988, /* 56-63 */
 38048341, 47937906, 60397977, 76096683, 95875813,120795955,                   /* 64-69 */
134217727,134217727,134217727,134217727,134217727,134217727,                   /* 70-75 */
134217727,134217727,134217727,134217727,134217727,134217727,                   /* 76-81 */
};
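/* Example: qp=12 gives lambda = 2^0 = 1, so lambda2 = 1 * .9 * 256 = 230 (rounded),
 * matching the entry above; 134217727 = 2^27-1 is the overflow cap. */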

const uint8_t x264_exp2_lut[64] =
{
      0,   3,   6,   8,  11,  14,  17,  20,  23,  26,  29,  32,  36,  39,  42,  45,
     48,  52,  55,  58,  62,  65,  69,  72,  76,  80,  83,  87,  91,  94,  98, 102,
    106, 110, 114, 118, 122, 126, 130, 135, 139, 143, 147, 152, 156, 161, 165, 170,
    175, 179, 184, 189, 194, 198, 203, 208, 214, 219, 224, 229, 234, 240, 245, 250
};

const float x264_log2_lut[128] =
{
    0.00000, 0.01123, 0.02237, 0.03342, 0.04439, 0.05528, 0.06609, 0.07682,
    0.08746, 0.09803, 0.10852, 0.11894, 0.12928, 0.13955, 0.14975, 0.15987,
    0.16993, 0.17991, 0.18982, 0.19967, 0.20945, 0.21917, 0.22882, 0.23840,
    0.24793, 0.25739, 0.26679, 0.27612, 0.28540, 0.29462, 0.30378, 0.31288,
    0.32193, 0.33092, 0.33985, 0.34873, 0.35755, 0.36632, 0.37504, 0.38370,
    0.39232, 0.40088, 0.40939, 0.41785, 0.42626, 0.43463, 0.44294, 0.45121,
    0.45943, 0.46761, 0.47573, 0.48382, 0.49185, 0.49985, 0.50779, 0.51570,
    0.52356, 0.53138, 0.53916, 0.54689, 0.55459, 0.56224, 0.56986, 0.57743,
    0.58496, 0.59246, 0.59991, 0.60733, 0.61471, 0.62205, 0.62936, 0.63662,
    0.64386, 0.65105, 0.65821, 0.66534, 0.67243, 0.67948, 0.68650, 0.69349,
    0.70044, 0.70736, 0.71425, 0.72110, 0.72792, 0.73471, 0.74147, 0.74819,
    0.75489, 0.76155, 0.76818, 0.77479, 0.78136, 0.78790, 0.79442, 0.80090,
    0.80735, 0.81378, 0.82018, 0.82655, 0.83289, 0.83920, 0.84549, 0.85175,
    0.85798, 0.86419, 0.87036, 0.87652, 0.88264, 0.88874, 0.89482, 0.90087,
    0.90689, 0.91289, 0.91886, 0.92481, 0.93074, 0.93664, 0.94251, 0.94837,
    0.95420, 0.96000, 0.96578, 0.97154, 0.97728, 0.98299, 0.98868, 0.99435,
};

/* Avoid an int/float conversion. */
const float x264_log2_lz_lut[32] =
{
    31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
};
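/* Together these tables implement a fixed-point log2/exp2:
 *   x264_exp2_lut[i]    ~ (2^(i/64) - 1) * 256  (fractional part of an exponential),
 *   x264_log2_lut[i]    =  log2(1 + i/128)      (mantissa contribution),
 *   x264_log2_lz_lut[i] =  31 - i               (integer exponent for a leading-zero
 *                                                count of i, kept as float to avoid
 *                                                the int/float conversion noted above).
 * A full log2(x) is presumably assembled elsewhere as
 * x264_log2_lz_lut[clz(x)] + x264_log2_lut[top 7 mantissa bits of x]. */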

// should the intra and inter lambdas be different?
// I'm just matching the behaviour of deadzone quant.
static const int x264_trellis_lambda2_tab[2][QP_MAX_MAX+1] =
{
    // inter lambda = .85 * .85 * 2**(qp/3. + 10 - LAMBDA_BITS)
    {
               46,       58,       73,       92,      117,      147,
              185,      233,      294,      370,      466,      587,
              740,      932,     1174,     1480,     1864,     2349,
             2959,     3728,     4697,     5918,     7457,     9395,
            11837,    14914,    18790,    23674,    29828,    37581,
            47349,    59656,    75163,    94699,   119313,   150326,
           189399,   238627,   300652,   378798,   477255,   601304,
           757596,   954511,  1202608,  1515192,  1909022,  2405217,
          3030384,  3818045,  4810435,  6060769,  7636091,  9620872,
         12121539, 15272182, 19241743, 24243077, 30544363, 38483486,
         48486154, 61088726, 76966972, 96972308,
        122177453,134217727,134217727,134217727,134217727,134217727,
        134217727,134217727,134217727,134217727,134217727,134217727,
        134217727,134217727,134217727,134217727,134217727,134217727,
    },
    // intra lambda = .65 * .65 * 2**(qp/3. + 10 - LAMBDA_BITS)
    {
               27,       34,       43,       54,       68,       86,
              108,      136,      172,      216,      273,      343,
              433,      545,      687,      865,     1090,     1374,
             1731,     2180,     2747,     3461,     4361,     5494,
             6922,     8721,    10988,    13844,    17442,    21976,
            27688,    34885,    43953,    55377,    69771,    87906,
           110755,   139543,   175813,   221511,   279087,   351627,
           443023,   558174,   703255,   886046,  1116348,  1406511,
          1772093,  2232697,  2813022,  3544186,  4465396,  5626046,
          7088374,  8930791, 11252092, 14176748, 17861583, 22504184,
         28353495, 35723165, 45008368, 56706990,
         71446330, 90016736,113413980,134217727,134217727,134217727,
        134217727,134217727,134217727,134217727,134217727,134217727,
        134217727,134217727,134217727,134217727,134217727,134217727,
    }
};

#define MAX_CHROMA_LAMBDA_OFFSET 36
static const uint16_t x264_chroma_lambda2_offset_tab[MAX_CHROMA_LAMBDA_OFFSET+1] =
{
       16,    20,    25,    32,    40,    50,
       64,    80,   101,   128,   161,   203,
      256,   322,   406,   512,   645,   812,
     1024,  1290,  1625,  2048,  2580,  3250,
     4096,  5160,  6501,  8192, 10321, 13003,
    16384, 20642, 26007, 32768, 41285, 52015,
    65535
};

/* TODO: calculate CABAC costs */
static const uint8_t i_mb_b_cost_table[X264_MBTYPE_MAX] =
{
    9, 9, 9, 9, 0, 0, 0, 1, 3, 7, 7, 7, 3, 7, 7, 7, 5, 9, 0
};
static const uint8_t i_mb_b16x8_cost_table[17] =
{
    0, 0, 0, 0, 0, 0, 0, 0, 5, 7, 7, 7, 5, 7, 9, 9, 9
};
static const uint8_t i_sub_mb_b_cost_table[13] =
{
    7, 5, 5, 3, 7, 5, 7, 3, 7, 7, 7, 5, 1
};
static const uint8_t i_sub_mb_p_cost_table[4] =
{
    5, 3, 3, 1
};

static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a );

static uint16_t x264_cost_ref[QP_MAX+1][3][33];
static UNUSED x264_pthread_mutex_t cost_ref_mutex = X264_PTHREAD_MUTEX_INITIALIZER;
static uint16_t x264_cost_i4x4_mode[(QP_MAX+2)*32];

float *x264_analyse_prepare_costs( x264_t *h )
{
    float *logs = x264_malloc( (2*4*2048+1)*sizeof(float) );
    if( !logs )
        return NULL;
    logs[0] = 0.718f;
    for( int i = 1; i <= 2*4*2048; i++ )
        logs[i] = log2f(i+1)*2 + 1.718f;
    return logs;
}
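/* logs[i] ~ 2*log2(i+1) + 1.718 approximates the bit length of coding a motion-vector
 * delta of magnitude i (in quarter-pel units) with a signed exp-Golomb code, whose
 * length grows as roughly 2*log2(v) + O(1).  The table is QP-independent; it is
 * scaled by lambda per QP in x264_analyse_init_costs() below. */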

int x264_analyse_init_costs( x264_t *h, float *logs, int qp )
{
    int lambda = x264_lambda_tab[qp];
    if( h->cost_mv[qp] )
        return 0;
    /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
    CHECKED_MALLOC( h->cost_mv[qp], (4*4*2048 + 1) * sizeof(uint16_t) );
    h->cost_mv[qp] += 2*4*2048;
    for( int i = 0; i <= 2*4*2048; i++ )
    {
        h->cost_mv[qp][-i] =
        h->cost_mv[qp][i]  = X264_MIN( lambda * logs[i] + .5f, (1<<16)-1 );
    }
    x264_pthread_mutex_lock( &cost_ref_mutex );
    for( int i = 0; i < 3; i++ )
        for( int j = 0; j < 33; j++ )
            x264_cost_ref[qp][i][j] = X264_MIN( i ? lambda * bs_size_te( i, j ) : 0, (1<<16)-1 );
    x264_pthread_mutex_unlock( &cost_ref_mutex );
    if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->cost_mv_fpel[qp][0] )
    {
        for( int j = 0; j < 4; j++ )
        {
            CHECKED_MALLOC( h->cost_mv_fpel[qp][j], (4*2048 + 1) * sizeof(uint16_t) );
            h->cost_mv_fpel[qp][j] += 2*2048;
            for( int i = -2*2048; i < 2*2048; i++ )
                h->cost_mv_fpel[qp][j][i] = h->cost_mv[qp][i*4+j];
        }
    }
    uint16_t *cost_i4x4_mode = (uint16_t*)ALIGN((intptr_t)x264_cost_i4x4_mode,64) + qp*32;
    for( int i = 0; i < 17; i++ )
        cost_i4x4_mode[i] = 3*lambda*(i!=8);
    return 0;
fail:
    return -1;
}
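/* Note the centered-table idiom above: cost_mv[qp] is advanced to the middle of its
 * allocation so it can be indexed directly by signed MV deltas.  In miniature
 * (N is illustrative, not a real constant here):
 *
 *     uint16_t *buf  = x264_malloc( (2*N + 1) * sizeof(uint16_t) );
 *     uint16_t *cost = buf + N;      // cost[-N..N] is now valid
 *     ...
 *     x264_free( cost - N );         // un-offset before freeing
 *
 * x264_analyse_free_costs() below undoes the same offsets before freeing. */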

void x264_analyse_free_costs( x264_t *h )
{
    for( int i = 0; i < QP_MAX+1; i++ )
    {
        if( h->cost_mv[i] )
            x264_free( h->cost_mv[i] - 2*4*2048 );
        if( h->cost_mv_fpel[i][0] )
            for( int j = 0; j < 4; j++ )
                x264_free( h->cost_mv_fpel[i][j] - 2*2048 );
    }
}

void x264_analyse_weight_frame( x264_t *h, int end )
{
    for( int j = 0; j < h->i_ref[0]; j++ )
    {
        if( h->sh.weight[j][0].weightfn )
        {
            x264_frame_t *frame = h->fref[0][j];
            int width = frame->i_width[0] + 2*PADH;
            int i_padv = PADV << PARAM_INTERLACED;
            int offset, height;
            pixel *src = frame->filtered[0][0] - frame->i_stride[0]*i_padv - PADH;
            height = X264_MIN( 16 + end + i_padv, h->fref[0][j]->i_lines[0] + i_padv*2 ) - h->fenc->i_lines_weighted;
            offset = h->fenc->i_lines_weighted*frame->i_stride[0];
            h->fenc->i_lines_weighted += height;
            if( height )
                for( int k = j; k < h->i_ref[0]; k++ )
                    if( h->sh.weight[k][0].weightfn )
                    {
                        pixel *dst = h->fenc->weighted[k] - h->fenc->i_stride[0]*i_padv - PADH;
                        x264_weight_scale_plane( h, dst + offset, frame->i_stride[0],
                                                 src + offset, frame->i_stride[0],
                                                 width, height, &h->sh.weight[k][0] );
                    }
            break;
        }
    }
}

/* load the precomputed per-QP arrays of lambda*nbits for all possible mvs and ref indices */
static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
{
    a->p_cost_mv = h->cost_mv[a->i_qp];
    a->p_cost_ref[0] = x264_cost_ref[a->i_qp][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)];
    a->p_cost_ref[1] = x264_cost_ref[a->i_qp][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)];
}

static void x264_mb_analyse_init_qp( x264_t *h, x264_mb_analysis_t *a, int qp )
{
    int effective_chroma_qp = h->chroma_qp_table[SPEC_QP(qp)] + X264_MAX( qp - QP_MAX_SPEC, 0 );
    a->i_lambda = x264_lambda_tab[qp];
    a->i_lambda2 = x264_lambda2_tab[qp];

    h->mb.b_trellis = h->param.analyse.i_trellis > 1 && a->i_mbrd;
    if( h->param.analyse.i_trellis )
    {
        h->mb.i_trellis_lambda2[0][0] = x264_trellis_lambda2_tab[0][qp];
        h->mb.i_trellis_lambda2[0][1] = x264_trellis_lambda2_tab[1][qp];
        h->mb.i_trellis_lambda2[1][0] = x264_trellis_lambda2_tab[0][effective_chroma_qp];
        h->mb.i_trellis_lambda2[1][1] = x264_trellis_lambda2_tab[1][effective_chroma_qp];
    }
    h->mb.i_psy_rd_lambda = a->i_lambda;
    /* Adjusting chroma lambda based on QP offset hurts PSNR but improves visual quality. */
    int chroma_offset_idx = X264_MIN( qp-effective_chroma_qp+12, MAX_CHROMA_LAMBDA_OFFSET );
    h->mb.i_chroma_lambda2_offset = h->param.analyse.b_psy ? x264_chroma_lambda2_offset_tab[chroma_offset_idx] : 256;

    if( qp > QP_MAX_SPEC )
    {
        h->nr_offset = h->nr_offset_emergency[qp-QP_MAX_SPEC-1];
        h->nr_residual_sum = h->nr_residual_sum_buf[1];
        h->nr_count = h->nr_count_buf[1];
        h->mb.b_noise_reduction = 1;
        qp = QP_MAX_SPEC; /* Out-of-spec QPs are just used for calculating lambda values. */
    }
    else
    {
        h->nr_offset = h->nr_offset_denoise;
        h->nr_residual_sum = h->nr_residual_sum_buf[0];
        h->nr_count = h->nr_count_buf[0];
        h->mb.b_noise_reduction = 0;
    }

    a->i_qp = h->mb.i_qp = qp;
    h->mb.i_chroma_qp = h->chroma_qp_table[qp];
}

static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int qp )
{
    int subme = h->param.analyse.i_subpel_refine - (h->sh.i_type == SLICE_TYPE_B);

    /* mbrd == 1 -> RD mode decision */
    /* mbrd == 2 -> RD refinement */
    /* mbrd == 3 -> QPRD */
    a->i_mbrd = (subme>=6) + (subme>=8) + (h->param.analyse.i_subpel_refine>=10);
    h->mb.b_deblock_rdo = h->param.analyse.i_subpel_refine >= 9 && h->sh.i_disable_deblocking_filter_idc != 1;
    a->b_early_terminate = h->param.analyse.i_subpel_refine < 11;

    x264_mb_analyse_init_qp( h, a, qp );

    h->mb.b_transform_8x8 = 0;

    /* I: Intra part */
    a->i_satd_i16x16 =
    a->i_satd_i8x8   =
    a->i_satd_i4x4   =
    a->i_satd_chroma = COST_MAX;

    /* non-RD PCM decision is inaccurate (as is psy-rd), so don't do it */
    a->i_satd_pcm = !h->mb.i_psy_rd && a->i_mbrd ? ((uint64_t)X264_PCM_COST*a->i_lambda2 + 128) >> 8 : COST_MAX;

    a->b_fast_intra = 0;
    a->b_avoid_topright = 0;
    h->mb.i_skip_intra =
        h->mb.b_lossless ? 0 :
        a->i_mbrd ? 2 :
        !h->param.analyse.i_trellis && !h->param.analyse.i_noise_reduction;

    /* II: Inter part P/B frame */
    if( h->sh.i_type != SLICE_TYPE_I )
    {
        int i_fmv_range = 4 * h->param.analyse.i_mv_range;
        // limit motion search to a slightly smaller range than the theoretical limit,
        // since the search may go a few iterations past its given range
        int i_fpel_border = 6; // umh: 1 for diamond, 2 for octagon, 2 for hpel

        /* Calculate max allowed MV range */
#define CLIP_FMV(mv) x264_clip3( mv, -i_fmv_range, i_fmv_range-1 )
        h->mb.mv_min[0] = 4*( -16*h->mb.i_mb_x - 24 );
        h->mb.mv_max[0] = 4*( 16*( h->mb.i_mb_width - h->mb.i_mb_x - 1 ) + 24 );
        h->mb.mv_min_spel[0] = CLIP_FMV( h->mb.mv_min[0] );
        h->mb.mv_max_spel[0] = CLIP_FMV( h->mb.mv_max[0] );
        if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P )
        {
            int max_x = (h->fref[0][0]->i_pir_end_col * 16 - 3)*4; /* 3 pixels of hpel border */
            int max_mv = max_x - 4*16*h->mb.i_mb_x;
            /* If we're left of the refresh bar, don't reference right of it. */
            if( max_mv > 0 && h->mb.i_mb_x < h->fdec->i_pir_start_col )
                h->mb.mv_max_spel[0] = X264_MIN( h->mb.mv_max_spel[0], max_mv );
        }
        h->mb.mv_min_fpel[0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border;
        h->mb.mv_max_fpel[0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border;
        if( h->mb.i_mb_x == 0 && !(h->mb.i_mb_y & PARAM_INTERLACED) )
        {
            int mb_y = h->mb.i_mb_y >> SLICE_MBAFF;
            int thread_mvy_range = i_fmv_range;

            if( h->i_thread_frames > 1 )
            {
                int pix_y = (h->mb.i_mb_y | PARAM_INTERLACED) * 16;
                int thresh = pix_y + h->param.analyse.i_mv_range_thread;
                for( int i = (h->sh.i_type == SLICE_TYPE_B); i >= 0; i-- )
                    for( int j = 0; j < h->i_ref[i]; j++ )
                    {
                        x264_frame_cond_wait( h->fref[i][j]->orig, thresh );
                        thread_mvy_range = X264_MIN( thread_mvy_range, h->fref[i][j]->orig->i_lines_completed - pix_y );
                    }

                if( h->param.b_deterministic )
                    thread_mvy_range = h->param.analyse.i_mv_range_thread;
                if( PARAM_INTERLACED )
                    thread_mvy_range >>= 1;

                x264_analyse_weight_frame( h, pix_y + thread_mvy_range );
            }

            if( PARAM_INTERLACED )
            {
                /* 0 == top progressive, 1 == bot progressive, 2 == interlaced */
                for( int i = 0; i < 3; i++ )
                {
                    int j = i == 2;
                    mb_y = (h->mb.i_mb_y >> j) + (i == 1);
                    h->mb.mv_miny_row[i] = 4*( -16*mb_y - 24 );
                    h->mb.mv_maxy_row[i] = 4*( 16*( (h->mb.i_mb_height>>j) - mb_y - 1 ) + 24 );
                    h->mb.mv_miny_spel_row[i] = x264_clip3( h->mb.mv_miny_row[i], -i_fmv_range, i_fmv_range );
                    h->mb.mv_maxy_spel_row[i] = CLIP_FMV( h->mb.mv_maxy_row[i] );
                    h->mb.mv_maxy_spel_row[i] = X264_MIN( h->mb.mv_maxy_spel_row[i], thread_mvy_range*4 );
                    h->mb.mv_miny_fpel_row[i] = (h->mb.mv_miny_spel_row[i]>>2) + i_fpel_border;
                    h->mb.mv_maxy_fpel_row[i] = (h->mb.mv_maxy_spel_row[i]>>2) - i_fpel_border;
                }
            }
            else
            {
                h->mb.mv_min[1] = 4*( -16*mb_y - 24 );
                h->mb.mv_max[1] = 4*( 16*( h->mb.i_mb_height - mb_y - 1 ) + 24 );
                h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], -i_fmv_range, i_fmv_range );
                h->mb.mv_max_spel[1] = CLIP_FMV( h->mb.mv_max[1] );
                h->mb.mv_max_spel[1] = X264_MIN( h->mb.mv_max_spel[1], thread_mvy_range*4 );
                h->mb.mv_min_fpel[1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border;
                h->mb.mv_max_fpel[1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border;
            }
        }
        if( PARAM_INTERLACED )
        {
            int i = MB_INTERLACED ? 2 : h->mb.i_mb_y&1;
            h->mb.mv_min[1] = h->mb.mv_miny_row[i];
            h->mb.mv_max[1] = h->mb.mv_maxy_row[i];
            h->mb.mv_min_spel[1] = h->mb.mv_miny_spel_row[i];
            h->mb.mv_max_spel[1] = h->mb.mv_maxy_spel_row[i];
            h->mb.mv_min_fpel[1] = h->mb.mv_miny_fpel_row[i];
            h->mb.mv_max_fpel[1] = h->mb.mv_maxy_fpel_row[i];
        }
#undef CLIP_FMV
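        /* Units in the block above: mv_min/mv_max and the _spel variants are in
         * quarter-pel, the _fpel variants in full-pel; the +/-24-pixel slack past
         * the frame edge lets the search reference into the padded border. */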

        a->l0.me16x16.cost =
        a->l0.i_rd16x16    =
        a->l0.i_cost8x8    =
        a->l0.i_cost16x8   =
        a->l0.i_cost8x16   = COST_MAX;
        if( h->sh.i_type == SLICE_TYPE_B )
        {
            a->l1.me16x16.cost =
            a->l1.i_rd16x16    =
            a->l1.i_cost8x8    =
            a->i_cost8x8direct[0] =
            a->i_cost8x8direct[1] =
            a->i_cost8x8direct[2] =
            a->i_cost8x8direct[3] =
            a->l1.i_cost16x8   =
            a->l1.i_cost8x16   =
            a->i_rd16x16bi     =
            a->i_rd16x16direct =
            a->i_rd8x8bi       =
            a->i_rd16x8bi      =
            a->i_rd8x16bi      =
            a->i_cost16x16bi   =
            a->i_cost16x16direct =
            a->i_cost8x8bi     =
            a->i_cost16x8bi    =
            a->i_cost8x16bi    = COST_MAX;
        }
        else if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
            for( int i = 0; i < 4; i++ )
            {
                a->l0.i_cost4x4[i] =
                a->l0.i_cost8x4[i] =
                a->l0.i_cost4x8[i] = COST_MAX;
            }

        /* Fast intra decision */
        if( a->b_early_terminate && h->mb.i_mb_xy - h->sh.i_first_mb > 4 )
        {
            /* Always run in fast-intra mode for subme < 3 */
            if( h->mb.i_subpel_refine > 2 &&
              ( IS_INTRA( h->mb.i_mb_type_left[0] ) ||
                IS_INTRA( h->mb.i_mb_type_top ) ||
                IS_INTRA( h->mb.i_mb_type_topleft ) ||
                IS_INTRA( h->mb.i_mb_type_topright ) ||
                (h->sh.i_type == SLICE_TYPE_P && IS_INTRA( h->fref[0][0]->mb_type[h->mb.i_mb_xy] )) ||
                (h->mb.i_mb_xy - h->sh.i_first_mb < 3*(h->stat.frame.i_mb_count[I_4x4] + h->stat.frame.i_mb_count[I_8x8] + h->stat.frame.i_mb_count[I_16x16])) ) )
            { /* intra is likely */ }
            else
            {
                a->b_fast_intra = 1;
            }
        }
        h->mb.b_skip_mc = 0;
        if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P &&
            h->mb.i_mb_x >= h->fdec->i_pir_start_col && h->mb.i_mb_x <= h->fdec->i_pir_end_col )
        {
            a->b_force_intra = 1;
            a->b_fast_intra = 0;
            a->b_avoid_topright = h->mb.i_mb_x == h->fdec->i_pir_end_col;
        }
        else
            a->b_force_intra = 0;
    }
}

/* Prediction modes allowed for various combinations of neighbors. */
/* Terminated by a -1. */
/* In order, no neighbors, left, top, top/left, top/left/topleft */
static const int8_t i16x16_mode_available[5][5] =
{
    {I_PRED_16x16_DC_128, -1, -1, -1, -1},
    {I_PRED_16x16_DC_LEFT, I_PRED_16x16_H, -1, -1, -1},
    {I_PRED_16x16_DC_TOP, I_PRED_16x16_V, -1, -1, -1},
    {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, -1, -1},
    {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, I_PRED_16x16_P, -1},
};

static const int8_t chroma_mode_available[5][5] =
{
    {I_PRED_CHROMA_DC_128, -1, -1, -1, -1},
    {I_PRED_CHROMA_DC_LEFT, I_PRED_CHROMA_H, -1, -1, -1},
    {I_PRED_CHROMA_DC_TOP, I_PRED_CHROMA_V, -1, -1, -1},
    {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, -1, -1},
    {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, I_PRED_CHROMA_P, -1},
};

static const int8_t i4x4_mode_available[2][5][10] =
{
    {
        {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC_TOP, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, I_PRED_4x4_HU, -1, -1, -1, -1},
        {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_VL, I_PRED_4x4_HU, -1},
    },
    {
        {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC_TOP, I_PRED_4x4_V, -1, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_HU, -1, -1, -1},
    }
};

static ALWAYS_INLINE const int8_t *predict_16x16_mode_available( int i_neighbour )
{
    int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
    idx = (idx == (MB_TOP|MB_LEFT|MB_TOPLEFT)) ? 4 : idx & (MB_TOP|MB_LEFT);
    return i16x16_mode_available[idx];
}

static ALWAYS_INLINE const int8_t *predict_chroma_mode_available( int i_neighbour )
{
    int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
    idx = (idx == (MB_TOP|MB_LEFT|MB_TOPLEFT)) ? 4 : idx & (MB_TOP|MB_LEFT);
    return chroma_mode_available[idx];
}

static ALWAYS_INLINE const int8_t *predict_8x8_mode_available( int force_intra, int i_neighbour, int i )
{
    int avoid_topright = force_intra && (i&1);
    int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
    idx = (idx == (MB_TOP|MB_LEFT|MB_TOPLEFT)) ? 4 : idx & (MB_TOP|MB_LEFT);
    return i4x4_mode_available[avoid_topright][idx];
}

static ALWAYS_INLINE const int8_t *predict_4x4_mode_available( int force_intra, int i_neighbour, int i )
{
    int avoid_topright = force_intra && ((i&5) == 5);
    int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
    idx = (idx == (MB_TOP|MB_LEFT|MB_TOPLEFT)) ? 4 : idx & (MB_TOP|MB_LEFT);
    return i4x4_mode_available[avoid_topright][idx];
}
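/* In the helpers above, idx maps the neighbour bitmask onto the five table rows
 * (none, left, top, top+left, top+left+topleft): rows 0-3 come straight from the
 * MB_LEFT|MB_TOP bits (this relies on MB_LEFT being 0x1 and MB_TOP 0x2), and row 4
 * is used only when all three neighbours exist, since the planar and diagonal
 * modes in that row need the topleft samples as well. */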

/* For trellis=2 we need to do this for both DCT sizes; for trellis=1 we only need it for the chosen mode. */
static inline void x264_psy_trellis_init( x264_t *h, int do_both_dct )
{
    ALIGNED_16( static pixel zero[16*FDEC_STRIDE] ) = {0};

    if( do_both_dct || h->mb.b_transform_8x8 )
        h->dctf.sub16x16_dct8( h->mb.pic.fenc_dct8, h->mb.pic.p_fenc[0], zero );
    if( do_both_dct || !h->mb.b_transform_8x8 )
        h->dctf.sub16x16_dct( h->mb.pic.fenc_dct4, h->mb.pic.p_fenc[0], zero );
}
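/* Subtracting an all-zero block makes sub16x16_dct(8) return the raw transform of
 * the source pixels; psy-trellis later compares candidate coefficients against
 * these cached values. */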

/* Reset fenc satd scores cache for psy RD */
static inline void x264_mb_init_fenc_cache( x264_t *h, int b_satd )
{
    if( h->param.analyse.i_trellis == 2 && h->mb.i_psy_trellis )
        x264_psy_trellis_init( h, h->param.analyse.b_transform_8x8 );
    if( !h->mb.i_psy_rd )
        return;
    /* Writes beyond the end of the array, but not a problem since fenc_satd_cache is right after. */
    h->mc.memzero_aligned( h->mb.pic.fenc_hadamard_cache, sizeof(h->mb.pic.fenc_hadamard_cache) );
    if( b_satd )
        h->mc.memzero_aligned( h->mb.pic.fenc_satd_cache, sizeof(h->mb.pic.fenc_satd_cache) );
}

static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
{
    if( a->i_satd_chroma < COST_MAX )
        return;

    if( CHROMA444 )
    {
        if( !h->mb.b_chroma_me )
        {
            a->i_satd_chroma = 0;
            return;
        }

        /* Cheap approximation of chroma costs to avoid a full i4x4/i8x8 analysis. */
        if( h->mb.b_lossless )
        {
            x264_predict_lossless_16x16( h, 1, a->i_predict16x16 );
            x264_predict_lossless_16x16( h, 2, a->i_predict16x16 );
        }
        else
        {
            h->predict_16x16[a->i_predict16x16]( h->mb.pic.p_fdec[1] );
            h->predict_16x16[a->i_predict16x16]( h->mb.pic.p_fdec[2] );
        }
        a->i_satd_chroma = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE )
                         + h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE );
        return;
    }

    const int8_t *predict_mode = predict_chroma_mode_available( h->mb.i_neighbour_intra );
    int chromapix = h->luma2chroma_pixel[PIXEL_16x16];

    /* Prediction selection for chroma */
    if( predict_mode[3] >= 0 && !h->mb.b_lossless )
    {
        int satdu[4], satdv[4];
        h->pixf.intra_mbcmp_x3_chroma( h->mb.pic.p_fenc[1], h->mb.pic.p_fdec[1], satdu );
        h->pixf.intra_mbcmp_x3_chroma( h->mb.pic.p_fenc[2], h->mb.pic.p_fdec[2], satdv );
        h->predict_chroma[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[1] );
        h->predict_chroma[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[2] );
        satdu[I_PRED_CHROMA_P] = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE );
        satdv[I_PRED_CHROMA_P] = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE );

        for( ; *predict_mode >= 0; predict_mode++ )
        {
            int i_mode = *predict_mode;
            int i_satd = satdu[i_mode] + satdv[i_mode] + a->i_lambda * bs_size_ue( i_mode );

            a->i_satd_chroma_dir[i_mode] = i_satd;
            COPY2_IF_LT( a->i_satd_chroma, i_satd, a->i_predict8x8chroma, i_mode );
        }
    }
    else
    {
        for( ; *predict_mode >= 0; predict_mode++ )
        {
            int i_satd;
            int i_mode = *predict_mode;

            /* we do the prediction */
            if( h->mb.b_lossless )
                x264_predict_lossless_chroma( h, i_mode );
            else
            {
                h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] );
                h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] );
            }

            /* we calculate the cost */
            i_satd = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE ) +
                     h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE ) +
                     a->i_lambda * bs_size_ue( x264_mb_chroma_pred_mode_fix[i_mode] );

            a->i_satd_chroma_dir[i_mode] = i_satd;
            COPY2_IF_LT( a->i_satd_chroma, i_satd, a->i_predict8x8chroma, i_mode );
        }
    }

    h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
}

/* FIXME: should we do any sort of merged chroma analysis with 4:4:4? */
static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
{
    const unsigned int flags = h->sh.i_type == SLICE_TYPE_I ? h->param.analyse.intra : h->param.analyse.inter;
    pixel *p_src = h->mb.pic.p_fenc[0];
    pixel *p_dst = h->mb.pic.p_fdec[0];
    static const int8_t intra_analysis_shortcut[2][2][2][5] =
    {
        {{{I_PRED_4x4_HU, -1, -1, -1, -1},
          {I_PRED_4x4_DDL, I_PRED_4x4_VL, -1, -1, -1}},
         {{I_PRED_4x4_DDR, I_PRED_4x4_HD, I_PRED_4x4_HU, -1, -1},
          {I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_VL, -1}}},
        {{{I_PRED_4x4_HU, -1, -1, -1, -1},
          {-1, -1, -1, -1, -1}},
         {{I_PRED_4x4_DDR, I_PRED_4x4_HD, I_PRED_4x4_HU, -1, -1},
          {I_PRED_4x4_DDR, I_PRED_4x4_VR, -1, -1, -1}}},
    };

    int idx;
    int lambda = a->i_lambda;

    /*---------------- Try all modes and calculate their scores ---------------*/

    /* 16x16 prediction selection */
    const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );

    /* Not heavily tuned */
    static const uint8_t i16x16_thresh_lut[11] = { 2, 2, 2, 3, 3, 4, 4, 4, 4, 4, 4 };
    int i16x16_thresh = a->b_fast_intra ? (i16x16_thresh_lut[h->mb.i_subpel_refine]*i_satd_inter)>>1 : COST_MAX;

    if( !h->mb.b_lossless && predict_mode[3] >= 0 )
    {
        h->pixf.intra_mbcmp_x3_16x16( p_src, p_dst, a->i_satd_i16x16_dir );
        a->i_satd_i16x16_dir[0] += lambda * bs_size_ue(0);
        a->i_satd_i16x16_dir[1] += lambda * bs_size_ue(1);
        a->i_satd_i16x16_dir[2] += lambda * bs_size_ue(2);
        COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[0], a->i_predict16x16, 0 );
        COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[1], a->i_predict16x16, 1 );
        COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[2], a->i_predict16x16, 2 );

        /* Plane is expensive, so don't check it unless one of the previous modes was useful. */
        if( a->i_satd_i16x16 <= i16x16_thresh )
        {
            h->predict_16x16[I_PRED_16x16_P]( p_dst );
            a->i_satd_i16x16_dir[I_PRED_16x16_P] = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
            a->i_satd_i16x16_dir[I_PRED_16x16_P] += lambda * bs_size_ue(3);
            COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[I_PRED_16x16_P], a->i_predict16x16, 3 );
        }
    }
    else
    {
        for( ; *predict_mode >= 0; predict_mode++ )
        {
            int i_satd;
            int i_mode = *predict_mode;

            if( h->mb.b_lossless )
                x264_predict_lossless_16x16( h, 0, i_mode );
            else
                h->predict_16x16[i_mode]( p_dst );

            i_satd = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ) +
                     lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] );
            COPY2_IF_LT( a->i_satd_i16x16, i_satd, a->i_predict16x16, i_mode );
            a->i_satd_i16x16_dir[i_mode] = i_satd;
        }
    }

    if( h->sh.i_type == SLICE_TYPE_B )
        /* cavlc mb type prefix */
        a->i_satd_i16x16 += lambda * i_mb_b_cost_table[I_16x16];

    if( a->i_satd_i16x16 > i16x16_thresh )
        return;

    uint16_t *cost_i4x4_mode = (uint16_t*)ALIGN((intptr_t)x264_cost_i4x4_mode,64) + a->i_qp*32 + 8;
    /* 8x8 prediction selection */
    if( flags & X264_ANALYSE_I8x8 )
    {
        ALIGNED_ARRAY_32( pixel, edge,[36] );
        x264_pixel_cmp_t sa8d = (h->pixf.mbcmp[0] == h->pixf.satd[0]) ? h->pixf.sa8d[PIXEL_8x8] : h->pixf.mbcmp[PIXEL_8x8];
        int i_satd_thresh = a->i_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 );

        // FIXME some bias like in i4x4?
        int i_cost = lambda * 4; /* base predmode costs */
        h->mb.i_cbp_luma = 0;

        if( h->sh.i_type == SLICE_TYPE_B )
            i_cost += lambda * i_mb_b_cost_table[I_8x8];

        for( idx = 0;; idx++ )
        {
            int x = idx&1;
            int y = idx>>1;
            pixel *p_src_by = p_src + 8*x + 8*y*FENC_STRIDE;
            pixel *p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
            int i_best = COST_MAX;
            int i_pred_mode = x264_mb_predict_intra4x4_mode( h, 4*idx );

            predict_mode = predict_8x8_mode_available( a->b_avoid_topright, h->mb.i_neighbour8[idx], idx );
            h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );

            if( h->pixf.intra_mbcmp_x9_8x8 && predict_mode[8] >= 0 )
            {
                /* No shortcuts here. The SSSE3 implementation of intra_mbcmp_x9 is fast enough. */
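                /* intra_mbcmp_x9 packs its return value as (best_mode << 16) | best_cost;
                 * it is unpacked immediately below. */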
                i_best = h->pixf.intra_mbcmp_x9_8x8( p_src_by, p_dst_by, edge, cost_i4x4_mode-i_pred_mode, a->i_satd_i8x8_dir[idx] );
                i_cost += i_best & 0xffff;
                i_best >>= 16;
                a->i_predict8x8[idx] = i_best;
                if( idx == 3 || i_cost > i_satd_thresh )
                    break;
                x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, i_best );
            }
            else
            {
                if( !h->mb.b_lossless && predict_mode[5] >= 0 )
                {
                    int satd[9];
                    h->pixf.intra_mbcmp_x3_8x8( p_src_by, edge, satd );
                    int favor_vertical = satd[I_PRED_4x4_H] > satd[I_PRED_4x4_V];
                    satd[i_pred_mode] -= 3 * lambda;
                    for( int i = 2; i >= 0; i-- )
                    {
                        int cost = satd[i];
                        a->i_satd_i8x8_dir[idx][i] = cost + 4 * lambda;
                        COPY2_IF_LT( i_best, cost, a->i_predict8x8[idx], i );
                    }

                    /* Take analysis shortcuts: don't analyse modes that are too
                     * far away direction-wise from the favored mode. */
                    if( a->i_mbrd < 1 + a->b_fast_intra )
                        predict_mode = intra_analysis_shortcut[a->b_avoid_topright][predict_mode[8] >= 0][favor_vertical];
                    else
                        predict_mode += 3;
                }

                for( ; *predict_mode >= 0 && (i_best >= 0 || a->i_mbrd >= 2); predict_mode++ )
                {
                    int i_satd;
                    int i_mode = *predict_mode;

                    if( h->mb.b_lossless )
                        x264_predict_lossless_8x8( h, p_dst_by, 0, idx, i_mode, edge );
                    else
                        h->predict_8x8[i_mode]( p_dst_by, edge );

                    i_satd = sa8d( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE );
                    if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
                        i_satd -= 3 * lambda;

                    COPY2_IF_LT( i_best, i_satd, a->i_predict8x8[idx], i_mode );
                    a->i_satd_i8x8_dir[idx][i_mode] = i_satd + 4 * lambda;
                }
                i_cost += i_best + 3*lambda;

                if( idx == 3 || i_cost > i_satd_thresh )
                    break;
                if( h->mb.b_lossless )
                    x264_predict_lossless_8x8( h, p_dst_by, 0, idx, a->i_predict8x8[idx], edge );
                else
                    h->predict_8x8[a->i_predict8x8[idx]]( p_dst_by, edge );
                x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
            }
            /* we need to encode this block now (its reconstruction predicts later blocks) */
            x264_mb_encode_i8x8( h, 0, idx, a->i_qp, a->i_predict8x8[idx], edge, 0 );
        }

        if( idx == 3 )
        {
            a->i_satd_i8x8 = i_cost;
            if( h->mb.i_skip_intra )
            {
                h->mc.copy[PIXEL_16x16]( h->mb.pic.i8x8_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
                h->mb.pic.i8x8_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
                h->mb.pic.i8x8_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
                h->mb.pic.i8x8_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
                h->mb.pic.i8x8_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
                h->mb.pic.i8x8_cbp = h->mb.i_cbp_luma;
                if( h->mb.i_skip_intra == 2 )
                    h->mc.memcpy_aligned( h->mb.pic.i8x8_dct_buf, h->dct.luma8x8, sizeof(h->mb.pic.i8x8_dct_buf) );
            }
        }
        else
        {
            static const uint16_t cost_div_fix8[3] = {1024,512,341};
            a->i_satd_i8x8 = COST_MAX;
            i_cost = (i_cost * cost_div_fix8[idx]) >> 8;
        }
        /* Not heavily tuned */
        static const uint8_t i8x8_thresh[11] = { 4, 4, 4, 5, 5, 5, 6, 6, 6, 6, 6 };
        if( a->b_early_terminate && X264_MIN(i_cost, a->i_satd_i16x16) > (i_satd_inter*i8x8_thresh[h->mb.i_subpel_refine])>>2 )
            return;
    }

    /* 4x4 prediction selection */
    if( flags & X264_ANALYSE_I4x4 )
    {
        int i_cost = lambda * (24+16); /* 24 from JVT (SATD0), 16 from base predmode costs */
        int i_satd_thresh = a->b_early_terminate ? X264_MIN3( i_satd_inter, a->i_satd_i16x16, a->i_satd_i8x8 ) : COST_MAX;
        h->mb.i_cbp_luma = 0;

        if( a->b_early_terminate && a->i_mbrd )
            i_satd_thresh = i_satd_thresh * (10-a->b_fast_intra)/8;

        if( h->sh.i_type == SLICE_TYPE_B )
            i_cost += lambda * i_mb_b_cost_table[I_4x4];

        for( idx = 0;; idx++ )
        {
            pixel *p_src_by = p_src + block_idx_xy_fenc[idx];
            pixel *p_dst_by = p_dst + block_idx_xy_fdec[idx];
            int i_best = COST_MAX;
            int i_pred_mode = x264_mb_predict_intra4x4_mode( h, idx );

            predict_mode = predict_4x4_mode_available( a->b_avoid_topright, h->mb.i_neighbour4[idx], idx );

            if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
                /* emulate missing topright samples */
                MPIXEL_X4( &p_dst_by[4 - FDEC_STRIDE] ) = PIXEL_SPLAT_X4( p_dst_by[3 - FDEC_STRIDE] );

            if( h->pixf.intra_mbcmp_x9_4x4 && predict_mode[8] >= 0 )
            {
                /* No shortcuts here. The SSSE3 implementation of intra_mbcmp_x9 is fast enough. */
                i_best = h->pixf.intra_mbcmp_x9_4x4( p_src_by, p_dst_by, cost_i4x4_mode-i_pred_mode );
                i_cost += i_best & 0xffff;
                i_best >>= 16;
                a->i_predict4x4[idx] = i_best;
                if( i_cost > i_satd_thresh || idx == 15 )
                    break;
                h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = i_best;
            }
            else
            {
                if( !h->mb.b_lossless && predict_mode[5] >= 0 )
                {
                    int satd[9];
                    h->pixf.intra_mbcmp_x3_4x4( p_src_by, p_dst_by, satd );
                    int favor_vertical = satd[I_PRED_4x4_H] > satd[I_PRED_4x4_V];
                    satd[i_pred_mode] -= 3 * lambda;
                    i_best = satd[I_PRED_4x4_DC]; a->i_predict4x4[idx] = I_PRED_4x4_DC;
                    COPY2_IF_LT( i_best, satd[I_PRED_4x4_H], a->i_predict4x4[idx], I_PRED_4x4_H );
                    COPY2_IF_LT( i_best, satd[I_PRED_4x4_V], a->i_predict4x4[idx], I_PRED_4x4_V );

                    /* Take analysis shortcuts: don't analyse modes that are too
                     * far away direction-wise from the favored mode. */
                    if( a->i_mbrd < 1 + a->b_fast_intra )
                        predict_mode = intra_analysis_shortcut[a->b_avoid_topright][predict_mode[8] >= 0][favor_vertical];
                    else
                        predict_mode += 3;
                }

                if( i_best > 0 )
                {
                    for( ; *predict_mode >= 0; predict_mode++ )
                    {
                        int i_satd;
                        int i_mode = *predict_mode;

                        if( h->mb.b_lossless )
                            x264_predict_lossless_4x4( h, p_dst_by, 0, idx, i_mode );
                        else
                            h->predict_4x4[i_mode]( p_dst_by );

                        i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE );
                        if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
                        {
                            i_satd -= lambda * 3;
                            if( i_satd <= 0 )
                            {
                                i_best = i_satd;
                                a->i_predict4x4[idx] = i_mode;
                                break;
                            }
                        }

                        COPY2_IF_LT( i_best, i_satd, a->i_predict4x4[idx], i_mode );
                    }
                }

                i_cost += i_best + 3 * lambda;
                if( i_cost > i_satd_thresh || idx == 15 )
                    break;
                if( h->mb.b_lossless )
                    x264_predict_lossless_4x4( h, p_dst_by, 0, idx, a->i_predict4x4[idx] );
                else
                    h->predict_4x4[a->i_predict4x4[idx]]( p_dst_by );
                h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
            }
            /* we need to encode this block now (its reconstruction predicts later blocks) */
            x264_mb_encode_i4x4( h, 0, idx, a->i_qp, a->i_predict4x4[idx], 0 );
        }
        if( idx == 15 )
        {
            a->i_satd_i4x4 = i_cost;
            if( h->mb.i_skip_intra )
            {
                h->mc.copy[PIXEL_16x16]( h->mb.pic.i4x4_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
                h->mb.pic.i4x4_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
                h->mb.pic.i4x4_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
                h->mb.pic.i4x4_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
                h->mb.pic.i4x4_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
                h->mb.pic.i4x4_cbp = h->mb.i_cbp_luma;
                if( h->mb.i_skip_intra == 2 )
                    h->mc.memcpy_aligned( h->mb.pic.i4x4_dct_buf, h->dct.luma4x4, sizeof(h->mb.pic.i4x4_dct_buf) );
            }
        }
        else
            a->i_satd_i4x4 = COST_MAX;
    }
}

static void x264_intra_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_thresh )
{
    if( !a->b_early_terminate )
        i_satd_thresh = COST_MAX;

    if( a->i_satd_i16x16 < i_satd_thresh )
    {
        h->mb.i_type = I_16x16;
        x264_analyse_update_cache( h, a );
        a->i_satd_i16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
    }
    else
        a->i_satd_i16x16 = COST_MAX;

    if( a->i_satd_i4x4 < i_satd_thresh )
    {
        h->mb.i_type = I_4x4;
        x264_analyse_update_cache( h, a );
        a->i_satd_i4x4 = x264_rd_cost_mb( h, a->i_lambda2 );
    }
    else
        a->i_satd_i4x4 = COST_MAX;

    if( a->i_satd_i8x8 < i_satd_thresh )
    {
        h->mb.i_type = I_8x8;
        x264_analyse_update_cache( h, a );
        a->i_satd_i8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
        a->i_cbp_i8x8_luma = h->mb.i_cbp_luma;
    }
    else
        a->i_satd_i8x8 = COST_MAX;
}

static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
{
    uint64_t i_satd, i_best;
    int plane_count = CHROMA444 ? 3 : 1;
    h->mb.i_skip_intra = 0;

    if( h->mb.i_type == I_16x16 )
    {
        int old_pred_mode = a->i_predict16x16;
        const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );
        int i_thresh = a->b_early_terminate ? a->i_satd_i16x16_dir[old_pred_mode] * 9/8 : COST_MAX;
        i_best = a->i_satd_i16x16;
        for( ; *predict_mode >= 0; predict_mode++ )
        {
            int i_mode = *predict_mode;
            if( i_mode == old_pred_mode || a->i_satd_i16x16_dir[i_mode] > i_thresh )
                continue;
            h->mb.i_intra16x16_pred_mode = i_mode;
            i_satd = x264_rd_cost_mb( h, a->i_lambda2 );
            COPY2_IF_LT( i_best, i_satd, a->i_predict16x16, i_mode );
        }
    }

    /* RD selection for chroma prediction */
    if( !CHROMA444 )
    {
        const int8_t *predict_mode = predict_chroma_mode_available( h->mb.i_neighbour_intra );
        if( predict_mode[1] >= 0 )
        {
            int8_t predict_mode_sorted[4];
            int i_max;
            int i_thresh = a->b_early_terminate ? a->i_satd_chroma * 5/4 : COST_MAX;

            for( i_max = 0; *predict_mode >= 0; predict_mode++ )
            {
                int i_mode = *predict_mode;
                if( a->i_satd_chroma_dir[i_mode] < i_thresh && i_mode != a->i_predict8x8chroma )
                    predict_mode_sorted[i_max++] = i_mode;
            }

            if( i_max > 0 )
            {
                int i_cbp_chroma_best = h->mb.i_cbp_chroma;
                int i_chroma_lambda = x264_lambda2_tab[h->mb.i_chroma_qp];
                /* the previous thing encoded was x264_intra_rd(), so the pixels and
                 * coefs for the current chroma mode are still around; we only have
                 * to recount the bits. */
                i_best = x264_rd_cost_chroma( h, i_chroma_lambda, a->i_predict8x8chroma, 0 );
                for( int i = 0; i < i_max; i++ )
                {
                    int i_mode = predict_mode_sorted[i];
                    if( h->mb.b_lossless )
                        x264_predict_lossless_chroma( h, i_mode );
                    else
                    {
                        h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] );
                        h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] );
                    }
                    /* if we've already found a mode that needs no residual, then
                     * probably any mode with a residual will be worse.
                     * so avoid dct on the remaining modes to improve speed. */
                    i_satd = x264_rd_cost_chroma( h, i_chroma_lambda, i_mode, h->mb.i_cbp_chroma != 0x00 );
                    COPY3_IF_LT( i_best, i_satd, a->i_predict8x8chroma, i_mode, i_cbp_chroma_best, h->mb.i_cbp_chroma );
                }
                h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
                h->mb.i_cbp_chroma = i_cbp_chroma_best;
            }
        }
    }

    if( h->mb.i_type == I_4x4 )
    {
        pixel4 pels[3][4] = {{0}}; // doesn't need initting, just shuts up a gcc warning
        int nnz[3] = {0};
        for( int idx = 0; idx < 16; idx++ )
        {
            pixel *dst[3] = {h->mb.pic.p_fdec[0] + block_idx_xy_fdec[idx],
                             h->mb.pic.p_fdec[1] + block_idx_xy_fdec[idx],
                             h->mb.pic.p_fdec[2] + block_idx_xy_fdec[idx]};
            i_best = COST_MAX64;

            const int8_t *predict_mode = predict_4x4_mode_available( a->b_avoid_topright, h->mb.i_neighbour4[idx], idx );

            if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
                for( int p = 0; p < plane_count; p++ )
                    /* emulate missing topright samples */
                    MPIXEL_X4( dst[p]+4-FDEC_STRIDE ) = PIXEL_SPLAT_X4( dst[p][3-FDEC_STRIDE] );

            for( ; *predict_mode >= 0; predict_mode++ )
            {
                int i_mode = *predict_mode;
                i_satd = x264_rd_cost_i4x4( h, a->i_lambda2, idx, i_mode );

                if( i_best > i_satd )
                {
                    a->i_predict4x4[idx] = i_mode;
                    i_best = i_satd;
                    for( int p = 0; p < plane_count; p++ )
                    {
                        pels[p][0] = MPIXEL_X4( dst[p]+0*FDEC_STRIDE );
                        pels[p][1] = MPIXEL_X4( dst[p]+1*FDEC_STRIDE );
                        pels[p][2] = MPIXEL_X4( dst[p]+2*FDEC_STRIDE );
                        pels[p][3] = MPIXEL_X4( dst[p]+3*FDEC_STRIDE );
                        nnz[p] = h->mb.cache.non_zero_count[x264_scan8[idx+p*16]];
                    }
                }
            }

            for( int p = 0; p < plane_count; p++ )
            {
                MPIXEL_X4( dst[p]+0*FDEC_STRIDE ) = pels[p][0];
                MPIXEL_X4( dst[p]+1*FDEC_STRIDE ) = pels[p][1];
                MPIXEL_X4( dst[p]+2*FDEC_STRIDE ) = pels[p][2];
                MPIXEL_X4( dst[p]+3*FDEC_STRIDE ) = pels[p][3];
                h->mb.cache.non_zero_count[x264_scan8[idx+p*16]] = nnz[p];
            }

            h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
        }
    }
    else if( h->mb.i_type == I_8x8 )
    {
        ALIGNED_ARRAY_32( pixel, edge,[4],[32] ); // really [3][36], but they can overlap
        pixel4 pels_h[3][2] = {{0}};
        pixel pels_v[3][7] = {{0}};
        uint16_t nnz[3][2] = {{0}}; //shut up gcc
        for( int idx = 0; idx < 4; idx++ )
        {
            int x = idx&1;
            int y = idx>>1;
            int s8 = X264_SCAN8_0 + 2*x + 16*y;
            pixel *dst[3] = {h->mb.pic.p_fdec[0] + 8*x + 8*y*FDEC_STRIDE,
                             h->mb.pic.p_fdec[1] + 8*x + 8*y*FDEC_STRIDE,
                             h->mb.pic.p_fdec[2] + 8*x + 8*y*FDEC_STRIDE};
            int cbp_luma_new = 0;
            int i_thresh = a->b_early_terminate ? a->i_satd_i8x8_dir[idx][a->i_predict8x8[idx]] * 11/8 : COST_MAX;

            i_best = COST_MAX64;

            const int8_t *predict_mode = predict_8x8_mode_available( a->b_avoid_topright, h->mb.i_neighbour8[idx], idx );
            for( int p = 0; p < plane_count; p++ )
                h->predict_8x8_filter( dst[p], edge[p], h->mb.i_neighbour8[idx], ALL_NEIGHBORS );

            for( ; *predict_mode >= 0; predict_mode++ )
            {
                int i_mode = *predict_mode;
                if( a->i_satd_i8x8_dir[idx][i_mode] > i_thresh )
                    continue;

                h->mb.i_cbp_luma = a->i_cbp_i8x8_luma;
                i_satd = x264_rd_cost_i8x8( h, a->i_lambda2, idx, i_mode, edge );

                if( i_best > i_satd )
                {
                    a->i_predict8x8[idx] = i_mode;
                    cbp_luma_new = h->mb.i_cbp_luma;
                    i_best = i_satd;

                    for( int p = 0; p < plane_count; p++ )
                    {
                        pels_h[p][0] = MPIXEL_X4( dst[p]+7*FDEC_STRIDE+0 );
                        pels_h[p][1] = MPIXEL_X4( dst[p]+7*FDEC_STRIDE+4 );
                        if( !(idx&1) )
                            for( int j = 0; j < 7; j++ )
                                pels_v[p][j] = dst[p][7+j*FDEC_STRIDE];
                        nnz[p][0] = M16( &h->mb.cache.non_zero_count[s8 + 0*8 + p*16] );
                        nnz[p][1] = M16( &h->mb.cache.non_zero_count[s8 + 1*8 + p*16] );
                    }
                }
            }
            a->i_cbp_i8x8_luma = cbp_luma_new;
            for( int p = 0; p < plane_count; p++ )
            {
                MPIXEL_X4( dst[p]+7*FDEC_STRIDE+0 ) = pels_h[p][0];
                MPIXEL_X4( dst[p]+7*FDEC_STRIDE+4 ) = pels_h[p][1];
                if( !(idx&1) )
                    for( int j = 0; j < 7; j++ )
                        dst[p][7+j*FDEC_STRIDE] = pels_v[p][j];
                M16( &h->mb.cache.non_zero_count[s8 + 0*8 + p*16] ) = nnz[p][0];
                M16( &h->mb.cache.non_zero_count[s8 + 1*8 + p*16] ) = nnz[p][1];
            }

            x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
        }
    }
}

#define LOAD_FENC(m, src, xoff, yoff) \
{ \
    (m)->p_cost_mv = a->p_cost_mv; \
    (m)->i_stride[0] = h->mb.pic.i_stride[0]; \
    (m)->i_stride[1] = h->mb.pic.i_stride[1]; \
    (m)->i_stride[2] = h->mb.pic.i_stride[2]; \
    (m)->p_fenc[0] = &(src)[0][(xoff)+(yoff)*FENC_STRIDE]; \
    (m)->p_fenc[1] = &(src)[1][((xoff)>>CHROMA_H_SHIFT)+((yoff)>>CHROMA_V_SHIFT)*FENC_STRIDE]; \
    (m)->p_fenc[2] = &(src)[2][((xoff)>>CHROMA_H_SHIFT)+((yoff)>>CHROMA_V_SHIFT)*FENC_STRIDE]; \
}

#define LOAD_HPELS(m, src, list, ref, xoff, yoff) \
{ \
    (m)->p_fref_w = (m)->p_fref[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[1] = &(src)[1][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[2] = &(src)[2][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[3] = &(src)[3][(xoff)+(yoff)*(m)->i_stride[0]]; \
    if( CHROMA444 ) \
    { \
        (m)->p_fref[ 4] = &(src)[ 4][(xoff)+(yoff)*(m)->i_stride[1]]; \
        (m)->p_fref[ 5] = &(src)[ 5][(xoff)+(yoff)*(m)->i_stride[1]]; \
        (m)->p_fref[ 6] = &(src)[ 6][(xoff)+(yoff)*(m)->i_stride[1]]; \
        (m)->p_fref[ 7] = &(src)[ 7][(xoff)+(yoff)*(m)->i_stride[1]]; \
        (m)->p_fref[ 8] = &(src)[ 8][(xoff)+(yoff)*(m)->i_stride[2]]; \
        (m)->p_fref[ 9] = &(src)[ 9][(xoff)+(yoff)*(m)->i_stride[2]]; \
        (m)->p_fref[10] = &(src)[10][(xoff)+(yoff)*(m)->i_stride[2]]; \
        (m)->p_fref[11] = &(src)[11][(xoff)+(yoff)*(m)->i_stride[2]]; \
    } \
    else \
        (m)->p_fref[4] = &(src)[4][(xoff)+((yoff)>>CHROMA_V_SHIFT)*(m)->i_stride[1]]; \
    (m)->integral = &h->mb.pic.p_integral[list][ref][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->weight = x264_weight_none; \
    (m)->i_ref = ref; \
}
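/* For LOAD_HPELS, src[0..3] are the fullpel luma plane plus its three half-pel
 * interpolated versions (H, V, HV).  With 4:4:4, planes 4-11 presumably hold the
 * same four filtered planes for each chroma channel; otherwise plane 4 is the
 * single interleaved chroma plane. */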

#define LOAD_WPELS(m, src, list, ref, xoff, yoff) \
    (m)->p_fref_w = &(src)[(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->weight = h->sh.weight[i_ref];

#define REF_COST(list, ref) \
    (a->p_cost_ref[list][ref])

static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
{
    x264_me_t m;
    int i_mvc;
    ALIGNED_4( int16_t mvc[8][2] );
    int i_halfpel_thresh = INT_MAX;
    int *p_halfpel_thresh = (a->b_early_terminate && h->mb.pic.i_fref[0]>1) ? &i_halfpel_thresh : NULL;

    /* 16x16 search on all ref frames */
    m.i_pixel = PIXEL_16x16;
    LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );

    a->l0.me16x16.cost = INT_MAX;
    for( int i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ )
    {
        m.i_ref_cost = REF_COST( 0, i_ref );
        i_halfpel_thresh -= m.i_ref_cost;

        /* search with ref */
        LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
        LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 0 );

        x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );

        if( h->mb.ref_blind_dupe == i_ref )
        {
            CP32( m.mv, a->l0.mvc[0][0] );
            x264_me_refine_qpel_refdupe( h, &m, p_halfpel_thresh );
        }
        else
        {
            x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
            x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
        }

        /* save mv for predicting neighbors */
        CP32( h->mb.mvr[0][i_ref][h->mb.i_mb_xy], m.mv );
        CP32( a->l0.mvc[i_ref][0], m.mv );

        /* early termination
         * SSD threshold would probably be better than SATD */
        if( i_ref == 0
            && a->b_try_skip
            && m.cost-m.cost_mv < 300*a->i_lambda
            &&  abs(m.mv[0]-h->mb.cache.pskip_mv[0])
              + abs(m.mv[1]-h->mb.cache.pskip_mv[1]) <= 1
            && x264_macroblock_probe_pskip( h ) )
        {
            h->mb.i_type = P_SKIP;
            x264_analyse_update_cache( h, a );
            assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
            return;
        }

        m.cost += m.i_ref_cost;
        i_halfpel_thresh += m.i_ref_cost;

        if( m.cost < a->l0.me16x16.cost )
            h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
    }

    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
    assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );

    h->mb.i_type = P_L0;
    if( a->i_mbrd )
    {
        x264_mb_init_fenc_cache( h, a->i_mbrd >= 2 || h->param.analyse.inter & X264_ANALYSE_PSUB8x8 );
        if( a->l0.me16x16.i_ref == 0 && M32( a->l0.me16x16.mv ) == M32( h->mb.cache.pskip_mv ) && !a->b_force_intra )
        {
            h->mb.i_partition = D_16x16;
            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
            a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
            if( !(h->mb.i_cbp_luma|h->mb.i_cbp_chroma) )
                h->mb.i_type = P_SKIP;
        }
    }
}

static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
{
    x264_me_t m;
    pixel **p_fenc = h->mb.pic.p_fenc;
    int i_maxref = h->mb.pic.i_fref[0]-1;

    h->mb.i_partition = D_8x8;

    #define CHECK_NEIGHBOUR(i)\
    {\
        int ref = h->mb.cache.ref[0][X264_SCAN8_0+i];\
        if( ref > i_maxref && ref != h->mb.ref_blind_dupe )\
            i_maxref = ref;\
    }

    /* early termination: if 16x16 chose ref 0, then evaluate no refs older
     * than those used by the neighbors */
    if( a->b_early_terminate && (i_maxref > 0 && (a->l0.me16x16.i_ref == 0 || a->l0.me16x16.i_ref == h->mb.ref_blind_dupe) &&
        h->mb.i_mb_type_top > 0 && h->mb.i_mb_type_left[0] > 0) )
    {
        i_maxref = 0;
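        /* scan8 neighbour offsets (cache stride 8): -8-1 = top-left,
         * -8+0 and -8+2 = the 8x8 blocks above, -8+4 = top-right,
         * 0-1 and 2*8-1 = the 8x8 blocks to the left. */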
        CHECK_NEIGHBOUR(  -8 - 1 );
        CHECK_NEIGHBOUR(  -8 + 0 );
        CHECK_NEIGHBOUR(  -8 + 2 );
        CHECK_NEIGHBOUR(  -8 + 4 );
        CHECK_NEIGHBOUR(   0 - 1 );
        CHECK_NEIGHBOUR( 2*8 - 1 );
    }
    #undef CHECK_NEIGHBOUR

    for( int i_ref = 0; i_ref <= i_maxref; i_ref++ )
        CP32( a->l0.mvc[i_ref][0], h->mb.mvr[0][i_ref][h->mb.i_mb_xy] );

    for( int i = 0; i < 4; i++ )
    {
        x264_me_t *l0m = &a->l0.me8x8[i];
        int x8 = i&1;
        int y8 = i>>1;

        m.i_pixel = PIXEL_8x8;

        LOAD_FENC( &m, p_fenc, 8*x8, 8*y8 );
        l0m->cost = INT_MAX;
        for( int i_ref = 0; i_ref <= i_maxref || i_ref == h->mb.ref_blind_dupe; )
        {
            m.i_ref_cost = REF_COST( 0, i_ref );

            LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
            LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );

            x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref );
            x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
            if( h->mb.ref_blind_dupe == i_ref )
            {
                CP32( m.mv, a->l0.mvc[0][i+1] );
                x264_me_refine_qpel_refdupe( h, &m, NULL );
            }
            else
                x264_me_search( h, &m, a->l0.mvc[i_ref], i+1 );

            m.cost += m.i_ref_cost;

            CP32( a->l0.mvc[i_ref][i+1], m.mv );

            if( m.cost < l0m->cost )
                h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
            if( i_ref == i_maxref && i_maxref < h->mb.ref_blind_dupe )
                i_ref = h->mb.ref_blind_dupe;
            else
                i_ref++;
        }
        x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, l0m->mv );
        x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, l0m->i_ref );

        a->i_satd8x8[0][i] = l0m->cost - ( l0m->cost_mv + l0m->i_ref_cost );

        /* If CABAC is on and we're not doing sub-8x8 analysis, the costs
           are effectively zero. */
        if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
            l0m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
    }

    a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
                      a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
    /* P_8x8 ref0 has no ref cost */
    if( !h->param.b_cabac && !(a->l0.me8x8[0].i_ref | a->l0.me8x8[1].i_ref |
                               a->l0.me8x8[2].i_ref | a->l0.me8x8[3].i_ref) )
        a->l0.i_cost8x8 -= REF_COST( 0, 0 ) * 4;
    h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
    h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
}

static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
{
    /* Duplicate refs are rarely useful in p8x8 due to the high cost of the
     * reference frame flags.  Thus, if we're not doing mixedrefs, just
     * don't bother analysing the dupes. */
    const int i_ref = h->mb.ref_blind_dupe == a->l0.me16x16.i_ref ? 0 : a->l0.me16x16.i_ref;
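    /* With CAVLC, an all-ref0 P_8x8 can be coded as P_8x8ref0, which implies
     * ref 0 for every partition, so the reference index is free in that case. */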
    const int i_ref_cost = h->param.b_cabac || i_ref ? REF_COST( 0, i_ref ) : 0;
    pixel **p_fenc = h->mb.pic.p_fenc;
    int i_mvc;
    int16_t (*mvc)[2] = a->l0.mvc[i_ref];

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x8;

    i_mvc = 1;
    CP32( mvc[0], a->l0.me16x16.mv );

    for( int i = 0; i < 4; i++ )
    {
        x264_me_t *m = &a->l0.me8x8[i];
        int x8 = i&1;
        int y8 = i>>1;

        m->i_pixel = PIXEL_8x8;
        m->i_ref_cost = i_ref_cost;

        LOAD_FENC( m, p_fenc, 8*x8, 8*y8 );
        LOAD_HPELS( m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
        LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );

        x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
        x264_me_search( h, m, mvc, i_mvc );

        x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, m->mv );

        CP32( mvc[i_mvc], m->mv );
        i_mvc++;

        a->i_satd8x8[0][i] = m->cost - m->cost_mv;

        /* mb type cost */
        m->cost += i_ref_cost;
        if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
            m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
    }

    a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
                      a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
    /* theoretically this should include 4*ref_cost,
     * but 3 seems a better approximation of cabac. */
    if( h->param.b_cabac )
        a->l0.i_cost8x8 -= i_ref_cost;
    h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
    h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
}

static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
{
    x264_me_t m;
    pixel **p_fenc = h->mb.pic.p_fenc;
    ALIGNED_4( int16_t mvc[3][2] );

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_16x8;

    for( int i = 0; i < 2; i++ )
    {
        x264_me_t *l0m = &a->l0.me16x8[i];
        const int minref = X264_MIN( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
        const int maxref = X264_MAX( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
        const int ref8[2] = { minref, maxref };
        const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;

        m.i_pixel = PIXEL_16x8;

        LOAD_FENC( &m, p_fenc, 0, 8*i );
        l0m->cost = INT_MAX;
        for( int j = 0; j < i_ref8s; j++ )
        {
            const int i_ref = ref8[j];
            m.i_ref_cost = REF_COST( 0, i_ref );

            /* if we skipped the 16x16 predictor, we wouldn't have to copy anything... */
            CP32( mvc[0], a->l0.mvc[i_ref][0] );
            CP32( mvc[1], a->l0.mvc[i_ref][2*i+1] );
            CP32( mvc[2], a->l0.mvc[i_ref][2*i+2] );

            LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 8*i );
            LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 8*i );

            x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref );
            x264_mb_predict_mv( h, 0, 8*i, 4, m.mvp );
            /* We can only take this shortcut if the first search was performed on ref0. */
            if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
            {
                /* We can just leave the MV from the previous ref search. */
                x264_me_refine_qpel_refdupe( h, &m, NULL );
            }
            else
                x264_me_search( h, &m, mvc, 3 );

            m.cost += m.i_ref_cost;

            if( m.cost < l0m->cost )
                h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
        }

        /* Early termination based on the current SATD score of partition[0]
           plus the estimated SATD score of partition[1]; the threshold gains
           a 25% margin when macroblock RD is enabled. */
        if( a->b_early_terminate && (!i && l0m->cost + a->i_cost_est16x8[1] > i_best_satd * (4 + !!a->i_mbrd) / 4) )
        {
            a->l0.i_cost16x8 = COST_MAX;
            return;
        }

        x264_macroblock_cache_mv_ptr( h, 0, 2*i, 4, 2, 0, l0m->mv );
        x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, l0m->i_ref );
    }

    a->l0.i_cost16x8 = a->l0.me16x8[0].cost + a->l0.me16x8[1].cost;
}

static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
{
    x264_me_t m;
    pixel **p_fenc = h->mb.pic.p_fenc;
    ALIGNED_4( int16_t mvc[3][2] );

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x16;

    for( int i = 0; i < 2; i++ )
    {
        x264_me_t *l0m = &a->l0.me8x16[i];
        const int minref = X264_MIN( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
        const int maxref = X264_MAX( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
        const int ref8[2] = { minref, maxref };
        const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;

        m.i_pixel = PIXEL_8x16;

        LOAD_FENC( &m, p_fenc, 8*i, 0 );
        l0m->cost = INT_MAX;
        for( int j = 0; j < i_ref8s; j++ )
        {
            const int i_ref = ref8[j];
            m.i_ref_cost = REF_COST( 0, i_ref );

            CP32( mvc[0], a->l0.mvc[i_ref][0] );
            CP32( mvc[1], a->l0.mvc[i_ref][i+1] );
            CP32( mvc[2], a->l0.mvc[i_ref][i+3] );

            LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*i, 0 );
            LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*i, 0 );

            x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref );
            x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
            /* We can only take this shortcut if the first search was performed on ref0. */
            if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
            {
                /* We can just leave the MV from the previous ref search. */
                x264_me_refine_qpel_refdupe( h, &m, NULL );
            }
            else
                x264_me_search( h, &m, mvc, 3 );

            m.cost += m.i_ref_cost;

            if( m.cost < l0m->cost )
                h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
        }

        /* Early termination based on the current SATD score of partition[0]
           plus the estimated SATD score of partition[1]; the threshold gains
           a 25% margin when macroblock RD is enabled. */
        if( a->b_early_terminate && (!i && l0m->cost + a->i_cost_est8x16[1] > i_best_satd * (4 + !!a->i_mbrd) / 4) )
        {
            a->l0.i_cost8x16 = COST_MAX;
            return;
        }

        x264_macroblock_cache_mv_ptr( h, 2*i, 0, 2, 4, 0, l0m->mv );
        x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, l0m->i_ref );
    }

    a->l0.i_cost8x16 = a->l0.me8x16[0].cost + a->l0.me8x16[1].cost;
}

static ALWAYS_INLINE int x264_mb_analyse_inter_p4x4_chroma_internal( x264_t *h, x264_mb_analysis_t *a,
                                                                     pixel **p_fref, int i8x8, int size, int chroma )
{
    ALIGNED_ARRAY_16( pixel, pix1,[16*16] );
    pixel *pix2 = pix1+8;
    int i_stride = h->mb.pic.i_stride[1];
    int chroma_h_shift = chroma <= CHROMA_422;
    int chroma_v_shift = chroma == CHROMA_420;
    int or = 8*(i8x8&1) + (4>>chroma_v_shift)*(i8x8&2)*i_stride;
    int i_ref = a->l0.me8x8[i8x8].i_ref;
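    /* MBAFF: an odd reference index points at the opposite-parity field, so
     * chroma needs a vertical offset of -2 (top MB) or +2 (bottom MB) in
     * quarter-pels when vertical chroma subsampling is in use. */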
    int mvy_offset = chroma_v_shift && MB_INTERLACED & i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
    x264_weight_t *weight = h->sh.weight[i_ref];

    // FIXME weight can be done on 4x4 blocks even if mc is smaller
#define CHROMA4x4MC( width, height, me, x, y ) \
    if( chroma == CHROMA_444 ) \
    { \
        int mvx = (me).mv[0] + 4*2*x; \
        int mvy = (me).mv[1] + 4*2*y; \
        h->mc.mc_luma( &pix1[2*x+2*y*16], 16, &h->mb.pic.p_fref[0][i_ref][4], i_stride, \
                       mvx, mvy, 2*width, 2*height, &h->sh.weight[i_ref][1] ); \
        h->mc.mc_luma( &pix2[2*x+2*y*16], 16, &h->mb.pic.p_fref[0][i_ref][8], i_stride, \
                       mvx, mvy, 2*width, 2*height, &h->sh.weight[i_ref][2] ); \
    } \
    else \
    { \
        int offset = x + (2>>chroma_v_shift)*16*y; \
        int chroma_height = (2>>chroma_v_shift)*height; \
        h->mc.mc_chroma( &pix1[offset], &pix2[offset], 16, &p_fref[4][or+2*x+(2>>chroma_v_shift)*y*i_stride], i_stride, \
                         (me).mv[0], (2>>chroma_v_shift)*((me).mv[1]+mvy_offset), width, chroma_height ); \
        if( weight[1].weightfn ) \
            weight[1].weightfn[width>>2]( &pix1[offset], 16, &pix1[offset], 16, &weight[1], chroma_height ); \
        if( weight[2].weightfn ) \
            weight[2].weightfn[width>>2]( &pix2[offset], 16, &pix2[offset], 16, &weight[2], chroma_height ); \
    }

    if( size == PIXEL_4x4 )
    {
        x264_me_t *m = a->l0.me4x4[i8x8];
        CHROMA4x4MC( 2,2, m[0], 0,0 );
        CHROMA4x4MC( 2,2, m[1], 2,0 );
        CHROMA4x4MC( 2,2, m[2], 0,2 );
        CHROMA4x4MC( 2,2, m[3], 2,2 );
    }
    else if( size == PIXEL_8x4 )
    {
        x264_me_t *m = a->l0.me8x4[i8x8];
        CHROMA4x4MC( 4,2, m[0], 0,0 );
        CHROMA4x4MC( 4,2, m[1], 0,2 );
    }
    else
    {
        x264_me_t *m = a->l0.me4x8[i8x8];
        CHROMA4x4MC( 2,4, m[0], 0,0 );
        CHROMA4x4MC( 2,4, m[1], 2,0 );
    }
#undef CHROMA4x4MC

    int oe = (8>>chroma_h_shift)*(i8x8&1) + (4>>chroma_v_shift)*(i8x8&2)*FENC_STRIDE;
    int chromapix = chroma == CHROMA_444 ? PIXEL_8x8 : chroma == CHROMA_422 ? PIXEL_4x8 : PIXEL_4x4;
    return h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[1][oe], FENC_STRIDE, pix1, 16 )
         + h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[2][oe], FENC_STRIDE, pix2, 16 );
}

static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, pixel **p_fref, int i8x8, int size )
{
    if( CHROMA_FORMAT == CHROMA_444 )
        return x264_mb_analyse_inter_p4x4_chroma_internal( h, a, p_fref, i8x8, size, CHROMA_444 );
    else if( CHROMA_FORMAT == CHROMA_422 )
        return x264_mb_analyse_inter_p4x4_chroma_internal( h, a, p_fref, i8x8, size, CHROMA_422 );
    else
        return x264_mb_analyse_inter_p4x4_chroma_internal( h, a, p_fref, i8x8, size, CHROMA_420 );
}

static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
{
    pixel **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
    pixel **p_fenc = h->mb.pic.p_fenc;
    const int i_ref = a->l0.me8x8[i8x8].i_ref;

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x8;

    for( int i4x4 = 0; i4x4 < 4; i4x4++ )
    {
        const int idx = 4*i8x8 + i4x4;
        const int x4 = block_idx_x[idx];
        const int y4 = block_idx_y[idx];
        const int i_mvc = (i4x4 == 0);

        x264_me_t *m = &a->l0.me4x4[i8x8][i4x4];

        m->i_pixel = PIXEL_4x4;

        LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
        LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
        LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );

        x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
        x264_me_search( h, m, &a->l0.me8x8[i8x8].mv, i_mvc );

        x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 1, 0, m->mv );
    }
    a->l0.i_cost4x4[i8x8] = a->l0.me4x4[i8x8][0].cost +
                            a->l0.me4x4[i8x8][1].cost +
                            a->l0.me4x4[i8x8][2].cost +
                            a->l0.me4x4[i8x8][3].cost +
                            REF_COST( 0, i_ref ) +
                            a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x4];
    if( h->mb.b_chroma_me )
        a->l0.i_cost4x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x4 );
}

static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
{
    pixel **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
    pixel **p_fenc = h->mb.pic.p_fenc;
    const int i_ref = a->l0.me8x8[i8x8].i_ref;

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x8;

    for( int i8x4 = 0; i8x4 < 2; i8x4++ )
    {
        const int idx = 4*i8x8 + 2*i8x4;
        const int x4 = block_idx_x[idx];
        const int y4 = block_idx_y[idx];
        const int i_mvc = (i8x4 == 0);

        x264_me_t *m = &a->l0.me8x4[i8x8][i8x4];

        m->i_pixel = PIXEL_8x4;

        LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
        LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
        LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );

        x264_mb_predict_mv( h, 0, idx, 2, m->mvp );
        x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );

        x264_macroblock_cache_mv_ptr( h, x4, y4, 2, 1, 0, m->mv );
    }
    a->l0.i_cost8x4[i8x8] = a->l0.me8x4[i8x8][0].cost + a->l0.me8x4[i8x8][1].cost +
                            REF_COST( 0, i_ref ) +
                            a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x4];
    if( h->mb.b_chroma_me )
        a->l0.i_cost8x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_8x4 );
}

static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
{
    pixel **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
    pixel **p_fenc = h->mb.pic.p_fenc;
    const int i_ref = a->l0.me8x8[i8x8].i_ref;

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x8;

    for( int i4x8 = 0; i4x8 < 2; i4x8++ )
    {
        const int idx = 4*i8x8 + i4x8;
        const int x4 = block_idx_x[idx];
        const int y4 = block_idx_y[idx];
        const int i_mvc = (i4x8 == 0);

        x264_me_t *m = &a->l0.me4x8[i8x8][i4x8];

        m->i_pixel = PIXEL_4x8;

        LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
        LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
        LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );

        x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
        x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );

        x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 2, 0, m->mv );
    }
    a->l0.i_cost4x8[i8x8] = a->l0.me4x8[i8x8][0].cost + a->l0.me4x8[i8x8][1].cost +
                            REF_COST( 0, i_ref ) +
                            a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x8];
    if( h->mb.b_chroma_me )
        a->l0.i_cost4x8[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x8 );
}

static ALWAYS_INLINE int x264_analyse_bi_chroma( x264_t *h, x264_mb_analysis_t *a, int idx, int i_pixel )
{
    ALIGNED_ARRAY_16( pixel, pix, [4],[16*16] );
    ALIGNED_ARRAY_16( pixel,  bi, [2],[16*16] );
    int i_chroma_cost = 0;
    int chromapix = h->luma2chroma_pixel[i_pixel];

#define COST_BI_CHROMA( m0, m1, width, height ) \
{ \
    if( CHROMA444 ) \
    { \
        h->mc.mc_luma( pix[0], 16, &m0.p_fref[4], m0.i_stride[1], \
                       m0.mv[0], m0.mv[1], width, height, x264_weight_none ); \
        h->mc.mc_luma( pix[1], 16, &m0.p_fref[8], m0.i_stride[2], \
                       m0.mv[0], m0.mv[1], width, height, x264_weight_none ); \
        h->mc.mc_luma( pix[2], 16, &m1.p_fref[4], m1.i_stride[1], \
                       m1.mv[0], m1.mv[1], width, height, x264_weight_none ); \
        h->mc.mc_luma( pix[3], 16, &m1.p_fref[8], m1.i_stride[2], \
                       m1.mv[0], m1.mv[1], width, height, x264_weight_none ); \
    } \
    else \
    { \
        int v_shift = CHROMA_V_SHIFT; \
        int l0_mvy_offset = v_shift & MB_INTERLACED & m0.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; \
        int l1_mvy_offset = v_shift & MB_INTERLACED & m1.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; \
        h->mc.mc_chroma( pix[0], pix[1], 16, m0.p_fref[4], m0.i_stride[1], \
                         m0.mv[0], 2*(m0.mv[1]+l0_mvy_offset)>>v_shift, width>>1, height>>v_shift ); \
        h->mc.mc_chroma( pix[2], pix[3], 16, m1.p_fref[4], m1.i_stride[1], \
                         m1.mv[0], 2*(m1.mv[1]+l1_mvy_offset)>>v_shift, width>>1, height>>v_shift ); \
    } \
    h->mc.avg[chromapix]( bi[0], 16, pix[0], 16, pix[2], 16, h->mb.bipred_weight[m0.i_ref][m1.i_ref] ); \
    h->mc.avg[chromapix]( bi[1], 16, pix[1], 16, pix[3], 16, h->mb.bipred_weight[m0.i_ref][m1.i_ref] ); \
    i_chroma_cost = h->pixf.mbcmp[chromapix]( m0.p_fenc[1], FENC_STRIDE, bi[0], 16 ) \
                  + h->pixf.mbcmp[chromapix]( m0.p_fenc[2], FENC_STRIDE, bi[1], 16 ); \
}

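    /* Each branch motion-compensates the chroma of both references (extra
     * luma-sized planes for 4:4:4, interleaved chroma otherwise), averages
     * them with the bipred weights, and scores the result against fenc. */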
    if( i_pixel == PIXEL_16x16 )
        COST_BI_CHROMA( a->l0.bi16x16, a->l1.bi16x16, 16, 16 )
    else if( i_pixel == PIXEL_16x8 )
        COST_BI_CHROMA( a->l0.me16x8[idx], a->l1.me16x8[idx], 16, 8 )
    else if( i_pixel == PIXEL_8x16 )
        COST_BI_CHROMA( a->l0.me8x16[idx], a->l1.me8x16[idx], 8, 16 )
    else
        COST_BI_CHROMA( a->l0.me8x8[idx], a->l1.me8x8[idx], 8, 8 )

    return i_chroma_cost;
}

static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
{
    /* Assumes that fdec still contains the results of
     * x264_mb_predict_mv_direct16x16 and x264_mb_mc */

    pixel *p_fenc = h->mb.pic.p_fenc[0];
    pixel *p_fdec = h->mb.pic.p_fdec[0];

    a->i_cost16x16direct = a->i_lambda * i_mb_b_cost_table[B_DIRECT];
    if( h->param.analyse.inter & X264_ANALYSE_BSUB16x16 )
    {
        int chromapix = h->luma2chroma_pixel[PIXEL_8x8];

        for( int i = 0; i < 4; i++ )
        {
            const int x = (i&1)*8;
            const int y = (i>>1)*8;
            a->i_cost8x8direct[i] = h->pixf.mbcmp[PIXEL_8x8]( &p_fenc[x+y*FENC_STRIDE], FENC_STRIDE,
                                                              &p_fdec[x+y*FDEC_STRIDE], FDEC_STRIDE );
            if( h->mb.b_chroma_me )
            {
                int fenc_offset = (x>>CHROMA_H_SHIFT) + (y>>CHROMA_V_SHIFT)*FENC_STRIDE;
                int fdec_offset = (x>>CHROMA_H_SHIFT) + (y>>CHROMA_V_SHIFT)*FDEC_STRIDE;
                a->i_cost8x8direct[i] += h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[1][fenc_offset], FENC_STRIDE,
                                                                   &h->mb.pic.p_fdec[1][fdec_offset], FDEC_STRIDE )
                                       + h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[2][fenc_offset], FENC_STRIDE,
                                                                   &h->mb.pic.p_fdec[2][fdec_offset], FDEC_STRIDE );
            }
            a->i_cost16x16direct += a->i_cost8x8direct[i];

            /* mb type cost */
            a->i_cost8x8direct[i] += a->i_lambda * i_sub_mb_b_cost_table[D_DIRECT_8x8];
        }
    }
    else
    {
        a->i_cost16x16direct += h->pixf.mbcmp[PIXEL_16x16]( p_fenc, FENC_STRIDE, p_fdec, FDEC_STRIDE );
        if( h->mb.b_chroma_me )
        {
            int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
            a->i_cost16x16direct += h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE )
                                 +  h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE );
        }
    }
}

static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
{
    ALIGNED_ARRAY_16( pixel, pix0,[16*16] );
    ALIGNED_ARRAY_16( pixel, pix1,[16*16] );
    pixel *src0, *src1;
    int stride0 = 16, stride1 = 16;
    int i_ref, i_mvc;
    ALIGNED_4( int16_t mvc[9][2] );
    int try_skip = a->b_try_skip;
    int list1_skipped = 0;
    int i_halfpel_thresh[2] = {INT_MAX, INT_MAX};
    int *p_halfpel_thresh[2] = {(a->b_early_terminate && h->mb.pic.i_fref[0]>1) ? &i_halfpel_thresh[0] : NULL,
                                (a->b_early_terminate && h->mb.pic.i_fref[1]>1) ? &i_halfpel_thresh[1] : NULL};

    x264_me_t m;
    m.i_pixel = PIXEL_16x16;

    LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );

    /* 16x16 Search on list 0 and list 1 */
    a->l0.me16x16.cost = INT_MAX;
    a->l1.me16x16.cost = INT_MAX;
    for( int l = 1; l >= 0; )
    {
        x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;

        /* This loop is extremely munged in order to facilitate the following order of operations,
         * necessary for an efficient fast skip.
         * 1.  Search list1 ref0.
         * 2.  Search list0 ref0.
         * 3.  Try skip.
         * 4.  Search the rest of list0.
         * 5.  Go back and finish list1.
         */
        for( i_ref = (list1_skipped && l == 1) ? 1 : 0; i_ref < h->mb.pic.i_fref[l]; i_ref++ )
        {
            if( try_skip && l == 1 && i_ref > 0 )
            {
                list1_skipped = 1;
                break;
            }

            m.i_ref_cost = REF_COST( l, i_ref );

            /* search with ref */
            LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 0, 0 );
            x264_mb_predict_mv_16x16( h, l, i_ref, m.mvp );
            x264_mb_predict_mv_ref16x16( h, l, i_ref, mvc, &i_mvc );
            x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh[l] );

            /* add ref cost */
            m.cost += m.i_ref_cost;

            if( m.cost < lX->me16x16.cost )
                h->mc.memcpy_aligned( &lX->me16x16, &m, sizeof(x264_me_t) );

            /* save mv for predicting neighbors */
            CP32( lX->mvc[i_ref][0], m.mv );
            CP32( h->mb.mvr[l][i_ref][h->mb.i_mb_xy], m.mv );

            /* Fast skip detection. */
            if( i_ref == 0 && try_skip )
            {
                if( abs(lX->me16x16.mv[0]-h->mb.cache.direct_mv[l][0][0]) +
                    abs(lX->me16x16.mv[1]-h->mb.cache.direct_mv[l][0][1]) > 1 )
                {
                    try_skip = 0;
                }
                else if( !l )
                {
                    /* We already tested skip */
                    h->mb.i_type = B_SKIP;
                    x264_analyse_update_cache( h, a );
                    return;
                }
            }
        }
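        /* Loop control for the order described above: after list1 was cut
         * short for the skip test, finish list0, then come back and complete
         * list1 from ref 1 before terminating. */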
        if( list1_skipped && l == 1 && i_ref == h->mb.pic.i_fref[1] )
            break;
        if( list1_skipped && l == 0 )
            l = 1;
        else
            l--;
    }

    /* get cost of BI mode */
    h->mc.memcpy_aligned( &a->l0.bi16x16, &a->l0.me16x16, sizeof(x264_me_t) );
    h->mc.memcpy_aligned( &a->l1.bi16x16, &a->l1.me16x16, sizeof(x264_me_t) );
    int ref_costs = REF_COST( 0, a->l0.bi16x16.i_ref ) + REF_COST( 1, a->l1.bi16x16.i_ref );
    src0 = h->mc.get_ref( pix0, &stride0,
                          h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref], h->mb.pic.i_stride[0],
                          a->l0.bi16x16.mv[0], a->l0.bi16x16.mv[1], 16, 16, x264_weight_none );
    src1 = h->mc.get_ref( pix1, &stride1,
                          h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref], h->mb.pic.i_stride[0],
                          a->l1.bi16x16.mv[0], a->l1.bi16x16.mv[1], 16, 16, x264_weight_none );

    h->mc.avg[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1, h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );

    a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
                     + ref_costs
                     + a->l0.bi16x16.cost_mv
                     + a->l1.bi16x16.cost_mv;

    if( h->mb.b_chroma_me )
        a->i_cost16x16bi += x264_analyse_bi_chroma( h, a, 0, PIXEL_16x16 );

    /* Always try the 0,0,0,0 vector; helps avoid errant motion vectors in fades */
    if( M32( a->l0.bi16x16.mv ) | M32( a->l1.bi16x16.mv ) )
    {
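        /* p_cost_mv is indexed by the coded MV residual, so a zero vector
         * against predictor mvp costs p_cost_mv[-mvp] per component. */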
        int l0_mv_cost = a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[0]]
                       + a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[1]];
        int l1_mv_cost = a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[0]]
                       + a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[1]];
        h->mc.avg[PIXEL_16x16]( pix0, 16, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][0], h->mb.pic.i_stride[0],
                                h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][0], h->mb.pic.i_stride[0],
                                h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
        int cost00 = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
                   + ref_costs + l0_mv_cost + l1_mv_cost;

        if( h->mb.b_chroma_me )
        {
            ALIGNED_ARRAY_16( pixel, bi, [16*FENC_STRIDE] );

            if( CHROMA444 )
            {
                h->mc.avg[PIXEL_16x16]( bi, FENC_STRIDE, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4], h->mb.pic.i_stride[1],
                                        h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][4], h->mb.pic.i_stride[1],
                                        h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
                cost00 += h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[1], FENC_STRIDE, bi, FENC_STRIDE );
                h->mc.avg[PIXEL_16x16]( bi, FENC_STRIDE, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][8], h->mb.pic.i_stride[2],
                                        h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][8], h->mb.pic.i_stride[2],
                                        h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
                cost00 += h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[2], FENC_STRIDE, bi, FENC_STRIDE );
            }
            else
            {
                ALIGNED_ARRAY_16( pixel, pixuv, [2],[16*FENC_STRIDE] );
                int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
                int v_shift = CHROMA_V_SHIFT;

                if( v_shift & MB_INTERLACED & a->l0.bi16x16.i_ref )
                {
                    int l0_mvy_offset = (h->mb.i_mb_y & 1)*4 - 2;
                    h->mc.mc_chroma( pixuv[0], pixuv[0]+8, FENC_STRIDE, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4],
                                     h->mb.pic.i_stride[1], 0, 0 + l0_mvy_offset, 8, 8 );
                }
                else
                    h->mc.load_deinterleave_chroma_fenc( pixuv[0], h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4],
                                                         h->mb.pic.i_stride[1], 16>>v_shift );

                if( v_shift & MB_INTERLACED & a->l1.bi16x16.i_ref )
                {
                    int l1_mvy_offset = (h->mb.i_mb_y & 1)*4 - 2;
                    h->mc.mc_chroma( pixuv[1], pixuv[1]+8, FENC_STRIDE, h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][4],
                                     h->mb.pic.i_stride[1], 0, 0 + l1_mvy_offset, 8, 8 );
                }
                else
                    h->mc.load_deinterleave_chroma_fenc( pixuv[1], h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][4],
                                                         h->mb.pic.i_stride[1], 16>>v_shift );

                h->mc.avg[chromapix]( bi,   FENC_STRIDE, pixuv[0],   FENC_STRIDE, pixuv[1],   FENC_STRIDE,
                                      h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
                h->mc.avg[chromapix]( bi+8, FENC_STRIDE, pixuv[0]+8, FENC_STRIDE, pixuv[1]+8, FENC_STRIDE,
                                      h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );

                cost00 += h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, bi,   FENC_STRIDE )
                       +  h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, bi+8, FENC_STRIDE );
            }
        }

        if( cost00 < a->i_cost16x16bi )
        {
            M32( a->l0.bi16x16.mv ) = 0;
            M32( a->l1.bi16x16.mv ) = 0;
            a->l0.bi16x16.cost_mv = l0_mv_cost;
            a->l1.bi16x16.cost_mv = l1_mv_cost;
            a->i_cost16x16bi = cost00;
        }
    }

    /* mb type cost */
    a->i_cost16x16bi   += a->i_lambda * i_mb_b_cost_table[B_BI_BI];
    a->l0.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L0_L0];
    a->l1.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L1_L1];
}

static inline void x264_mb_cache_mv_p8x8( x264_t *h, x264_mb_analysis_t *a, int i )
{
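    /* 8x8 block index i (raster order) -> cache coordinates in 4x4 units:
     * x = 2*(i&1); y = 2*(i>>1), which for i in 0..3 equals i&2. */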
    int x = 2*(i&1);
    int y = i&2;

    switch( h->mb.i_sub_partition[i] )
    {
        case D_L0_8x8:
            x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, a->l0.me8x8[i].mv );
            break;
        case D_L0_8x4:
            x264_macroblock_cache_mv_ptr( h, x, y+0, 2, 1, 0, a->l0.me8x4[i][0].mv );
            x264_macroblock_cache_mv_ptr( h, x, y+1, 2, 1, 0, a->l0.me8x4[i][1].mv );
            break;
        case D_L0_4x8:
            x264_macroblock_cache_mv_ptr( h, x+0, y, 1, 2, 0, a->l0.me4x8[i][0].mv );
            x264_macroblock_cache_mv_ptr( h, x+1, y, 1, 2, 0, a->l0.me4x8[i][1].mv );
            break;
        case D_L0_4x4:
            x264_macroblock_cache_mv_ptr( h, x+0, y+0, 1, 1, 0, a->l0.me4x4[i][0].mv );
            x264_macroblock_cache_mv_ptr( h, x+1, y+0, 1, 1, 0, a->l0.me4x4[i][1].mv );
            x264_macroblock_cache_mv_ptr( h, x+0, y+1, 1, 1, 0, a->l0.me4x4[i][2].mv );
            x264_macroblock_cache_mv_ptr( h, x+1, y+1, 1, 1, 0, a->l0.me4x4[i][3].mv );
            break;
        default:
            x264_log( h, X264_LOG_ERROR, "internal error\n" );
            break;
    }
}

static void x264_mb_load_mv_direct8x8( x264_t *h, int idx )
{
    int x = 2*(idx&1);
    int y = idx&2;
    x264_macroblock_cache_ref( h, x, y, 2, 2, 0, h->mb.cache.direct_ref[0][idx] );
    x264_macroblock_cache_ref( h, x, y, 2, 2, 1, h->mb.cache.direct_ref[1][idx] );
    x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, h->mb.cache.direct_mv[0][idx] );
    x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 1, h->mb.cache.direct_mv[1][idx] );
}

#define CACHE_MV_BI(x,y,dx,dy,me0,me1,part) \
    if( x264_mb_partition_listX_table[0][part] ) \
    { \
        x264_macroblock_cache_ref( h, x,y,dx,dy, 0, me0.i_ref ); \
        x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 0, me0.mv ); \
    } \
    else \
    { \
        x264_macroblock_cache_ref( h, x,y,dx,dy, 0, -1 ); \
        x264_macroblock_cache_mv(  h, x,y,dx,dy, 0, 0 ); \
        if( b_mvd ) \
            x264_macroblock_cache_mvd( h, x,y,dx,dy, 0, 0 ); \
    } \
    if( x264_mb_partition_listX_table[1][part] ) \
    { \
        x264_macroblock_cache_ref( h, x,y,dx,dy, 1, me1.i_ref ); \
        x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 1, me1.mv ); \
    } \
    else \
    { \
        x264_macroblock_cache_ref( h, x,y,dx,dy, 1, -1 ); \
        x264_macroblock_cache_mv(  h, x,y,dx,dy, 1, 0 ); \
        if( b_mvd ) \
            x264_macroblock_cache_mvd( h, x,y,dx,dy, 1, 0 ); \
    }

static inline void x264_mb_cache_mv_b8x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
{
    int x = 2*(i&1);
    int y = i&2;
    if( h->mb.i_sub_partition[i] == D_DIRECT_8x8 )
    {
        x264_mb_load_mv_direct8x8( h, i );
        if( b_mvd )
        {
            x264_macroblock_cache_mvd(  h, x, y, 2, 2, 0, 0 );
            x264_macroblock_cache_mvd(  h, x, y, 2, 2, 1, 0 );
            x264_macroblock_cache_skip( h, x, y, 2, 2, 1 );
        }
    }
    else
    {
        CACHE_MV_BI( x, y, 2, 2, a->l0.me8x8[i], a->l1.me8x8[i], h->mb.i_sub_partition[i] );
    }
}
static inline void x264_mb_cache_mv_b16x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
{
    CACHE_MV_BI( 0, 2*i, 4, 2, a->l0.me16x8[i], a->l1.me16x8[i], a->i_mb_partition16x8[i] );
}
static inline void x264_mb_cache_mv_b8x16( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
{
    CACHE_MV_BI( 2*i, 0, 2, 4, a->l0.me8x16[i], a->l1.me8x16[i], a->i_mb_partition8x16[i] );
}
#undef CACHE_MV_BI

static void x264_mb_analyse_inter_b8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
{
    ALIGNED_ARRAY_16( pixel, pix,[2],[8*8] );
    int i_maxref[2] = {h->mb.pic.i_fref[0]-1, h->mb.pic.i_fref[1]-1};

    /* early termination: if 16x16 chose ref 0, then evaluate no refs older
     * than those used by the neighbors */
    #define CHECK_NEIGHBOUR(i)\
    {\
        int ref = h->mb.cache.ref[l][X264_SCAN8_0+i];\
        if( ref > i_maxref[l] )\
            i_maxref[l] = ref;\
    }

    for( int l = 0; l < 2; l++ )
    {
        x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
        if( i_maxref[l] > 0 && lX->me16x16.i_ref == 0 &&
            h->mb.i_mb_type_top > 0 && h->mb.i_mb_type_left[0] > 0 )
        {
            i_maxref[l] = 0;
            CHECK_NEIGHBOUR(  -8 - 1 );
            CHECK_NEIGHBOUR(  -8 + 0 );
            CHECK_NEIGHBOUR(  -8 + 2 );
            CHECK_NEIGHBOUR(  -8 + 4 );
            CHECK_NEIGHBOUR(   0 - 1 );
            CHECK_NEIGHBOUR( 2*8 - 1 );
        }
    }

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x8;

    a->i_cost8x8bi = 0;

    for( int i = 0; i < 4; i++ )
    {
        int x8 = i&1;
        int y8 = i>>1;
        int i_part_cost;
        int i_part_cost_bi;
        int stride[2] = {8,8};
        pixel *src[2];
        x264_me_t m;
        m.i_pixel = PIXEL_8x8;
        LOAD_FENC( &m, h->mb.pic.p_fenc, 8*x8, 8*y8 );

        for( int l = 0; l < 2; l++ )
        {
            x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;

            lX->me8x8[i].cost = INT_MAX;
            for( int i_ref = 0; i_ref <= i_maxref[l]; i_ref++ )
            {
                m.i_ref_cost = REF_COST( l, i_ref );

                LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 8*x8, 8*y8 );

                x264_macroblock_cache_ref( h, x8*2, y8*2, 2, 2, l, i_ref );
                x264_mb_predict_mv( h, l, 4*i, 2, m.mvp );
                x264_me_search( h, &m, lX->mvc[i_ref], i+1 );
                m.cost += m.i_ref_cost;

                if( m.cost < lX->me8x8[i].cost )
                {
                    h->mc.memcpy_aligned( &lX->me8x8[i], &m, sizeof(x264_me_t) );
                    a->i_satd8x8[l][i] = m.cost - ( m.cost_mv + m.i_ref_cost );
                }

                /* save mv for predicting other partitions within this MB */
                CP32( lX->mvc[i_ref][i+1], m.mv );
            }
        }

        /* BI mode */
        src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me8x8[i].p_fref, a->l0.me8x8[i].i_stride[0],
                                a->l0.me8x8[i].mv[0], a->l0.me8x8[i].mv[1], 8, 8, x264_weight_none );
        src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me8x8[i].p_fref, a->l1.me8x8[i].i_stride[0],
                                a->l1.me8x8[i].mv[0], a->l1.me8x8[i].mv[1], 8, 8, x264_weight_none );
        h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1],
                                h->mb.bipred_weight[a->l0.me8x8[i].i_ref][a->l1.me8x8[i].i_ref] );

        a->i_satd8x8[2][i] = h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 );
        i_part_cost_bi = a->i_satd8x8[2][i] + a->l0.me8x8[i].cost_mv + a->l1.me8x8[i].cost_mv
                         + a->l0.me8x8[i].i_ref_cost + a->l1.me8x8[i].i_ref_cost
                         + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];

        if( h->mb.b_chroma_me )
        {
            int i_chroma_cost = x264_analyse_bi_chroma( h, a, i, PIXEL_8x8 );
            i_part_cost_bi += i_chroma_cost;
            a->i_satd8x8[2][i] += i_chroma_cost;
        }

        a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
        a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];

        i_part_cost = a->l0.me8x8[i].cost;
        h->mb.i_sub_partition[i] = D_L0_8x8;
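        /* COPY2_IF_LT( cost, new_cost, type, new_type ): if new_cost < cost,
         * adopt both the new cost and the new partition type. */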
        COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
        COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
        COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
        a->i_cost8x8bi += i_part_cost;

        /* XXX Needed for x264_mb_predict_mv */
        x264_mb_cache_mv_b8x8( h, a, i, 0 );
    }

    /* mb type cost */
    a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
}

static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
{
    pixel **p_fref[2] =
        { h->mb.pic.p_fref[0][a->l0.me16x16.i_ref],
          h->mb.pic.p_fref[1][a->l1.me16x16.i_ref] };
    ALIGNED_ARRAY_16( pixel, pix,[2],[8*8] );

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x8;

    a->i_cost8x8bi = 0;

    for( int i = 0; i < 4; i++ )
    {
        int x8 = i&1;
        int y8 = i>>1;
        int i_part_cost;
        int i_part_cost_bi = 0;
        int stride[2] = {8,8};
        pixel *src[2];

        for( int l = 0; l < 2; l++ )
        {
            x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
            x264_me_t *m = &lX->me8x8[i];
            m->i_pixel = PIXEL_8x8;
            LOAD_FENC( m, h->mb.pic.p_fenc, 8*x8, 8*y8 );

            m->i_ref_cost = REF_COST( l, lX->me16x16.i_ref );
            m->i_ref = lX->me16x16.i_ref;

            LOAD_HPELS( m, p_fref[l], l, lX->me16x16.i_ref, 8*x8, 8*y8 );

            x264_macroblock_cache_ref( h, x8*2, y8*2, 2, 2, l, lX->me16x16.i_ref );
            x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
            x264_me_search( h, m, &lX->me16x16.mv, 1 );
            a->i_satd8x8[l][i] = m->cost - m->cost_mv;
            m->cost += m->i_ref_cost;

            x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, l, m->mv );

            /* save mv for predicting other partitions within this MB */
            CP32( lX->mvc[lX->me16x16.i_ref][i+1], m->mv );

            /* BI mode */
            src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
                                    m->mv[0], m->mv[1], 8, 8, x264_weight_none );
            i_part_cost_bi += m->cost_mv + m->i_ref_cost;
        }
        h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.me16x16.i_ref][a->l1.me16x16.i_ref] );
        a->i_satd8x8[2][i] = h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 );
        i_part_cost_bi += a->i_satd8x8[2][i] + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
        a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
        a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];

        if( h->mb.b_chroma_me )
        {
            int i_chroma_cost = x264_analyse_bi_chroma( h, a, i, PIXEL_8x8 );
            i_part_cost_bi += i_chroma_cost;
            a->i_satd8x8[2][i] += i_chroma_cost;
        }

        i_part_cost = a->l0.me8x8[i].cost;
        h->mb.i_sub_partition[i] = D_L0_8x8;
        COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
        COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
        COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
        a->i_cost8x8bi += i_part_cost;

        /* XXX Needed for x264_mb_predict_mv */
        x264_mb_cache_mv_b8x8( h, a, i, 0 );
    }

    /* mb type cost */
    a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
}

static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
{
    ALIGNED_ARRAY_16( pixel, pix,[2],[16*8] );
    ALIGNED_4( int16_t mvc[3][2] );

    h->mb.i_partition = D_16x8;
    a->i_cost16x8bi = 0;

    for( int i = 0; i < 2; i++ )
    {
        int i_part_cost;
        int i_part_cost_bi = 0;
        int stride[2] = {16,16};
        pixel *src[2];
        x264_me_t m;
        m.i_pixel = PIXEL_16x8;
        LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 8*i );

        for( int l = 0; l < 2; l++ )
        {
            x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
            int ref8[2] = { lX->me8x8[2*i].i_ref, lX->me8x8[2*i+1].i_ref };
            int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
            lX->me16x8[i].cost = INT_MAX;
            for( int j = 0; j < i_ref8s; j++ )
            {
                int i_ref = ref8[j];
                m.i_ref_cost = REF_COST( l, i_ref );

                LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 0, 8*i );

                CP32( mvc[0], lX->mvc[i_ref][0] );
                CP32( mvc[1], lX->mvc[i_ref][2*i+1] );
                CP32( mvc[2], lX->mvc[i_ref][2*i+2] );

                x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, l, i_ref );
                x264_mb_predict_mv( h, l, 8*i, 4, m.mvp );
                x264_me_search( h, &m, mvc, 3 );
                m.cost += m.i_ref_cost;

                if( m.cost < lX->me16x8[i].cost )
                    h->mc.memcpy_aligned( &lX->me16x8[i], &m, sizeof(x264_me_t) );
            }
        }

        /* BI mode */
        src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me16x8[i].p_fref, a->l0.me16x8[i].i_stride[0],
                                a->l0.me16x8[i].mv[0], a->l0.me16x8[i].mv[1], 16, 8, x264_weight_none );
        src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me16x8[i].p_fref, a->l1.me16x8[i].i_stride[0],
                                a->l1.me16x8[i].mv[0], a->l1.me16x8[i].mv[1], 16, 8, x264_weight_none );
        h->mc.avg[PIXEL_16x8]( pix[0], 16, src[0], stride[0], src[1], stride[1],
                                h->mb.bipred_weight[a->l0.me16x8[i].i_ref][a->l1.me16x8[i].i_ref] );

        i_part_cost_bi = h->pixf.mbcmp[PIXEL_16x8]( a->l0.me16x8[i].p_fenc[0], FENC_STRIDE, pix[0], 16 )
                        + a->l0.me16x8[i].cost_mv + a->l1.me16x8[i].cost_mv + a->l0.me16x8[i].i_ref_cost
                        + a->l1.me16x8[i].i_ref_cost;

        if( h->mb.b_chroma_me )
            i_part_cost_bi += x264_analyse_bi_chroma( h, a, i, PIXEL_16x8 );

        i_part_cost = a->l0.me16x8[i].cost;
        a->i_mb_partition16x8[i] = D_L0_8x8; /* not actually 8x8, only the L0 matters */

        if( a->l1.me16x8[i].cost < i_part_cost )
        {
            i_part_cost = a->l1.me16x8[i].cost;
            a->i_mb_partition16x8[i] = D_L1_8x8;
        }
        if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
        {
            i_part_cost = i_part_cost_bi;
            a->i_mb_partition16x8[i] = D_BI_8x8;
        }
        a->i_cost16x8bi += i_part_cost;

        /* Early termination based on the current SATD score of partition[0]
           plus the estimated SATD score of partition[1]; the threshold gains
           a 1/16 margin each for macroblock RD and psy-RD. */
        if( a->b_early_terminate && (!i && i_part_cost + a->i_cost_est16x8[1] > i_best_satd
            * (16 + (!!a->i_mbrd + !!h->mb.i_psy_rd))/16) )
        {
            a->i_cost16x8bi = COST_MAX;
            return;
        }

        x264_mb_cache_mv_b16x8( h, a, i, 0 );
    }

    /* mb type cost */
    a->i_mb_type16x8 = B_L0_L0
        + (a->i_mb_partition16x8[0]>>2) * 3
        + (a->i_mb_partition16x8[1]>>2);
    a->i_cost16x8bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type16x8];
}

static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
{
    ALIGNED_ARRAY_16( pixel, pix,[2],[8*16] );
    ALIGNED_4( int16_t mvc[3][2] );

    h->mb.i_partition = D_8x16;
    a->i_cost8x16bi = 0;

    for( int i = 0; i < 2; i++ )
    {
        int i_part_cost;
        int i_part_cost_bi = 0;
        int stride[2] = {8,8};
        pixel *src[2];
        x264_me_t m;
        m.i_pixel = PIXEL_8x16;
        LOAD_FENC( &m, h->mb.pic.p_fenc, 8*i, 0 );

        for( int l = 0; l < 2; l++ )
        {
            x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
            int ref8[2] = { lX->me8x8[i].i_ref, lX->me8x8[i+2].i_ref };
            int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
            lX->me8x16[i].cost = INT_MAX;
            for( int j = 0; j < i_ref8s; j++ )
            {
                int i_ref = ref8[j];
                m.i_ref_cost = REF_COST( l, i_ref );

                LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 8*i, 0 );

                CP32( mvc[0], lX->mvc[i_ref][0] );
                CP32( mvc[1], lX->mvc[i_ref][i+1] );
                CP32( mvc[2], lX->mvc[i_ref][i+3] );

                x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, l, i_ref );
                x264_mb_predict_mv( h, l, 4*i, 2, m.mvp );
                x264_me_search( h, &m, mvc, 3 );
                m.cost += m.i_ref_cost;

                if( m.cost < lX->me8x16[i].cost )
                    h->mc.memcpy_aligned( &lX->me8x16[i], &m, sizeof(x264_me_t) );
            }
        }

        /* BI mode */
        src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me8x16[i].p_fref, a->l0.me8x16[i].i_stride[0],
                                a->l0.me8x16[i].mv[0], a->l0.me8x16[i].mv[1], 8, 16, x264_weight_none );
        src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me8x16[i].p_fref, a->l1.me8x16[i].i_stride[0],
                                a->l1.me8x16[i].mv[0], a->l1.me8x16[i].mv[1], 8, 16, x264_weight_none );
        h->mc.avg[PIXEL_8x16]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.me8x16[i].i_ref][a->l1.me8x16[i].i_ref] );

        i_part_cost_bi = h->pixf.mbcmp[PIXEL_8x16]( a->l0.me8x16[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
                        + a->l0.me8x16[i].cost_mv + a->l1.me8x16[i].cost_mv + a->l0.me8x16[i].i_ref_cost
                        + a->l1.me8x16[i].i_ref_cost;

        if( h->mb.b_chroma_me )
            i_part_cost_bi += x264_analyse_bi_chroma( h, a, i, PIXEL_8x16 );

        i_part_cost = a->l0.me8x16[i].cost;
        a->i_mb_partition8x16[i] = D_L0_8x8;

        if( a->l1.me8x16[i].cost < i_part_cost )
        {
            i_part_cost = a->l1.me8x16[i].cost;
            a->i_mb_partition8x16[i] = D_L1_8x8;
        }
        if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
        {
            i_part_cost = i_part_cost_bi;
            a->i_mb_partition8x16[i] = D_BI_8x8;
        }
        a->i_cost8x16bi += i_part_cost;

        /* Early termination based on the current SATD score of partition[0]
           plus the estimated SATD score of partition[1]; the threshold gains
           a 1/16 margin each for macroblock RD and psy-RD. */
        if( a->b_early_terminate && (!i && i_part_cost + a->i_cost_est8x16[1] > i_best_satd
            * (16 + (!!a->i_mbrd + !!h->mb.i_psy_rd))/16) )
        {
            a->i_cost8x16bi = COST_MAX;
            return;
        }

        x264_mb_cache_mv_b8x16( h, a, i, 0 );
    }

    /* mb type cost */
    a->i_mb_type8x16 = B_L0_L0
        + (a->i_mb_partition8x16[0]>>2) * 3
        + (a->i_mb_partition8x16[1]>>2);
    a->i_cost8x16bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type8x16];
}

static void x264_mb_analyse_p_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd )
{
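    /* Only spend RD calls on partition shapes whose SATD cost came within
     * 25% of the best SATD score; without early termination, test them all. */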
    int thresh = a->b_early_terminate ? i_satd * 5/4 + 1 : COST_MAX;

    h->mb.i_type = P_L0;
    if( a->l0.i_rd16x16 == COST_MAX && (!a->b_early_terminate || a->l0.me16x16.cost <= i_satd * 3/2) )
    {
        h->mb.i_partition = D_16x16;
        x264_analyse_update_cache( h, a );
        a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
    }

    if( a->l0.i_cost16x8 < thresh )
    {
        h->mb.i_partition = D_16x8;
        x264_analyse_update_cache( h, a );
        a->l0.i_cost16x8 = x264_rd_cost_mb( h, a->i_lambda2 );
    }
    else
        a->l0.i_cost16x8 = COST_MAX;

    if( a->l0.i_cost8x16 < thresh )
    {
        h->mb.i_partition = D_8x16;
        x264_analyse_update_cache( h, a );
        a->l0.i_cost8x16 = x264_rd_cost_mb( h, a->i_lambda2 );
    }
    else
        a->l0.i_cost8x16 = COST_MAX;

    if( a->l0.i_cost8x8 < thresh )
    {
        h->mb.i_type = P_8x8;
        h->mb.i_partition = D_8x8;
        if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
        {
            x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
            x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
            x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
            x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
            /* FIXME: In the 8x8 blocks where RDO isn't run, the NNZ values used for context selection
             * for future blocks are those left over from previous RDO calls. */
            for( int i = 0; i < 4; i++ )
            {
                int costs[4] = {a->l0.i_cost4x4[i], a->l0.i_cost8x4[i], a->l0.i_cost4x8[i], a->l0.me8x8[i].cost};
                int sub8x8_thresh = a->b_early_terminate ? X264_MIN4( costs[0], costs[1], costs[2], costs[3] ) * 5 / 4 : COST_MAX;
                int subtype, btype = D_L0_8x8;
                uint64_t bcost = COST_MAX64;
                for( subtype = D_L0_4x4; subtype <= D_L0_8x8; subtype++ )
                {
                    uint64_t cost;
                    if( costs[subtype] > sub8x8_thresh )
                        continue;
                    h->mb.i_sub_partition[i] = subtype;
                    x264_mb_cache_mv_p8x8( h, a, i );
                    if( subtype == btype )
                        continue;
                    cost = x264_rd_cost_part( h, a->i_lambda2, i<<2, PIXEL_8x8 );
                    COPY2_IF_LT( bcost, cost, btype, subtype );
                }
                if( h->mb.i_sub_partition[i] != btype )
                {
                    h->mb.i_sub_partition[i] = btype;
                    x264_mb_cache_mv_p8x8( h, a, i );
                }
            }
        }
        else
            x264_analyse_update_cache( h, a );
        a->l0.i_cost8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
    }
    else
        a->l0.i_cost8x8 = COST_MAX;
}

static void x264_mb_analyse_b_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
{
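    /* Only rerun RD on modes within 17/16 of the best inter SATD score
     * (18/16 with psy-RD, whose cost curves are noisier). */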
    int thresh = a->b_early_terminate ? i_satd_inter * (17 + (!!h->mb.i_psy_rd))/16 + 1 : COST_MAX;

    if( a->b_direct_available && a->i_rd16x16direct == COST_MAX )
    {
        h->mb.i_type = B_DIRECT;
        /* Assumes direct/skip MC is still in fdec */
        /* Requires b-rdo to be done before intra analysis */
        h->mb.b_skip_mc = 1;
        x264_analyse_update_cache( h, a );
        a->i_rd16x16direct = x264_rd_cost_mb( h, a->i_lambda2 );
        h->mb.b_skip_mc = 0;
    }

    //FIXME not all the update_cache calls are needed
    h->mb.i_partition = D_16x16;
    /* L0 */
    if( a->l0.me16x16.cost < thresh && a->l0.i_rd16x16 == COST_MAX )
    {
        h->mb.i_type = B_L0_L0;
        x264_analyse_update_cache( h, a );
        a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
    }

    /* L1 */
    if( a->l1.me16x16.cost < thresh && a->l1.i_rd16x16 == COST_MAX )
    {
        h->mb.i_type = B_L1_L1;
        x264_analyse_update_cache( h, a );
        a->l1.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
    }

    /* BI */
    if( a->i_cost16x16bi < thresh && a->i_rd16x16bi == COST_MAX )
    {
        h->mb.i_type = B_BI_BI;
        x264_analyse_update_cache( h, a );
        a->i_rd16x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
    }

    /* 8x8 */
    if( a->i_cost8x8bi < thresh && a->i_rd8x8bi == COST_MAX )
    {
        h->mb.i_type = B_8x8;
        h->mb.i_partition = D_8x8;
        x264_analyse_update_cache( h, a );
        a->i_rd8x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
        x264_macroblock_cache_skip( h, 0, 0, 4, 4, 0 );
    }

    /* 16x8 */
    if( a->i_cost16x8bi < thresh && a->i_rd16x8bi == COST_MAX )
    {
        h->mb.i_type = a->i_mb_type16x8;
        h->mb.i_partition = D_16x8;
        x264_analyse_update_cache( h, a );
        a->i_rd16x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
    }

    /* 8x16 */
    if( a->i_cost8x16bi < thresh && a->i_rd8x16bi == COST_MAX )
    {
        h->mb.i_type = a->i_mb_type8x16;
        h->mb.i_partition = D_8x16;
        x264_analyse_update_cache( h, a );
        a->i_rd8x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
    }
}

static void x264_refine_bidir( x264_t *h, x264_mb_analysis_t *a )
{
    int i_biweight;

    if( IS_INTRA(h->mb.i_type) )
        return;

    switch( h->mb.i_partition )
    {
        case D_16x16:
            if( h->mb.i_type == B_BI_BI )
            {
                i_biweight = h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref];
                x264_me_refine_bidir_satd( h, &a->l0.bi16x16, &a->l1.bi16x16, i_biweight );
            }
            break;
        case D_16x8:
            for( int i = 0; i < 2; i++ )
                if( a->i_mb_partition16x8[i] == D_BI_8x8 )
                {
                    i_biweight = h->mb.bipred_weight[a->l0.me16x8[i].i_ref][a->l1.me16x8[i].i_ref];
                    x264_me_refine_bidir_satd( h, &a->l0.me16x8[i], &a->l1.me16x8[i], i_biweight );
                }
            break;
        case D_8x16:
            for( int i = 0; i < 2; i++ )
                if( a->i_mb_partition8x16[i] == D_BI_8x8 )
                {
                    i_biweight = h->mb.bipred_weight[a->l0.me8x16[i].i_ref][a->l1.me8x16[i].i_ref];
                    x264_me_refine_bidir_satd( h, &a->l0.me8x16[i], &a->l1.me8x16[i], i_biweight );
                }
            break;
        case D_8x8:
            for( int i = 0; i < 4; i++ )
                if( h->mb.i_sub_partition[i] == D_BI_8x8 )
                {
                    i_biweight = h->mb.bipred_weight[a->l0.me8x8[i].i_ref][a->l1.me8x8[i].i_ref];
                    x264_me_refine_bidir_satd( h, &a->l0.me8x8[i], &a->l1.me8x8[i], i_biweight );
                }
            break;
    }
}

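/* Pick between the 8x8 and 4x4 transform by comparing SA8D against SATD of
 * the motion-compensated prediction; no rate-distortion cost is used here. */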
static inline void x264_mb_analyse_transform( x264_t *h )
{
    if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 && !h->mb.b_lossless )
    {
        /* Only luma MC is really needed for 4:2:0, but the full MC is re-used in macroblock_encode. */
        x264_mb_mc( h );

        int plane_count = CHROMA444 && h->mb.b_chroma_me ? 3 : 1;
        int i_cost8 = 0, i_cost4 = 0;
        for( int p = 0; p < plane_count; p++ )
        {
            i_cost8 += h->pixf.sa8d[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE,
                                                  h->mb.pic.p_fdec[p], FDEC_STRIDE );
            i_cost4 += h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE,
                                                  h->mb.pic.p_fdec[p], FDEC_STRIDE );
        }

        h->mb.b_transform_8x8 = i_cost8 < i_cost4;
        h->mb.b_skip_mc = 1;
    }
}

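/* Re-test the transform size decision with a full RD cost: flip the
 * transform, keep the flip if it lowers the RD score (rescaling *i_satd to
 * match), otherwise flip back. */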
static inline void x264_mb_analyse_transform_rd( x264_t *h, x264_mb_analysis_t *a, int *i_satd, int *i_rd )
{
    if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 )
    {
        x264_analyse_update_cache( h, a );
        h->mb.b_transform_8x8 ^= 1;
        /* FIXME only luma is needed for 4:2:0, but the score for comparison already includes chroma */
        int i_rd8 = x264_rd_cost_mb( h, a->i_lambda2 );

        if( *i_rd >= i_rd8 )
        {
            if( *i_rd > 0 )
                *i_satd = (int64_t)(*i_satd) * i_rd8 / *i_rd;
            *i_rd = i_rd8;
        }
        else
            h->mb.b_transform_8x8 ^= 1;
    }
}

/* Rate-distortion optimal QP selection.
 * FIXME: More than half of the benefit of this function seems to be
 * in the way it improves the coding of chroma DC (by decimating or
 * finding a better way to code a single DC coefficient.)
 * There must be a more efficient way to get that portion of the benefit
 * without doing full QP-RD, but RD-decimation doesn't seem to do the
 * trick. */
static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
{
    int bcost, cost, failures, prevcost, origcost;
    int orig_qp = h->mb.i_qp, bqp = h->mb.i_qp;
    int last_qp_tried = 0;
    origcost = bcost = x264_rd_cost_mb( h, a->i_lambda2 );
    int origcbp = h->mb.cbp[h->mb.i_mb_xy];

    /* If CBP is already zero, don't raise the quantizer any higher. */
    for( int direction = origcbp ? 1 : -1; direction >= -1; direction-=2 )
    {
        /* Without psy-RD, require monotonicity when moving quant away from previous
         * macroblock's quant; allow 1 failure when moving quant towards previous quant.
         * With psy-RD, allow 1 failure when moving quant away from previous quant,
         * allow 2 failures when moving quant towards previous quant.
         * Psy-RD generally seems to result in more chaotic RD score-vs-quantizer curves. */
        int threshold = (!!h->mb.i_psy_rd);
        /* Raise the threshold for failures if we're moving towards the last QP. */
        if( ( h->mb.i_last_qp < orig_qp && direction == -1 ) ||
            ( h->mb.i_last_qp > orig_qp && direction ==  1 ) )
            threshold++;
        h->mb.i_qp = orig_qp;
        failures = 0;
        prevcost = origcost;

        /* If the current QP results in an empty CBP, it's highly likely that lower QPs
         * (up to a point) will too.  So, jump down to where the threshold will kick in
         * and check the QP there.  If the CBP is still empty, skip the main loop.
         * If it isn't empty, we would have ended up having to check this QP anyways,
         * so as long as we store it for later lookup, we lose nothing. */
        int already_checked_qp = -1;
        int already_checked_cost = COST_MAX;
        if( direction == -1 )
        {
            if( !origcbp )
            {
                h->mb.i_qp = X264_MAX( h->mb.i_qp - threshold - 1, h->param.rc.i_qp_min );
                h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
                already_checked_cost = x264_rd_cost_mb( h, a->i_lambda2 );
                if( !h->mb.cbp[h->mb.i_mb_xy] )
                {
                    /* If our empty-CBP block is lower QP than the last QP,
                     * the last QP almost surely doesn't have a CBP either. */
                    if( h->mb.i_last_qp > h->mb.i_qp )
                        last_qp_tried = 1;
                    break;
                }
                already_checked_qp = h->mb.i_qp;
                h->mb.i_qp = orig_qp;
            }
        }

        h->mb.i_qp += direction;
        while( h->mb.i_qp >= h->param.rc.i_qp_min && h->mb.i_qp <= SPEC_QP( h->param.rc.i_qp_max ) )
        {
            if( h->mb.i_last_qp == h->mb.i_qp )
                last_qp_tried = 1;
            if( h->mb.i_qp == already_checked_qp )
                cost = already_checked_cost;
            else
            {
                h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
                cost = x264_rd_cost_mb( h, a->i_lambda2 );
                COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
            }

            /* We can't assume that the costs are monotonic over QPs.
             * Tie case-as-failure seems to give better results. */
            if( cost < prevcost )
                failures = 0;
            else
                failures++;
            prevcost = cost;

            if( failures > threshold )
                break;
            if( direction == 1 && !h->mb.cbp[h->mb.i_mb_xy] )
                break;
            h->mb.i_qp += direction;
        }
    }

    /* Always try the last block's QP. */
    if( !last_qp_tried )
    {
        h->mb.i_qp = h->mb.i_last_qp;
        h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
        cost = x264_rd_cost_mb( h, a->i_lambda2 );
        COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
    }

    h->mb.i_qp = bqp;
    h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];

    /* Check transform again; decision from before may no longer be optimal. */
    if( h->mb.i_qp != orig_qp && h->param.analyse.b_transform_8x8 &&
        x264_mb_transform_8x8_allowed( h ) )
    {
        h->mb.b_transform_8x8 ^= 1;
        cost = x264_rd_cost_mb( h, a->i_lambda2 );
        if( cost > bcost )
            h->mb.b_transform_8x8 ^= 1;
    }
}

/*****************************************************************************
 * x264_macroblock_analyse: decide the type, partitioning, references and
 * motion vectors of the current macroblock.
 *****************************************************************************/
void x264_macroblock_analyse( x264_t *h )
{
    x264_mb_analysis_t analysis;
    int i_cost = COST_MAX;

    h->mb.i_qp = x264_ratecontrol_mb_qp( h );
    /* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB,
     * to lower the bit cost of the qp_delta.  Don't do this if QPRD is enabled. */
    if( h->param.rc.i_aq_mode && h->param.analyse.i_subpel_refine < 10 && abs(h->mb.i_qp - h->mb.i_last_qp) == 1 )
        h->mb.i_qp = h->mb.i_last_qp;

    x264_mb_analyse_init( h, &analysis, h->mb.i_qp );

    /*--------------------------- Do the analysis ---------------------------*/
    if( h->sh.i_type == SLICE_TYPE_I )
    {
intra_analysis:
        if( analysis.i_mbrd )
            x264_mb_init_fenc_cache( h, analysis.i_mbrd >= 2 );
        x264_mb_analyse_intra( h, &analysis, COST_MAX );
        if( analysis.i_mbrd )
            x264_intra_rd( h, &analysis, COST_MAX );

        i_cost = analysis.i_satd_i16x16;
        h->mb.i_type = I_16x16;
        COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, h->mb.i_type, I_4x4 );
        COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, h->mb.i_type, I_8x8 );
        if( analysis.i_satd_pcm < i_cost )
            h->mb.i_type = I_PCM;
        else if( analysis.i_mbrd >= 2 )
            x264_intra_rd_refine( h, &analysis );
    }
    else if( h->sh.i_type == SLICE_TYPE_P )
    {
        int b_skip = 0;

        h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 0 );

        analysis.b_try_skip = 0;
        if( analysis.b_force_intra )
        {
            if( !h->param.analyse.b_psy )
            {
                x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
                goto intra_analysis;
            }
        }
        else
        {
            int skip_invalid = h->i_thread_frames > 1 && h->mb.cache.pskip_mv[1] > h->mb.mv_max_spel[1];
            /* If the current macroblock is off the frame, just skip it. */
            if( HAVE_INTERLACED && !MB_INTERLACED && h->mb.i_mb_y * 16 >= h->param.i_height && !skip_invalid )
                b_skip = 1;
            /* Fast P_SKIP detection */
            else if( h->param.analyse.b_fast_pskip )
            {
                if( skip_invalid )
                    // FIXME don't need to check this if the reference frame is done
                    {}
                else if( h->param.analyse.i_subpel_refine >= 3 )
                    analysis.b_try_skip = 1;
                else if( h->mb.i_mb_type_left[0] == P_SKIP ||
                         h->mb.i_mb_type_top == P_SKIP ||
                         h->mb.i_mb_type_topleft == P_SKIP ||
                         h->mb.i_mb_type_topright == P_SKIP )
                    b_skip = x264_macroblock_probe_pskip( h );
            }
        }

        h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 1 );

        if( b_skip )
        {
            h->mb.i_type = P_SKIP;
            h->mb.i_partition = D_16x16;
            assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
            /* Set up MVs for future predictors */
            for( int i = 0; i < h->mb.pic.i_fref[0]; i++ )
                M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
        }
        else
        {
            const unsigned int flags = h->param.analyse.inter;
            int i_type;
            int i_partition;
            int i_satd_inter, i_satd_intra;

            x264_mb_analyse_load_costs( h, &analysis );

            x264_mb_analyse_inter_p16x16( h, &analysis );

            if( h->mb.i_type == P_SKIP )
            {
                for( int i = 1; i < h->mb.pic.i_fref[0]; i++ )
                    M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
                return;
            }

            if( flags & X264_ANALYSE_PSUB16x16 )
            {
                if( h->param.analyse.b_mixed_references )
                    x264_mb_analyse_inter_p8x8_mixed_ref( h, &analysis );
                else
                    x264_mb_analyse_inter_p8x8( h, &analysis );
            }

            /* Select best inter mode */
            i_type = P_L0;
            i_partition = D_16x16;
            i_cost = analysis.l0.me16x16.cost;

            if( ( flags & X264_ANALYSE_PSUB16x16 ) && (!analysis.b_early_terminate ||
                analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost) )
            {
                i_type = P_8x8;
                i_partition = D_8x8;
                i_cost = analysis.l0.i_cost8x8;

                /* Do sub 8x8 */
                if( flags & X264_ANALYSE_PSUB8x8 )
                {
                    for( int i = 0; i < 4; i++ )
                    {
                        x264_mb_analyse_inter_p4x4( h, &analysis, i );
                        int i_thresh8x4 = analysis.l0.me4x4[i][1].cost_mv + analysis.l0.me4x4[i][2].cost_mv;
                        if( !analysis.b_early_terminate || analysis.l0.i_cost4x4[i] < analysis.l0.me8x8[i].cost + i_thresh8x4 )
                        {
                            int i_cost8x8 = analysis.l0.i_cost4x4[i];
                            h->mb.i_sub_partition[i] = D_L0_4x4;

                            x264_mb_analyse_inter_p8x4( h, &analysis, i );
                            COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost8x4[i],
                                         h->mb.i_sub_partition[i], D_L0_8x4 );

                            x264_mb_analyse_inter_p4x8( h, &analysis, i );
                            COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost4x8[i],
                                         h->mb.i_sub_partition[i], D_L0_4x8 );

                            i_cost += i_cost8x8 - analysis.l0.me8x8[i].cost;
                        }
                        x264_mb_cache_mv_p8x8( h, &analysis, i );
                    }
                    analysis.l0.i_cost8x8 = i_cost;
                }
            }

            /* Now do 16x8/8x16 */
            int i_thresh16x8 = analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[2].cost_mv;
            if( ( flags & X264_ANALYSE_PSUB16x16 ) && (!analysis.b_early_terminate ||
                analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost + i_thresh16x8) )
            {
                int i_avg_mv_ref_cost = (analysis.l0.me8x8[2].cost_mv + analysis.l0.me8x8[2].i_ref_cost
                                      + analysis.l0.me8x8[3].cost_mv + analysis.l0.me8x8[3].i_ref_cost + 1) >> 1;
                analysis.i_cost_est16x8[1] = analysis.i_satd8x8[0][2] + analysis.i_satd8x8[0][3] + i_avg_mv_ref_cost;

                x264_mb_analyse_inter_p16x8( h, &analysis, i_cost );
                COPY3_IF_LT( i_cost, analysis.l0.i_cost16x8, i_type, P_L0, i_partition, D_16x8 );

                i_avg_mv_ref_cost = (analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[1].i_ref_cost
                                  + analysis.l0.me8x8[3].cost_mv + analysis.l0.me8x8[3].i_ref_cost + 1) >> 1;
                analysis.i_cost_est8x16[1] = analysis.i_satd8x8[0][1] + analysis.i_satd8x8[0][3] + i_avg_mv_ref_cost;

                x264_mb_analyse_inter_p8x16( h, &analysis, i_cost );
                COPY3_IF_LT( i_cost, analysis.l0.i_cost8x16, i_type, P_L0, i_partition, D_8x16 );
            }

            h->mb.i_partition = i_partition;

            /* refine qpel */
            //FIXME mb_type costs?
            if( analysis.i_mbrd || !h->mb.i_subpel_refine )
            {
                /* refine later */
            }
            else if( i_partition == D_16x16 )
            {
                x264_me_refine_qpel( h, &analysis.l0.me16x16 );
                i_cost = analysis.l0.me16x16.cost;
            }
            else if( i_partition == D_16x8 )
            {
                x264_me_refine_qpel( h, &analysis.l0.me16x8[0] );
                x264_me_refine_qpel( h, &analysis.l0.me16x8[1] );
                i_cost = analysis.l0.me16x8[0].cost + analysis.l0.me16x8[1].cost;
            }
            else if( i_partition == D_8x16 )
            {
                x264_me_refine_qpel( h, &analysis.l0.me8x16[0] );
                x264_me_refine_qpel( h, &analysis.l0.me8x16[1] );
                i_cost = analysis.l0.me8x16[0].cost + analysis.l0.me8x16[1].cost;
            }
            else if( i_partition == D_8x8 )
            {
                i_cost = 0;
                for( int i8x8 = 0; i8x8 < 4; i8x8++ )
                {
                    switch( h->mb.i_sub_partition[i8x8] )
                    {
                        case D_L0_8x8:
                            x264_me_refine_qpel( h, &analysis.l0.me8x8[i8x8] );
                            i_cost += analysis.l0.me8x8[i8x8].cost;
                            break;
                        case D_L0_8x4:
                            x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][0] );
                            x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][1] );
                            i_cost += analysis.l0.me8x4[i8x8][0].cost +
                                      analysis.l0.me8x4[i8x8][1].cost;
                            break;
                        case D_L0_4x8:
                            x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][0] );
                            x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][1] );
                            i_cost += analysis.l0.me4x8[i8x8][0].cost +
                                      analysis.l0.me4x8[i8x8][1].cost;
                            break;
                        case D_L0_4x4:
                            x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][0] );
                            x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][1] );
                            x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][2] );
                            x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][3] );
                            i_cost += analysis.l0.me4x4[i8x8][0].cost +
                                      analysis.l0.me4x4[i8x8][1].cost +
                                      analysis.l0.me4x4[i8x8][2].cost +
                                      analysis.l0.me4x4[i8x8][3].cost;
                            break;
                        default:
                            x264_log( h, X264_LOG_ERROR, "internal error (!8x8 && !4x4)\n" );
                            break;
                    }
                }
            }

            if( h->mb.b_chroma_me )
            {
                if( CHROMA444 )
                {
                    x264_mb_analyse_intra( h, &analysis, i_cost );
                    x264_mb_analyse_intra_chroma( h, &analysis );
                }
                else
                {
                    x264_mb_analyse_intra_chroma( h, &analysis );
                    x264_mb_analyse_intra( h, &analysis, i_cost - analysis.i_satd_chroma );
                }
                analysis.i_satd_i16x16 += analysis.i_satd_chroma;
                analysis.i_satd_i8x8   += analysis.i_satd_chroma;
                analysis.i_satd_i4x4   += analysis.i_satd_chroma;
            }
            else
                x264_mb_analyse_intra( h, &analysis, i_cost );

            i_satd_inter = i_cost;
            i_satd_intra = X264_MIN3( analysis.i_satd_i16x16,
                                      analysis.i_satd_i8x8,
                                      analysis.i_satd_i4x4 );

            if( analysis.i_mbrd )
            {
                x264_mb_analyse_p_rd( h, &analysis, X264_MIN(i_satd_inter, i_satd_intra) );
                i_type = P_L0;
                i_partition = D_16x16;
                i_cost = analysis.l0.i_rd16x16;
                COPY2_IF_LT( i_cost, analysis.l0.i_cost16x8, i_partition, D_16x8 );
                COPY2_IF_LT( i_cost, analysis.l0.i_cost8x16, i_partition, D_8x16 );
                COPY3_IF_LT( i_cost, analysis.l0.i_cost8x8, i_partition, D_8x8, i_type, P_8x8 );
                h->mb.i_type = i_type;
                h->mb.i_partition = i_partition;
                if( i_cost < COST_MAX )
                    x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
                x264_intra_rd( h, &analysis, i_satd_inter * 5/4 + 1 );
            }

            COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
            COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
            COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
            COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );

            h->mb.i_type = i_type;

            if( analysis.b_force_intra && !IS_INTRA(i_type) )
            {
                /* Intra masking: copy fdec to fenc and re-encode the block as intra in order to make it appear as if
                 * it was an inter block. */
                x264_analyse_update_cache( h, &analysis );
                x264_macroblock_encode( h );
                for( int p = 0; p < (CHROMA444 ? 3 : 1); p++ )
                    h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE, h->mb.pic.p_fdec[p], FDEC_STRIDE, 16 );
                if( !CHROMA444 )
                {
                    int height = 16 >> CHROMA_V_SHIFT;
                    h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, height );
                    h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, height );
                }
                x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
                goto intra_analysis;
            }

            if( analysis.i_mbrd >= 2 && h->mb.i_type != I_PCM )
            {
                if( IS_INTRA( h->mb.i_type ) )
                {
                    x264_intra_rd_refine( h, &analysis );
                }
                else if( i_partition == D_16x16 )
                {
                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.i_ref );
                    analysis.l0.me16x16.cost = i_cost;
                    x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
                }
                else if( i_partition == D_16x8 )
                {
                    h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
                    h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
                    x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, analysis.l0.me16x8[0].i_ref );
                    x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, analysis.l0.me16x8[1].i_ref );
                    x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[0], analysis.i_lambda2, 0, 0 );
                    x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[1], analysis.i_lambda2, 8, 0 );
                }
                else if( i_partition == D_8x16 )
                {
                    h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
                    h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
                    x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, analysis.l0.me8x16[0].i_ref );
                    x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, analysis.l0.me8x16[1].i_ref );
                    x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[0], analysis.i_lambda2, 0, 0 );
                    x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[1], analysis.i_lambda2, 4, 0 );
                }
                else if( i_partition == D_8x8 )
                {
                    x264_analyse_update_cache( h, &analysis );
                    for( int i8x8 = 0; i8x8 < 4; i8x8++ )
                    {
                        if( h->mb.i_sub_partition[i8x8] == D_L0_8x8 )
                        {
                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i8x8], analysis.i_lambda2, i8x8*4, 0 );
                        }
                        else if( h->mb.i_sub_partition[i8x8] == D_L0_8x4 )
                        {
                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][1], analysis.i_lambda2, i8x8*4+2, 0 );
                        }
                        else if( h->mb.i_sub_partition[i8x8] == D_L0_4x8 )
                        {
                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
                        }
                        else if( h->mb.i_sub_partition[i8x8] == D_L0_4x4 )
                        {
                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][2], analysis.i_lambda2, i8x8*4+2, 0 );
                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][3], analysis.i_lambda2, i8x8*4+3, 0 );
                        }
                    }
                }
            }
        }
    }
    else if( h->sh.i_type == SLICE_TYPE_B )
    {
        int i_bskip_cost = COST_MAX;
        int b_skip = 0;

        if( analysis.i_mbrd )
            x264_mb_init_fenc_cache( h, analysis.i_mbrd >= 2 );

        h->mb.i_type = B_SKIP;
        if( h->mb.b_direct_auto_write )
        {
            /* direct=auto heuristic: prefer whichever mode allows more Skip macroblocks */
            for( int i = 0; i < 2; i++ )
            {
                int b_changed = 1;
                h->sh.b_direct_spatial_mv_pred ^= 1;
                analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, i && analysis.b_direct_available ? &b_changed : NULL );
                if( analysis.b_direct_available )
                {
                    if( b_changed )
                    {
                        x264_mb_mc( h );
                        b_skip = x264_macroblock_probe_bskip( h );
                    }
                    h->stat.frame.i_direct_score[ h->sh.b_direct_spatial_mv_pred ] += b_skip;
                }
                else
                    b_skip = 0;
            }
        }
        else
            analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, NULL );

        analysis.b_try_skip = 0;
        if( analysis.b_direct_available )
        {
            if( !h->mb.b_direct_auto_write )
                x264_mb_mc( h );
            /* If the current macroblock is off the frame, just skip it. */
            if( HAVE_INTERLACED && !MB_INTERLACED && h->mb.i_mb_y * 16 >= h->param.i_height )
                b_skip = 1;
            else if( analysis.i_mbrd )
            {
                i_bskip_cost = ssd_mb( h );
                /* 6 = minimum cavlc cost of a non-skipped MB */
                b_skip = h->mb.b_skip_mc = i_bskip_cost <= ((6 * analysis.i_lambda2 + 128) >> 8);
            }
            else if( !h->mb.b_direct_auto_write )
            {
                /* Conditioning the probe on neighboring block types
                 * doesn't seem to help speed or quality. */
                analysis.b_try_skip = x264_macroblock_probe_bskip( h );
                if( h->param.analyse.i_subpel_refine < 3 )
                    b_skip = analysis.b_try_skip;
            }
            /* Set up MVs for future predictors */
            if( b_skip )
            {
                for( int i = 0; i < h->mb.pic.i_fref[0]; i++ )
                    M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
                for( int i = 0; i < h->mb.pic.i_fref[1]; i++ )
                    M32( h->mb.mvr[1][i][h->mb.i_mb_xy] ) = 0;
            }
        }

        if( !b_skip )
        {
            const unsigned int flags = h->param.analyse.inter;
            int i_type;
            int i_partition;
            int i_satd_inter;
            h->mb.b_skip_mc = 0;
            h->mb.i_type = B_DIRECT;

            x264_mb_analyse_load_costs( h, &analysis );

            /* select best inter mode */
            /* direct must be first */
            if( analysis.b_direct_available )
                x264_mb_analyse_inter_direct( h, &analysis );

            x264_mb_analyse_inter_b16x16( h, &analysis );

            if( h->mb.i_type == B_SKIP )
            {
                for( int i = 1; i < h->mb.pic.i_fref[0]; i++ )
                    M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
                for( int i = 1; i < h->mb.pic.i_fref[1]; i++ )
                    M32( h->mb.mvr[1][i][h->mb.i_mb_xy] ) = 0;
                return;
            }

            i_type = B_L0_L0;
            i_partition = D_16x16;
            i_cost = analysis.l0.me16x16.cost;
            COPY2_IF_LT( i_cost, analysis.l1.me16x16.cost, i_type, B_L1_L1 );
            COPY2_IF_LT( i_cost, analysis.i_cost16x16bi, i_type, B_BI_BI );
            COPY2_IF_LT( i_cost, analysis.i_cost16x16direct, i_type, B_DIRECT );

            if( analysis.i_mbrd && analysis.b_early_terminate && analysis.i_cost16x16direct <= i_cost * 33/32 )
            {
                x264_mb_analyse_b_rd( h, &analysis, i_cost );
                if( i_bskip_cost < analysis.i_rd16x16direct &&
                    i_bskip_cost < analysis.i_rd16x16bi &&
                    i_bskip_cost < analysis.l0.i_rd16x16 &&
                    i_bskip_cost < analysis.l1.i_rd16x16 )
                {
                    h->mb.i_type = B_SKIP;
                    x264_analyse_update_cache( h, &analysis );
                    return;
                }
            }

            if( flags & X264_ANALYSE_BSUB16x16 )
            {
                if( h->param.analyse.b_mixed_references )
                    x264_mb_analyse_inter_b8x8_mixed_ref( h, &analysis );
                else
                    x264_mb_analyse_inter_b8x8( h, &analysis );

                COPY3_IF_LT( i_cost, analysis.i_cost8x8bi, i_type, B_8x8, i_partition, D_8x8 );

                /* Try to estimate the cost of b16x8/b8x16 based on the satd scores of the b8x8 modes */
                int i_cost_est16x8bi_total = 0, i_cost_est8x16bi_total = 0;
                int i_mb_type, i_partition16x8[2], i_partition8x16[2];
                for( int i = 0; i < 2; i++ )
                {
                    int avg_l0_mv_ref_cost, avg_l1_mv_ref_cost;
                    int i_l0_satd, i_l1_satd, i_bi_satd, i_best_cost;
                    // 16x8
                    i_best_cost = COST_MAX;
                    i_l0_satd = analysis.i_satd8x8[0][i*2] + analysis.i_satd8x8[0][i*2+1];
                    i_l1_satd = analysis.i_satd8x8[1][i*2] + analysis.i_satd8x8[1][i*2+1];
                    i_bi_satd = analysis.i_satd8x8[2][i*2] + analysis.i_satd8x8[2][i*2+1];
                    avg_l0_mv_ref_cost = ( analysis.l0.me8x8[i*2].cost_mv + analysis.l0.me8x8[i*2].i_ref_cost
                                         + analysis.l0.me8x8[i*2+1].cost_mv + analysis.l0.me8x8[i*2+1].i_ref_cost + 1 ) >> 1;
                    avg_l1_mv_ref_cost = ( analysis.l1.me8x8[i*2].cost_mv + analysis.l1.me8x8[i*2].i_ref_cost
                                         + analysis.l1.me8x8[i*2+1].cost_mv + analysis.l1.me8x8[i*2+1].i_ref_cost + 1 ) >> 1;
                    COPY2_IF_LT( i_best_cost, i_l0_satd + avg_l0_mv_ref_cost, i_partition16x8[i], D_L0_8x8 );
                    COPY2_IF_LT( i_best_cost, i_l1_satd + avg_l1_mv_ref_cost, i_partition16x8[i], D_L1_8x8 );
                    COPY2_IF_LT( i_best_cost, i_bi_satd + avg_l0_mv_ref_cost + avg_l1_mv_ref_cost, i_partition16x8[i], D_BI_8x8 );
                    analysis.i_cost_est16x8[i] = i_best_cost;

                    // 8x16
                    i_best_cost = COST_MAX;
                    i_l0_satd = analysis.i_satd8x8[0][i] + analysis.i_satd8x8[0][i+2];
                    i_l1_satd = analysis.i_satd8x8[1][i] + analysis.i_satd8x8[1][i+2];
                    i_bi_satd = analysis.i_satd8x8[2][i] + analysis.i_satd8x8[2][i+2];
                    avg_l0_mv_ref_cost = ( analysis.l0.me8x8[i].cost_mv + analysis.l0.me8x8[i].i_ref_cost
                                         + analysis.l0.me8x8[i+2].cost_mv + analysis.l0.me8x8[i+2].i_ref_cost + 1 ) >> 1;
                    avg_l1_mv_ref_cost = ( analysis.l1.me8x8[i].cost_mv + analysis.l1.me8x8[i].i_ref_cost
                                         + analysis.l1.me8x8[i+2].cost_mv + analysis.l1.me8x8[i+2].i_ref_cost + 1 ) >> 1;
                    COPY2_IF_LT( i_best_cost, i_l0_satd + avg_l0_mv_ref_cost, i_partition8x16[i], D_L0_8x8 );
                    COPY2_IF_LT( i_best_cost, i_l1_satd + avg_l1_mv_ref_cost, i_partition8x16[i], D_L1_8x8 );
                    COPY2_IF_LT( i_best_cost, i_bi_satd + avg_l0_mv_ref_cost + avg_l1_mv_ref_cost, i_partition8x16[i], D_BI_8x8 );
                    analysis.i_cost_est8x16[i] = i_best_cost;
                }
                i_mb_type = B_L0_L0 + (i_partition16x8[0]>>2) * 3 + (i_partition16x8[1]>>2);
                analysis.i_cost_est16x8[1] += analysis.i_lambda * i_mb_b16x8_cost_table[i_mb_type];
                i_cost_est16x8bi_total = analysis.i_cost_est16x8[0] + analysis.i_cost_est16x8[1];
                i_mb_type = B_L0_L0 + (i_partition8x16[0]>>2) * 3 + (i_partition8x16[1]>>2);
                analysis.i_cost_est8x16[1] += analysis.i_lambda * i_mb_b16x8_cost_table[i_mb_type];
                i_cost_est8x16bi_total = analysis.i_cost_est8x16[0] + analysis.i_cost_est8x16[1];

                /* We can gain a little speed by checking the mode with the lowest estimated cost first */
                int try_16x8_first = i_cost_est16x8bi_total < i_cost_est8x16bi_total;
                if( try_16x8_first && (!analysis.b_early_terminate || i_cost_est16x8bi_total < i_cost) )
                {
                    x264_mb_analyse_inter_b16x8( h, &analysis, i_cost );
                    COPY3_IF_LT( i_cost, analysis.i_cost16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
                }
                if( !analysis.b_early_terminate || i_cost_est8x16bi_total < i_cost )
                {
                    x264_mb_analyse_inter_b8x16( h, &analysis, i_cost );
                    COPY3_IF_LT( i_cost, analysis.i_cost8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 );
                }
                if( !try_16x8_first && (!analysis.b_early_terminate || i_cost_est16x8bi_total < i_cost) )
                {
                    x264_mb_analyse_inter_b16x8( h, &analysis, i_cost );
                    COPY3_IF_LT( i_cost, analysis.i_cost16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
                }
            }

            if( analysis.i_mbrd || !h->mb.i_subpel_refine )
            {
                /* refine later */
            }
            /* refine qpel */
            else if( i_partition == D_16x16 )
            {
                analysis.l0.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
                analysis.l1.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
                if( i_type == B_L0_L0 )
                {
                    x264_me_refine_qpel( h, &analysis.l0.me16x16 );
                    i_cost = analysis.l0.me16x16.cost
                           + analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
                }
                else if( i_type == B_L1_L1 )
                {
                    x264_me_refine_qpel( h, &analysis.l1.me16x16 );
                    i_cost = analysis.l1.me16x16.cost
                           + analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
                }
                else if( i_type == B_BI_BI )
                {
                    x264_me_refine_qpel( h, &analysis.l0.bi16x16 );
                    x264_me_refine_qpel( h, &analysis.l1.bi16x16 );
                }
            }
            else if( i_partition == D_16x8 )
            {
                for( int i = 0; i < 2; i++ )
                {
                    if( analysis.i_mb_partition16x8[i] != D_L1_8x8 )
                        x264_me_refine_qpel( h, &analysis.l0.me16x8[i] );
                    if( analysis.i_mb_partition16x8[i] != D_L0_8x8 )
                        x264_me_refine_qpel( h, &analysis.l1.me16x8[i] );
                }
            }
            else if( i_partition == D_8x16 )
            {
                for( int i = 0; i < 2; i++ )
                {
                    if( analysis.i_mb_partition8x16[i] != D_L1_8x8 )
                        x264_me_refine_qpel( h, &analysis.l0.me8x16[i] );
                    if( analysis.i_mb_partition8x16[i] != D_L0_8x8 )
                        x264_me_refine_qpel( h, &analysis.l1.me8x16[i] );
                }
            }
            else if( i_partition == D_8x8 )
            {
                for( int i = 0; i < 4; i++ )
                {
                    x264_me_t *m;
                    int i_part_cost_old;
                    int i_type_cost;
                    int i_part_type = h->mb.i_sub_partition[i];
                    int b_bidir = (i_part_type == D_BI_8x8);

                    if( i_part_type == D_DIRECT_8x8 )
                        continue;
                    if( x264_mb_partition_listX_table[0][i_part_type] )
                    {
                        m = &analysis.l0.me8x8[i];
                        i_part_cost_old = m->cost;
                        i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
                        m->cost -= i_type_cost;
                        x264_me_refine_qpel( h, m );
                        if( !b_bidir )
                            analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
                    }
                    if( x264_mb_partition_listX_table[1][i_part_type] )
                    {
                        m = &analysis.l1.me8x8[i];
                        i_part_cost_old = m->cost;
                        i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
                        m->cost -= i_type_cost;
                        x264_me_refine_qpel( h, m );
                        if( !b_bidir )
                            analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
                    }
                    /* TODO: update mvp? */
                }
            }

            i_satd_inter = i_cost;

            if( analysis.i_mbrd )
            {
                x264_mb_analyse_b_rd( h, &analysis, i_satd_inter );
                i_type = B_SKIP;
                i_cost = i_bskip_cost;
                i_partition = D_16x16;
                COPY2_IF_LT( i_cost, analysis.l0.i_rd16x16, i_type, B_L0_L0 );
                COPY2_IF_LT( i_cost, analysis.l1.i_rd16x16, i_type, B_L1_L1 );
                COPY2_IF_LT( i_cost, analysis.i_rd16x16bi, i_type, B_BI_BI );
                COPY2_IF_LT( i_cost, analysis.i_rd16x16direct, i_type, B_DIRECT );
                COPY3_IF_LT( i_cost, analysis.i_rd16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
                COPY3_IF_LT( i_cost, analysis.i_rd8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 );
                COPY3_IF_LT( i_cost, analysis.i_rd8x8bi, i_type, B_8x8, i_partition, D_8x8 );

                h->mb.i_type = i_type;
                h->mb.i_partition = i_partition;
            }

            if( h->mb.b_chroma_me )
            {
                if( CHROMA444 )
                {
                    x264_mb_analyse_intra( h, &analysis, i_satd_inter );
                    x264_mb_analyse_intra_chroma( h, &analysis );
                }
                else
                {
                    x264_mb_analyse_intra_chroma( h, &analysis );
                    x264_mb_analyse_intra( h, &analysis, i_satd_inter - analysis.i_satd_chroma );
                }
                analysis.i_satd_i16x16 += analysis.i_satd_chroma;
                analysis.i_satd_i8x8   += analysis.i_satd_chroma;
                analysis.i_satd_i4x4   += analysis.i_satd_chroma;
            }
            else
                x264_mb_analyse_intra( h, &analysis, i_satd_inter );

            if( analysis.i_mbrd )
            {
                x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
                x264_intra_rd( h, &analysis, i_satd_inter * 17/16 + 1 );
            }

            COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
            COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
            COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
            COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );

            h->mb.i_type = i_type;
            h->mb.i_partition = i_partition;

            if( analysis.i_mbrd >= 2 && IS_INTRA( i_type ) && i_type != I_PCM )
                x264_intra_rd_refine( h, &analysis );
            if( h->mb.i_subpel_refine >= 5 )
                x264_refine_bidir( h, &analysis );

            if( analysis.i_mbrd >= 2 && i_type > B_DIRECT && i_type < B_SKIP )
            {
                int i_biweight;
                x264_analyse_update_cache( h, &analysis );

                if( i_partition == D_16x16 )
                {
                    if( i_type == B_L0_L0 )
                    {
                        analysis.l0.me16x16.cost = i_cost;
                        x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
                    }
                    else if( i_type == B_L1_L1 )
                    {
                        analysis.l1.me16x16.cost = i_cost;
                        x264_me_refine_qpel_rd( h, &analysis.l1.me16x16, analysis.i_lambda2, 0, 1 );
                    }
                    else if( i_type == B_BI_BI )
                    {
                        i_biweight = h->mb.bipred_weight[analysis.l0.bi16x16.i_ref][analysis.l1.bi16x16.i_ref];
                        x264_me_refine_bidir_rd( h, &analysis.l0.bi16x16, &analysis.l1.bi16x16, i_biweight, 0, analysis.i_lambda2 );
                    }
                }
                else if( i_partition == D_16x8 )
                {
                    for( int i = 0; i < 2; i++ )
                    {
                        h->mb.i_sub_partition[i*2] = h->mb.i_sub_partition[i*2+1] = analysis.i_mb_partition16x8[i];
                        if( analysis.i_mb_partition16x8[i] == D_L0_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[i], analysis.i_lambda2, i*8, 0 );
                        else if( analysis.i_mb_partition16x8[i] == D_L1_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l1.me16x8[i], analysis.i_lambda2, i*8, 1 );
                        else if( analysis.i_mb_partition16x8[i] == D_BI_8x8 )
                        {
                            i_biweight = h->mb.bipred_weight[analysis.l0.me16x8[i].i_ref][analysis.l1.me16x8[i].i_ref];
                            x264_me_refine_bidir_rd( h, &analysis.l0.me16x8[i], &analysis.l1.me16x8[i], i_biweight, i*2, analysis.i_lambda2 );
                        }
                    }
                }
                else if( i_partition == D_8x16 )
                {
                    for( int i = 0; i < 2; i++ )
                    {
                        h->mb.i_sub_partition[i] = h->mb.i_sub_partition[i+2] = analysis.i_mb_partition8x16[i];
                        if( analysis.i_mb_partition8x16[i] == D_L0_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[i], analysis.i_lambda2, i*4, 0 );
                        else if( analysis.i_mb_partition8x16[i] == D_L1_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l1.me8x16[i], analysis.i_lambda2, i*4, 1 );
                        else if( analysis.i_mb_partition8x16[i] == D_BI_8x8 )
                        {
                            i_biweight = h->mb.bipred_weight[analysis.l0.me8x16[i].i_ref][analysis.l1.me8x16[i].i_ref];
                            x264_me_refine_bidir_rd( h, &analysis.l0.me8x16[i], &analysis.l1.me8x16[i], i_biweight, i, analysis.i_lambda2 );
                        }
                    }
                }
                else if( i_partition == D_8x8 )
                {
                    for( int i = 0; i < 4; i++ )
                    {
                        if( h->mb.i_sub_partition[i] == D_L0_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i], analysis.i_lambda2, i*4, 0 );
                        else if( h->mb.i_sub_partition[i] == D_L1_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l1.me8x8[i], analysis.i_lambda2, i*4, 1 );
                        else if( h->mb.i_sub_partition[i] == D_BI_8x8 )
                        {
                            i_biweight = h->mb.bipred_weight[analysis.l0.me8x8[i].i_ref][analysis.l1.me8x8[i].i_ref];
                            x264_me_refine_bidir_rd( h, &analysis.l0.me8x8[i], &analysis.l1.me8x8[i], i_biweight, i, analysis.i_lambda2 );
                        }
                    }
                }
            }
        }
    }

    x264_analyse_update_cache( h, &analysis );

    /* In rare cases we can end up qpel-RDing our way back to a larger partition size
     * without realizing it.  Check for this and account for it if necessary. */
    if( analysis.i_mbrd >= 2 )
    {
        /* Don't bother with bipred or 8x8-and-below, the odds are incredibly low. */
        static const uint8_t check_mv_lists[X264_MBTYPE_MAX] = {[P_L0]=1, [B_L0_L0]=1, [B_L1_L1]=2};
        int list = check_mv_lists[h->mb.i_type] - 1;
        if( list >= 0 && h->mb.i_partition != D_16x16 &&
            M32( &h->mb.cache.mv[list][x264_scan8[0]] ) == M32( &h->mb.cache.mv[list][x264_scan8[12]] ) &&
            h->mb.cache.ref[list][x264_scan8[0]] == h->mb.cache.ref[list][x264_scan8[12]] )
                h->mb.i_partition = D_16x16;
    }

    if( !analysis.i_mbrd )
        x264_mb_analyse_transform( h );

    if( analysis.i_mbrd == 3 && !IS_SKIP(h->mb.i_type) )
        x264_mb_analyse_qp_rd( h, &analysis );

    h->mb.b_trellis = h->param.analyse.i_trellis;
    h->mb.b_noise_reduction = h->mb.b_noise_reduction || (!!h->param.analyse.i_noise_reduction && !IS_INTRA( h->mb.i_type ));

    if( !IS_SKIP(h->mb.i_type) && h->mb.i_psy_trellis && h->param.analyse.i_trellis == 1 )
        x264_psy_trellis_init( h, 0 );
    if( h->mb.b_trellis == 1 || h->mb.b_noise_reduction )
        h->mb.i_skip_intra = 0;
}

/*-------------------- Update MB from the analysis ----------------------*/
static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a  )
{
    switch( h->mb.i_type )
    {
        case I_4x4:
            for( int i = 0; i < 16; i++ )
                h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] = a->i_predict4x4[i];

            x264_mb_analyse_intra_chroma( h, a );
            break;
        case I_8x8:
            for( int i = 0; i < 4; i++ )
                x264_macroblock_cache_intra8x8_pred( h, 2*(i&1), 2*(i>>1), a->i_predict8x8[i] );

            x264_mb_analyse_intra_chroma( h, a );
            break;
        case I_16x16:
            h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
            x264_mb_analyse_intra_chroma( h, a );
            break;

        case I_PCM:
            break;

        case P_L0:
            switch( h->mb.i_partition )
            {
                case D_16x16:
                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
                    x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
                    break;

                case D_16x8:
                    x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].i_ref );
                    x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].i_ref );
                    x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].mv );
                    x264_macroblock_cache_mv_ptr( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].mv );
                    break;

                case D_8x16:
                    x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].i_ref );
                    x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].i_ref );
                    x264_macroblock_cache_mv_ptr( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].mv );
                    x264_macroblock_cache_mv_ptr( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].mv );
                    break;

                default:
                    x264_log( h, X264_LOG_ERROR, "internal error P_L0 and partition=%d\n", h->mb.i_partition );
                    break;
            }
            break;

        case P_8x8:
            x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
            x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
            x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
            x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
            for( int i = 0; i < 4; i++ )
                x264_mb_cache_mv_p8x8( h, a, i );
            break;

        case P_SKIP:
        {
            h->mb.i_partition = D_16x16;
            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, h->mb.cache.pskip_mv );
            break;
        }

        case B_SKIP:
        case B_DIRECT:
            h->mb.i_partition = h->mb.cache.direct_partition;
            x264_mb_load_mv_direct8x8( h, 0 );
            x264_mb_load_mv_direct8x8( h, 1 );
            x264_mb_load_mv_direct8x8( h, 2 );
            x264_mb_load_mv_direct8x8( h, 3 );
            break;

        case B_8x8:
            /* optimize: cache might not need to be rewritten */
            for( int i = 0; i < 4; i++ )
                x264_mb_cache_mv_b8x8( h, a, i, 1 );
            break;

        default: /* the rest of the B types */
            switch( h->mb.i_partition )
            {
            case D_16x16:
                switch( h->mb.i_type )
                {
                case B_L0_L0:
                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
                    x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );

                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, -1 );
                    x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, 0 );
                    x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 1, 0 );
                    break;
                case B_L1_L1:
                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, -1 );
                    x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, 0 );
                    x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 0, 0 );

                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.me16x16.i_ref );
                    x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv );
                    break;
                case B_BI_BI:
                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.bi16x16.i_ref );
                    x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.bi16x16.mv );

                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.bi16x16.i_ref );
                    x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.bi16x16.mv );
                    break;
                }
                break;
            case D_16x8:
                x264_mb_cache_mv_b16x8( h, a, 0, 1 );
                x264_mb_cache_mv_b16x8( h, a, 1, 1 );
                break;
            case D_8x16:
                x264_mb_cache_mv_b8x16( h, a, 0, 1 );
                x264_mb_cache_mv_b8x16( h, a, 1, 1 );
                break;
            default:
                x264_log( h, X264_LOG_ERROR, "internal error (invalid MB type)\n" );
                break;
            }
    }

#ifndef NDEBUG
    if( h->i_thread_frames > 1 && !IS_INTRA(h->mb.i_type) )
    {
        for( int l = 0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ )
        {
            int completed;
            int ref = h->mb.cache.ref[l][x264_scan8[0]];
            if( ref < 0 )
                continue;
            completed = h->fref[l][ ref >> MB_INTERLACED ]->orig->i_lines_completed;
            if( (h->mb.cache.mv[l][x264_scan8[15]][1] >> (2 - MB_INTERLACED)) + h->mb.i_mb_y*16 > completed )
            {
                x264_log( h, X264_LOG_WARNING, "internal error (MV out of thread range)\n");
                x264_log( h, X264_LOG_DEBUG, "mb type: %d \n", h->mb.i_type);
                x264_log( h, X264_LOG_DEBUG, "mv: l%dr%d (%d,%d) \n", l, ref,
                                h->mb.cache.mv[l][x264_scan8[15]][0],
                                h->mb.cache.mv[l][x264_scan8[15]][1] );
                x264_log( h, X264_LOG_DEBUG, "limit: %d \n", h->mb.mv_max_spel[1]);
                x264_log( h, X264_LOG_DEBUG, "mb_xy: %d,%d \n", h->mb.i_mb_x, h->mb.i_mb_y);
                x264_log( h, X264_LOG_DEBUG, "completed: %d \n", completed );
                x264_log( h, X264_LOG_WARNING, "recovering by using intra mode\n");
                x264_mb_analyse_intra( h, a, COST_MAX );
                h->mb.i_type = I_16x16;
                h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
                x264_mb_analyse_intra_chroma( h, a );
            }
        }
    }
#endif
}

#include "slicetype.c"

x264-snapshot-20120103-2245-stable/doc/vui.txt

Video Usability Information (VUI) Guide
by Christian Heine ( sennindemokrit at gmx dot net )

1. Sample Aspect Ratio
-----------------------

* What is it?
    The Sample Aspect Ratio (SAR) (sometimes called Pixel Aspect Ratio or just
    Pel Aspect Ratio) is defined as the ratio of the width of the sample to the
    height of the sample. While pixels on a computer monitor generally are
    "square" meaning that their SAR is 1:1, digitized video usually has rather
    odd SARs. Playback of material with a particular SAR on a system with
    a different SAR will result in a stretched/squashed image. A correction is
    necessary that relies on the knowledge of both SARs.

* How do I use it?
    You can derive the SAR of an image from the width, height and the
    display aspect ratio (DAR) of the image as follows:
    
    SAR_x   DAR_x * height
    ----- = --------------
    SAR_y   DAR_y * width
    
    for example:
    width x height = 704x576, DAR = 4:3 ==> SAR = 2304:2112 or 12:11
    
    Please note that if your material is a digitized analog signal, you should
    not use this equation to calculate the SAR. Refer to the manual of your
    digitizing equipment or the link below instead.

    A Quick Guide to Digital Video Resolution and Aspect Ratio Conversions
    http://www.iki.fi/znark/video/conversion/
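
    As an illustration, the derivation above is just integer arithmetic plus
    a reduction to lowest terms. A minimal standalone sketch (not part of
    x264; the function names here are made up for illustration):

        #include <stdio.h>
        #include <stdint.h>

        /* Greatest common divisor, used to reduce the ratio. */
        static uint64_t ratio_gcd( uint64_t a, uint64_t b )
        {
            while( b )
            {
                uint64_t t = a % b;
                a = b;
                b = t;
            }
            return a;
        }

        /* Derive the SAR from frame size and display aspect ratio. */
        static void print_sar( int width, int height, int dar_x, int dar_y )
        {
            uint64_t sar_x = (uint64_t)dar_x * height;
            uint64_t sar_y = (uint64_t)dar_y * width;
            uint64_t g = ratio_gcd( sar_x, sar_y );
            printf( "SAR = %llu:%llu\n", (unsigned long long)(sar_x / g),
                                         (unsigned long long)(sar_y / g) );
        }

        int main( void )
        {
            print_sar( 704, 576, 4, 3 ); /* prints "SAR = 12:11" */
            return 0;
        }

    The resulting ratio is what you pass to the encoder, either as
    "--sar 12:11" on the command line or via param->vui.i_sar_width and
    param->vui.i_sar_height in the library interface.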

* Should I use this option?
    In one word: yes. Most decoders/media players nowadays support automatic
    correction of aspect ratios, and there are just a few exceptions. You
    should use it even if the SAR of your material is 1:1, as the x264
    default is "SAR not defined".
    
2. Overscan
------------

* What is it?
    The term overscan generally refers to all regions of an image that do
    not contain information but are added to achieve a certain resolution or
    aspect ratio. A "letterboxed" image therefore has overscan at the top and
    the bottom. This is not the overscan this option refers to, nor does it
    refer to the overscan that is added as part of the process of digitizing an
    analog signal. Instead it refers to the "overscan" process on a display
    that shows only a part of the image. What that part is depends on the
    display.
    
* How do I use this option?
    As I'm not sure about what part of the image is shown when the display uses
    an overscan process, I can't provide you with rules or examples. The safe
    assumption would be "overscan=show" as this always shows the whole image.
    Use "overscan=crop" only if you are sure about the consequences. You may
    also use the default value ("undefined").
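
    If you set it through the library interface, the value ends up in
    param->vui.i_overscan as a small integer. A sketch, assuming the
    undef/show/crop order used by this snapshot's x264.h (treat the exact
    values as an assumption and verify them against your copy of the header):

        #include "x264.h"

        /* Sketch: request that players show the full picture. In this
         * snapshot 0 is "undef", 1 is "show", 2 is "crop" (assumption;
         * check x264.h / x264_overscan_names before relying on it). */
        static void vui_set_overscan_show( x264_param_t *param )
        {
            param->vui.i_overscan = 1;
        }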

* Should I use this option?
    Only if you know exactly what you are doing. Don't use it on video streams
    that have general overscan. Instead try to crop the borders before
    encoding and benefit from the higher bitrate/image quality.

    Furthermore the H264 specification says that the setting "overscan=show"
    must be respected, but "overscan=crop" may be ignored. In fact most
    playback equipment ignores this setting and shows the whole image.

3. Video Format
----------------

* What is it?
    A purely informative setting that records what type of analog video your
    material was before you digitized it.
    
* How do I use this option?
    Just set it to the desired value (e.g. NTSC, PAL).
    If you transcode from MPEG2, you may find the value for this option in the
    m2v bitstream. (see ITU-T Rec. H262 / ISO/IEC 13818-2 for details)
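
    For example, for an NTSC capture you could say "--videoformat ntsc" on
    the command line, or set the table index directly through the library
    interface (a sketch; 2 is NTSC in both the H262 and H264 tables):

        #include "x264.h"

        /* Sketch: mark the source as NTSC. The index comes from the
         * video_format table in H264 Annex E (0=component, 1=PAL, 2=NTSC,
         * 3=SECAM, 4=MAC, 5=unspecified). */
        static void vui_set_ntsc( x264_param_t *param )
        {
            param->vui.i_vidformat = 2;
        }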

* Should I use this option?
    That is entirely up to you. I have no idea how this information would ever
    be relevant. I consider it to be informative only.

4. Full Range
--------------

* What is it?
    Another relic from digitizing analog video. When digitizing analog video
    the digital representation of the luma and chroma levels is limited to lie
    within 16..235 and 16..240 respectively. Playback equipment usually assumes
    all digitized samples to be within this range. However most DVDs use the
    full range of 0..255 for luma and chroma samples, possibly resulting in an
    oversaturation when played back on that equipment. To avoid this a range
    correction is needed.

* How do I use this option?
    If your source material is a digitized analog video/TV broadcast it is
    quite possible that it is range limited. If you can make sure that it is
    range limited, you can safely set full range to off. If you are not sure,
    or want to make sure that your material is played back without
    oversaturation, set it to on. Please note that the default for this option
    in x264 is off, which is not a safe assumption.
    
* Should I use this option?
    Yes, although only a few decoders/media players actually distinguish
    between the two options.
    
5. Color Primaries, Transfer Characteristics, Matrix Coefficients
-------------------------------------------------------------------

* What is it?
    A videophile setting. The average user won't ever need it.
    Not all monitor models show all colors the same way. When comparing the
    same image on two different monitor models you might find that one of them
    "looks more blue", while the other "looks more green". Bottom line is, each
    monitor model has a different color profile, which can be used to correct
    colors in a way, that images look almost the same on all monitors. The same
    goes for printers and film/ video digitizing equipment. If the color
    profile of the digitizing equipment is known, it is possible to correct the
    colors and gamma of the decoded h264 stream in a way that the video stream
    looks the same, regardless of the digitizing equipment used.
    
* How do I use these options?
    If you are able to find out which characteristics your digitizing equipment
    uses, (see the equipment documentation or make reference measurements)
    then find the most suitable characteristics in the list of available
    characteristics (see H264 Annex E) and pass it to x264. Otherwise leave it
    to the default (unspecified).
    If you transcode from MPEG2, you may find the values for these options in
    the m2v bitstream. (see ITU-T Rec. H.262 / ISO/IEC 13818-2 for details)

* Should I use these options?
    Only if you know exactly what you are doing. The default setting is better
    than a wrong one. Use of these options is not a bad idea though.
    Unfortunately I don't know of any decoder/media player that has ever even
    attempted color/gamma/color matrix correction.

6. Chroma Sample Location
--------------------------

* What is it?
    A videophile setting. The average user won't ever notice a difference.
    Due to a weakness of the human eye, it is often economical to reduce the
    number of chroma samples in a process called subsampling. In particular,
    x264 uses only one sample per chroma channel for every block of 2x2 luma
    samples. There are a number of possibilities for how this subsampling is
    done, each resulting in a different relative location of the chroma sample
    with respect to the luma samples. The Chroma Sample Location matters when
    the subsampling process is reversed, i.e. when the number of chroma
    samples is increased again. This is most likely to happen during color
    space conversions. If it is not done correctly, the chroma values may
    appear shifted relative to the luma samples by up to 1 pixel, or
    strangely blurred.

* How do I use this option?
    x264 does no subsampling itself; it only accepts already subsampled input
    frames, so you have to determine the method yourself.

    If you transcode from MPEG1 with proper subsampled 4:2:0, and don't do any
    color space conversion, you should set this option to 1.
    If you transcode from MPEG2 with proper subsampled 4:2:0, and don't do any
    color space conversion, you should set this option to 0.
    If you transcode from MPEG4 with proper subsampled 4:2:0, and don't do any
    color space conversion, you should set this option to 0.

    If you do the color space conversion yourself this isn't that easy. If the
    filter kernel of the subsampling is ( 0.5, 0.5 ) in one direction then the
    chroma sample location in that direction is between the two luma samples.
    If your filter kernel is ( 0.25, 0.5, 0.25 ) in one direction then the
    chroma sample location in that direction is equal to one of the luma
    samples. H264 Annex E contains images that tell you how to "transform" your
    Chroma Sample Location into a value of 0 to 5 that you can pass to x264.
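
    As a toy illustration of the two kernels above (my own sketch, not x264
    code), here is a 1-D chroma downsample by 2 in C:

    #include <stdio.h>

    #define W 8

    int main( void )
    {
        float chroma[W] = { 10, 20, 30, 40, 50, 60, 70, 80 };

        for( int i = 0; i < W/2; i++ )
        {
            /* kernel ( 0.5, 0.5 ): the result sits halfway BETWEEN
             * positions 2i and 2i+1 */
            float between = 0.5f * chroma[2*i] + 0.5f * chroma[2*i+1];

            /* kernel ( 0.25, 0.5, 0.25 ): the result is co-sited WITH
             * position 2i (the edge sample is repeated at the border) */
            int left = 2*i > 0 ? 2*i - 1 : 0;
            float cosited = 0.25f * chroma[left] + 0.5f * chroma[2*i]
                          + 0.25f * chroma[2*i+1];

            printf( "%d: between=%.1f cosited=%.1f\n", i, between, cosited );
        }
        return 0;
    }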
    
* Should I use this option?
    Unless you are a perfectionist, don't bother. Media players ignore this
    setting, and favor their own (fixed) assumed Chroma Sample Location.


x264-snapshot-20120103-2245-stable/doc/threads.txt
Historical notes:
Slice-based threading was the original threading model of x264.  It was replaced with frame-based threading in r607.  This document was originally written at that time.  Slice-based threading was brought back (as an optional mode) in r1364 for low-latency encoding.  Furthermore, frame-based threading was modified significantly in r1246, with the addition of threaded lookahead.

Old threading method: slice-based
application calls x264
x264 runs B-adapt and ratecontrol (serial)
split frame into several slices, and spawn a thread for each slice
wait until all threads are done
deblock and hpel filter (serial)
return to application
In x264cli, there is one additional thread to decode the input.

New threading method: frame-based
application calls x264
x264 requests a frame from lookahead, which runs B-adapt and ratecontrol parallel to the current thread, separated by a buffer of size sync-lookahead
spawn a thread for this frame
thread runs encode, deblock, hpel filter
meanwhile x264 waits for the oldest thread to finish
return to application, but the rest of the threads continue running in the background
No additional threads are needed to decode the input, unless decoding is slower than slice+deblock+hpel, in which case an additional input thread would allow decoding in parallel.
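
A minimal sketch of the frame-based hand-off (illustrative only, not x264's actual code): spawn one worker per frame and, once all encoding threads are busy, block on the oldest before dispatching the next frame.

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define THREADS 4
#define FRAMES 16

static void *encode_frame( void *arg )
{
    /* encode, deblock and hpel filter for one frame would run here */
    printf( "encoded frame %d\n", (int)(intptr_t)arg );
    return NULL;
}

int main( void )
{
    pthread_t pool[THREADS];
    for( int i = 0; i < FRAMES; i++ )
    {
        if( i >= THREADS )  /* all slots busy: wait for the oldest frame */
            pthread_join( pool[i % THREADS], NULL );
        pthread_create( &pool[i % THREADS], NULL, encode_frame,
                        (void *)(intptr_t)i );
    }
    for( int i = 0; i < THREADS; i++ )  /* drain the frames still in flight */
        pthread_join( pool[i], NULL );
    return 0;
}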

Penalties for slice-based threading:
Each slice adds some bitrate (or equivalently reduces quality), for a variety of reasons: the slice header costs some bits, cabac contexts are reset, mvs and intra samples can't be predicted across the slice boundary.
In CBR mode, multiple slices encode simultaneously, thus increasing the maximum misprediction possible with VBV.
Some parts of the encoder are serial, so it doesn't scale well with lots of cpus.

Some numbers on penalties for slicing:
Tested at 720p with 45 slices (one per mb row) to maximize the total cost for easy measurement. Averaged over 4 movies at crf20 and crf30. Total cost: +30% bitrate at constant psnr.
I enabled the various components of slicing one at a time, and measured the portion of that cost they contribute:
    * 34% intra prediction
    * 25% redundant slice headers, nal headers, and rounding to whole bytes
    * 16% mv prediction
    * 16% reset cabac contexts
    * 6% deblocking between slices (you don't strictly have to turn this off just for standard compliance, but you do if you want to use slices for decoder multithreading)
    * 2% cabac neighbors (cbp, skip, etc)
The proportional cost of redundant headers should certainly depend on bitrate (since the header size is constant and everything else depends on bitrate). Deblocking should too (due to varying deblock strength).
But none of the proportions should depend strongly on the number of slices: some are triggered per slice while some are triggered per macroblock-that's-on-the-edge-of-a-slice, but as long as there's no more than 1 slice per row, the relative frequency of those two conditions is determined solely by the image width.


Penalties for frame-based threading:
To allow encoding of multiple frames in parallel, we have to ensure that any given macroblock uses motion vectors only from pieces of the reference frames that have been encoded already. This is usually not noticeable, but can matter for very fast upward motion.
We have to commit to one frame type before starting on the frame. Thus scenecut detection must run during the lowres pre-motion-estimation along with B-adapt, which makes it faster but less accurate than re-encoding the whole frame.
Ratecontrol gets delayed feedback, since it has to plan frame N before frame N-1 finishes.

Benchmarks:
cpu: 8core Nehalem (2x E5520) 2.27GHz, hyperthreading disabled
kernel: linux 2.6.34.7, 64-bit
x264: r1732 b20059aa
input: http://media.xiph.org/video/derf/y4m/1080p/park_joy_1080p.y4m

NOTE: the "thread count" listed below does not count the lookahead thread, only encoding threads.  This is why for "veryfast", the speedup for 2 and 3 threads exceeds the logical limit.

threads  speedup       psnr
      slice frame   slice  frame
x264 --preset veryfast --tune psnr --crf 30
 1:   1.00x 1.00x  +0.000 +0.000
 2:   1.41x 2.29x  -0.005 -0.002
 3:   1.70x 3.65x  -0.035 +0.000
 4:   1.96x 3.97x  -0.029 -0.001
 5:   2.10x 3.98x  -0.047 -0.002
 6:   2.29x 3.97x  -0.060 +0.001
 7:   2.36x 3.98x  -0.057 -0.001
 8:   2.43x 3.98x  -0.067 -0.001
 9:         3.96x         +0.000
10:         3.99x         +0.000
11:         4.00x         +0.001
12:         4.00x         +0.001

x264 --preset medium --tune psnr --crf 30
 1:   1.00x 1.00x  +0.000 +0.000
 2:   1.54x 1.59x  -0.002 -0.003
 3:   2.01x 2.81x  -0.005 +0.000
 4:   2.51x 3.11x  -0.009 +0.000
 5:   2.89x 4.20x  -0.012 -0.000
 6:   3.27x 4.50x  -0.016 -0.000
 7:   3.58x 5.45x  -0.019 -0.002
 8:   3.79x 5.76x  -0.015 -0.002
 9:         6.49x         -0.000
10:         6.64x         -0.000
11:         6.94x         +0.000
12:         6.96x         +0.000

x264 --preset slower --tune psnr --crf 30
 1:   1.00x 1.00x  +0.000 +0.000
 2:   1.54x 1.83x  +0.000 +0.002
 3:   1.98x 2.21x  -0.006 +0.002
 4:   2.50x 2.61x  -0.011 +0.002
 5:   2.93x 3.94x  -0.018 +0.003
 6:   3.45x 4.19x  -0.024 +0.001
 7:   3.84x 4.52x  -0.028 -0.001
 8:   4.13x 5.04x  -0.026 -0.001
 9:         6.15x         +0.001
10:         6.24x         +0.001
11:         6.55x         -0.001
12:         6.89x         -0.001
x264-snapshot-20120103-2245-stable/doc/standards.txt
x264 is written in C. The particular variant of C is: intersection of C99 and gcc>=3.4.
checkasm is written in gcc, with no attempt at compatibility with anything else.

We make the following additional assumptions which are true of real systems but not guaranteed by C99 (a small check follows the list):
* Two's complement.
* Signed right-shifts are sign-extended.
* int is 32-bit or larger.
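
A small runtime check of these assumptions (my own sketch, not part of x264):

#include <assert.h>
#include <limits.h>

int main( void )
{
    assert( (-1 & 3) == 3 );                 /* two's complement */
    assert( (-4 >> 1) == -2 );               /* sign-extending right shift */
    assert( sizeof(int) * CHAR_BIT >= 32 );  /* int is 32-bit or larger */
    return 0;
}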

x86-specific assumptions:
* The stack is 16-byte aligned. We align it on entry to libx264 and on entry to any thread, but the compiler must preserve alignment after that.
* We call emms before any float operation and before returning from libx264, not after each mmx operation. So bad things could happen if the compiler inserts float operations where they aren't expected.
x264-snapshot-20120103-2245-stable/doc/regression_test.txt
Here is one test method which checks that the encoder's
view of decoded pictures is the same as the decoder's view.
This ensures that there is no distortion besides what is
inherently caused by compression.

# Install and compile x264 :
svn co svn://svn.videolan.org/x264/trunk x264
cd x264
./configure
make
cd ..

# Install and compile JM reference decoder :
wget http://iphome.hhi.de/suehring/tml/download/jm17.2.zip
unzip jm17.2.zip
cd JM
sh unixprep.sh
cd ldecod
make
cd ../..

./x264/x264 input.yuv --dump-yuv fdec.yuv -o output.h264
./JM/bin/ldecod.exe -i output.h264 -o ref.yuv
diff ref.yuv fdec.yuv
x264-snapshot-20120103-2245-stable/doc/ratecontrol.txt
A qualitative overview of x264's ratecontrol methods
By Loren Merritt

Historical note:
This document is outdated, but a significant part of it is still accurate.  Here are some important ways ratecontrol has changed since the authoring of this document:
- By default, MB-tree is used instead of qcomp for weighting frame quality based on complexity.  MB-tree is effectively a generalization of qcomp to the macroblock level.  MB-tree also replaces the constant offsets for B-frame quantizers.  The legacy algorithm is still available for low-latency applications.
- Adaptive quantization is now used to distribute quality among each frame; frames are no longer constant quantizer, even if MB-tree is off.
- VBV runs per-row rather than per-frame to improve accuracy.

x264's ratecontrol is based on libavcodec's, and is mostly empirical. But I can retroactively propose the following theoretical points which underlie most of the algorithms:

- You want the movie to be somewhere approaching constant quality. However, constant quality does not mean constant PSNR nor constant QP. Details are less noticeable in high-complexity or high-motion scenes, so you can get away with somewhat higher QP for the same perceived quality.
- On the other hand, you get more quality per bit if you spend those bits in scenes where motion compensation works well: A given artifact may stick around several seconds in a low-motion scene, and you only have to fix it in one frame to improve the quality of the whole scene.
- Both of the above are correlated with the number of bits it takes to encode a frame at a given QP.
- Given one encoding of a frame, we can predict the number of bits needed to encode it at a different QP. This prediction gets less accurate if the QPs are far apart.
- The importance of a frame depends on the number of other frames that are predicted from it. Hence I-frames get reduced QP depending on the number and complexity of following inter-frames, disposable B-frames get higher QP than P-frames, and referenced B-frames are between P-frames and disposable B-frames.


The modes:

    2pass:
Given some data about each frame of a 1st pass (e.g. generated by 1pass ABR, below), we try to choose QPs to maximize quality while matching a specified total size. This is separated into 3 parts:
(1) Before starting the 2nd pass, select the relative number of bits to allocate between frames. This pays no attention to the total size of the encode. The default formula, empirically selected to balance between the 1st 2 theoretical points, is "complexity ** 0.6", where complexity is defined to be the bit size of the frame at a constant QP (estimated from the 1st pass).
(2) Scale the results of (1) to fill the requested total size. Optional: Impose VBV limitations. Due to nonlinearities in the frame size predictor and in VBV, this is an iterative process.
(3) Now start encoding. After each frame, update future QPs to compensate for mispredictions in size. If the 2nd pass is consistently off from the predicted size (usually because we use slower compression options than the 1st pass), then we multiply all future frames' qscales by the reciprocal of the error. Additionally, there is a short-term compensation to prevent us from deviating too far from the desired size near the beginning (when we don't have much data for the global compensation) and near the end (when global doesn't have time to react).
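
As a toy model of steps (1) and (2) (my own sketch with made-up complexities, not x264's actual ratecontrol), allocating bits proportional to complexity ** 0.6 and then scaling to a requested total:

#include <math.h>
#include <stdio.h>

#define FRAMES 5

int main( void )
{
    /* hypothetical 1st-pass complexities: bits per frame at constant QP */
    double complexity[FRAMES] = { 1000, 4000, 2000, 8000, 500 };
    double target_total = 12000.0;  /* requested total size in bits */
    double weight[FRAMES], sum = 0.0;

    /* step (1): relative allocation ~ complexity ** 0.6 */
    for( int i = 0; i < FRAMES; i++ )
        sum += weight[i] = pow( complexity[i], 0.6 );

    /* step (2): scale the weights to fill the requested total size */
    for( int i = 0; i < FRAMES; i++ )
        printf( "frame %d: %.0f bits\n", i, target_total * weight[i] / sum );
    return 0;
}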

    1pass, average bitrate:
The goal is the same as in 2pass, but here we don't have the benefit of a previous encode, so all ratecontrol must be done during the encode.
(1) This is the same as in 2pass, except that instead of estimating complexity from a previous encode, we run a fast motion estimation algo over a half-resolution version of the frame, and use the SATD residuals (these are also used in the decision between P- and B-frames). Also, we don't know the size or complexity of the following GOP, so I-frame bonus is based on the past.
(2) We don't know the complexities of future frames, so we can only scale based on the past. The scaling factor is chosen to be the one that would have resulted in the desired bitrate if it had been applied to all frames so far.
(3) Overflow compensation is the same as in 2pass. By tuning the strength of compensation, you can get anywhere from near the quality of 2pass (but unpredictable size, like +- 10%) to reasonably strict filesize but lower quality.

    1pass, constant bitrate (VBV compliant):
(1) Same as ABR.
(2) Scaling factor is based on a local average (dependent on VBV buffer size) instead of all past frames.
(3) Overflow compensation is stricter, and has an additional term to hard limit the QPs if the VBV is near empty. Note that no hard limit is done for a full VBV, so CBR may use somewhat less than the requested bitrate. Note also that if a frame violates VBV constraints despite the best efforts of prediction, it is not re-encoded.
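
To illustrate the VBV constraint itself, here is a toy buffer model (my own sketch with made-up numbers, not x264's implementation): the buffer drains by each frame's size and refills at the channel rate; an underflow means a frame was too big, and the clamp at a full buffer is why CBR may use less than the requested bitrate.

#include <stdio.h>

int main( void )
{
    double bitrate = 1000000.0, fps = 25.0, buffer_size = 500000.0;
    double fill = buffer_size;  /* start with a full buffer */
    double frame_bits[5] = { 30000, 80000, 20000, 45000, 60000 };

    for( int i = 0; i < 5; i++ )
    {
        fill -= frame_bits[i];  /* the decoder removes one frame */
        if( fill < 0 )
            printf( "frame %d: VBV underflow!\n", i );
        fill += bitrate / fps;  /* the channel refills the buffer */
        if( fill > buffer_size )
            fill = buffer_size;  /* full buffer: encoder may undershoot */
        printf( "frame %d: buffer at %.0f bits\n", i, fill );
    }
    return 0;
}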

    1pass, constant ratefactor:
(1) Same as ABR.
(2) The scaling factor is a constant based on the --crf argument.
(3) No overflow compensation is done.

    constant quantizer:
QPs are simply based on frame type.
x264-snapshot-20120103-2245-stable/config.sub
#! /bin/sh
# Configuration validation subroutine script.
#   Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
#   2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
#   Free Software Foundation, Inc.

timestamp='2009-08-19'

# This file is (in principle) common to ALL GNU software.
# The presence of a machine in this file suggests that SOME GNU software
# can handle that machine.  It does not imply ALL GNU software can.
#
# This file is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
# 02110-1301, USA.
#
# As a special exception to the GNU General Public License, if you
# distribute this file as part of a program that contains a
# configuration script generated by Autoconf, you may include it under
# the same distribution terms that you use for the rest of that program.


# Please send patches to <config-patches@gnu.org>.  Submit a context
# diff and a properly formatted GNU ChangeLog entry.
#
# Configuration subroutine to validate and canonicalize a configuration type.
# Supply the specified configuration type as an argument.
# If it is invalid, we print an error message on stderr and exit with code 1.
# Otherwise, we print the canonical config type on stdout and succeed.

# You can get the latest version of this script from:
# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub;hb=HEAD

# This file is supposed to be the same for all GNU packages
# and recognize all the CPU types, system types and aliases
# that are meaningful with *any* GNU software.
# Each package is responsible for reporting which valid configurations
# it does not support.  The user should be able to distinguish
# a failure to support a valid configuration from a meaningless
# configuration.

# The goal of this file is to map all the various variations of a given
# machine specification into a single specification in the form:
#	CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM
# or in some cases, the newer four-part form:
#	CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM
# It is wrong to echo any other type of specification.

me=`echo "$0" | sed -e 's,.*/,,'`

usage="\
Usage: $0 [OPTION] CPU-MFR-OPSYS
       $0 [OPTION] ALIAS

Canonicalize a configuration name.

Operation modes:
  -h, --help         print this help, then exit
  -t, --time-stamp   print date of last modification, then exit
  -v, --version      print version number, then exit

Report bugs and patches to <config-patches@gnu.org>."

version="\
GNU config.sub ($timestamp)

Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
2002, 2003, 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc.

This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."

help="
Try \`$me --help' for more information."

# Parse command line
while test $# -gt 0 ; do
  case $1 in
    --time-stamp | --time* | -t )
       echo "$timestamp" ; exit ;;
    --version | -v )
       echo "$version" ; exit ;;
    --help | --h* | -h )
       echo "$usage"; exit ;;
    -- )     # Stop option processing
       shift; break ;;
    - )	# Use stdin as input.
       break ;;
    -* )
       echo "$me: invalid option $1$help"
       exit 1 ;;

    *local*)
       # First pass through any local machine types.
       echo $1
       exit ;;

    * )
       break ;;
  esac
done

case $# in
 0) echo "$me: missing argument$help" >&2
    exit 1;;
 1) ;;
 *) echo "$me: too many arguments$help" >&2
    exit 1;;
esac

# Separate what the user gave into CPU-COMPANY and OS or KERNEL-OS (if any).
# Here we must recognize all the valid KERNEL-OS combinations.
maybe_os=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\2/'`
case $maybe_os in
  nto-qnx* | linux-gnu* | linux-dietlibc | linux-newlib* | linux-uclibc* | \
  uclinux-uclibc* | uclinux-gnu* | kfreebsd*-gnu* | knetbsd*-gnu* | netbsd*-gnu* | \
  kopensolaris*-gnu* | \
  storm-chaos* | os2-emx* | rtmk-nova*)
    os=-$maybe_os
    basic_machine=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\1/'`
    ;;
  *)
    basic_machine=`echo $1 | sed 's/-[^-]*$//'`
    if [ $basic_machine != $1 ]
    then os=`echo $1 | sed 's/.*-/-/'`
    else os=; fi
    ;;
esac

### Let's recognize common machines as not being operating systems so
### that things like config.sub decstation-3100 work.  We also
### recognize some manufacturers as not being operating systems, so we
### can provide default operating systems below.
case $os in
	-sun*os*)
		# Prevent following clause from handling this invalid input.
		;;
	-dec* | -mips* | -sequent* | -encore* | -pc532* | -sgi* | -sony* | \
	-att* | -7300* | -3300* | -delta* | -motorola* | -sun[234]* | \
	-unicom* | -ibm* | -next | -hp | -isi* | -apollo | -altos* | \
	-convergent* | -ncr* | -news | -32* | -3600* | -3100* | -hitachi* |\
	-c[123]* | -convex* | -sun | -crds | -omron* | -dg | -ultra | -tti* | \
	-harris | -dolphin | -highlevel | -gould | -cbm | -ns | -masscomp | \
	-apple | -axis | -knuth | -cray | -microblaze)
		os=
		basic_machine=$1
		;;
        -bluegene*)
	        os=-cnk
		;;
	-sim | -cisco | -oki | -wec | -winbond)
		os=
		basic_machine=$1
		;;
	-scout)
		;;
	-wrs)
		os=-vxworks
		basic_machine=$1
		;;
	-chorusos*)
		os=-chorusos
		basic_machine=$1
		;;
 	-chorusrdb)
 		os=-chorusrdb
		basic_machine=$1
 		;;
	-hiux*)
		os=-hiuxwe2
		;;
	-sco6)
		os=-sco5v6
		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
		;;
	-sco5)
		os=-sco3.2v5
		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
		;;
	-sco4)
		os=-sco3.2v4
		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
		;;
	-sco3.2.[4-9]*)
		os=`echo $os | sed -e 's/sco3.2./sco3.2v/'`
		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
		;;
	-sco3.2v[4-9]*)
		# Don't forget version if it is 3.2v4 or newer.
		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
		;;
	-sco5v6*)
		# Don't forget version if it is 3.2v4 or newer.
		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
		;;
	-sco*)
		os=-sco3.2v2
		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
		;;
	-udk*)
		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
		;;
	-isc)
		os=-isc2.2
		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
		;;
	-clix*)
		basic_machine=clipper-intergraph
		;;
	-isc*)
		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
		;;
	-lynx*)
		os=-lynxos
		;;
	-ptx*)
		basic_machine=`echo $1 | sed -e 's/86-.*/86-sequent/'`
		;;
	-windowsnt*)
		os=`echo $os | sed -e 's/windowsnt/winnt/'`
		;;
	-psos*)
		os=-psos
		;;
	-mint | -mint[0-9]*)
		basic_machine=m68k-atari
		os=-mint
		;;
esac

# Decode aliases for certain CPU-COMPANY combinations.
case $basic_machine in
	# Recognize the basic CPU types without company name.
	# Some are omitted here because they have special meanings below.
	1750a | 580 \
	| a29k \
	| alpha | alphaev[4-8] | alphaev56 | alphaev6[78] | alphapca5[67] \
	| alpha64 | alpha64ev[4-8] | alpha64ev56 | alpha64ev6[78] | alpha64pca5[67] \
	| am33_2.0 \
	| arc | arm | arm[bl]e | arme[lb] | armv[2345] | armv[345][lb] | avr | avr32 \
	| bfin \
	| c4x | clipper \
	| d10v | d30v | dlx | dsp16xx \
	| fido | fr30 | frv \
	| h8300 | h8500 | hppa | hppa1.[01] | hppa2.0 | hppa2.0[nw] | hppa64 \
	| i370 | i860 | i960 | ia64 \
	| ip2k | iq2000 \
	| lm32 \
	| m32c | m32r | m32rle | m68000 | m68k | m88k \
	| maxq | mb | microblaze | mcore | mep | metag \
	| mips | mipsbe | mipseb | mipsel | mipsle \
	| mips16 \
	| mips64 | mips64el \
	| mips64octeon | mips64octeonel \
	| mips64orion | mips64orionel \
	| mips64r5900 | mips64r5900el \
	| mips64vr | mips64vrel \
	| mips64vr4100 | mips64vr4100el \
	| mips64vr4300 | mips64vr4300el \
	| mips64vr5000 | mips64vr5000el \
	| mips64vr5900 | mips64vr5900el \
	| mipsisa32 | mipsisa32el \
	| mipsisa32r2 | mipsisa32r2el \
	| mipsisa64 | mipsisa64el \
	| mipsisa64r2 | mipsisa64r2el \
	| mipsisa64sb1 | mipsisa64sb1el \
	| mipsisa64sr71k | mipsisa64sr71kel \
	| mipstx39 | mipstx39el \
	| mn10200 | mn10300 \
	| moxie \
	| mt \
	| msp430 \
	| nios | nios2 \
	| ns16k | ns32k \
	| or32 \
	| pdp10 | pdp11 | pj | pjl \
	| powerpc | powerpc64 | powerpc64le | powerpcle | ppcbe \
	| pyramid \
	| score \
	| sh | sh[1234] | sh[24]a | sh[24]aeb | sh[23]e | sh[34]eb | sheb | shbe | shle | sh[1234]le | sh3ele \
	| sh64 | sh64le \
	| sparc | sparc64 | sparc64b | sparc64v | sparc86x | sparclet | sparclite \
	| sparcv8 | sparcv9 | sparcv9b | sparcv9v \
	| spu | strongarm \
	| tahoe | thumb | tic4x | tic80 | tron \
	| v850 | v850e \
	| we32k \
	| x86 | xc16x | xscale | xscalee[bl] | xstormy16 | xtensa \
	| z8k | z80)
		basic_machine=$basic_machine-unknown
		;;
	m6811 | m68hc11 | m6812 | m68hc12)
		# Motorola 68HC11/12.
		basic_machine=$basic_machine-unknown
		os=-none
		;;
	m88110 | m680[12346]0 | m683?2 | m68360 | m5200 | v70 | w65 | z8k)
		;;
	ms1)
		basic_machine=mt-unknown
		;;

	# We use `pc' rather than `unknown'
	# because (1) that's what they normally are, and
	# (2) the word "unknown" tends to confuse beginning users.
	i*86 | x86_64)
	  basic_machine=$basic_machine-pc
	  ;;
	# Object if more than one company name word.
	*-*-*)
		echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2
		exit 1
		;;
	# Recognize the basic CPU types with company name.
	580-* \
	| a29k-* \
	| alpha-* | alphaev[4-8]-* | alphaev56-* | alphaev6[78]-* \
	| alpha64-* | alpha64ev[4-8]-* | alpha64ev56-* | alpha64ev6[78]-* \
	| alphapca5[67]-* | alpha64pca5[67]-* | arc-* \
	| arm-*  | armbe-* | armle-* | armeb-* | armv*-* \
	| avr-* | avr32-* \
	| bfin-* | bs2000-* \
	| c[123]* | c30-* | [cjt]90-* | c4x-* | c54x-* | c55x-* | c6x-* \
	| clipper-* | craynv-* | cydra-* \
	| d10v-* | d30v-* | dlx-* \
	| elxsi-* \
	| f30[01]-* | f700-* | fido-* | fr30-* | frv-* | fx80-* \
	| h8300-* | h8500-* \
	| hppa-* | hppa1.[01]-* | hppa2.0-* | hppa2.0[nw]-* | hppa64-* \
	| i*86-* | i860-* | i960-* | ia64-* \
	| ip2k-* | iq2000-* \
	| lm32-* \
	| m32c-* | m32r-* | m32rle-* \
	| m68000-* | m680[012346]0-* | m68360-* | m683?2-* | m68k-* \
	| m88110-* | m88k-* | maxq-* | mcore-* | metag-* | microblaze-* \
	| mips-* | mipsbe-* | mipseb-* | mipsel-* | mipsle-* \
	| mips16-* \
	| mips64-* | mips64el-* \
	| mips64octeon-* | mips64octeonel-* \
	| mips64orion-* | mips64orionel-* \
	| mips64r5900-* | mips64r5900el-* \
	| mips64vr-* | mips64vrel-* \
	| mips64vr4100-* | mips64vr4100el-* \
	| mips64vr4300-* | mips64vr4300el-* \
	| mips64vr5000-* | mips64vr5000el-* \
	| mips64vr5900-* | mips64vr5900el-* \
	| mipsisa32-* | mipsisa32el-* \
	| mipsisa32r2-* | mipsisa32r2el-* \
	| mipsisa64-* | mipsisa64el-* \
	| mipsisa64r2-* | mipsisa64r2el-* \
	| mipsisa64sb1-* | mipsisa64sb1el-* \
	| mipsisa64sr71k-* | mipsisa64sr71kel-* \
	| mipstx39-* | mipstx39el-* \
	| mmix-* \
	| mt-* \
	| msp430-* \
	| nios-* | nios2-* \
	| none-* | np1-* | ns16k-* | ns32k-* \
	| orion-* \
	| pdp10-* | pdp11-* | pj-* | pjl-* | pn-* | power-* \
	| powerpc-* | powerpc64-* | powerpc64le-* | powerpcle-* | ppcbe-* \
	| pyramid-* \
	| romp-* | rs6000-* \
	| sh-* | sh[1234]-* | sh[24]a-* | sh[24]aeb-* | sh[23]e-* | sh[34]eb-* | sheb-* | shbe-* \
	| shle-* | sh[1234]le-* | sh3ele-* | sh64-* | sh64le-* \
	| sparc-* | sparc64-* | sparc64b-* | sparc64v-* | sparc86x-* | sparclet-* \
	| sparclite-* \
	| sparcv8-* | sparcv9-* | sparcv9b-* | sparcv9v-* | strongarm-* | sv1-* | sx?-* \
	| tahoe-* | thumb-* \
	| tic30-* | tic4x-* | tic54x-* | tic55x-* | tic6x-* | tic80-* | tile-* \
	| tron-* \
	| v850-* | v850e-* | vax-* \
	| we32k-* \
	| x86-* | x86_64-* | xc16x-* | xps100-* | xscale-* | xscalee[bl]-* \
	| xstormy16-* | xtensa*-* \
	| ymp-* \
	| z8k-* | z80-*)
		;;
	# Recognize the basic CPU types without company name, with glob match.
	xtensa*)
		basic_machine=$basic_machine-unknown
		;;
	# Recognize the various machine names and aliases which stand
	# for a CPU type and a company and sometimes even an OS.
	386bsd)
		basic_machine=i386-unknown
		os=-bsd
		;;
	3b1 | 7300 | 7300-att | att-7300 | pc7300 | safari | unixpc)
		basic_machine=m68000-att
		;;
	3b*)
		basic_machine=we32k-att
		;;
	a29khif)
		basic_machine=a29k-amd
		os=-udi
		;;
    	abacus)
		basic_machine=abacus-unknown
		;;
	adobe68k)
		basic_machine=m68010-adobe
		os=-scout
		;;
	alliant | fx80)
		basic_machine=fx80-alliant
		;;
	altos | altos3068)
		basic_machine=m68k-altos
		;;
	am29k)
		basic_machine=a29k-none
		os=-bsd
		;;
	amd64)
		basic_machine=x86_64-pc
		;;
	amd64-*)
		basic_machine=x86_64-`echo $basic_machine | sed 's/^[^-]*-//'`
		;;
	amdahl)
		basic_machine=580-amdahl
		os=-sysv
		;;
	amiga | amiga-*)
		basic_machine=m68k-unknown
		;;
	amigaos | amigados)
		basic_machine=m68k-unknown
		os=-amigaos
		;;
	amigaunix | amix)
		basic_machine=m68k-unknown
		os=-sysv4
		;;
	apollo68)
		basic_machine=m68k-apollo
		os=-sysv
		;;
	apollo68bsd)
		basic_machine=m68k-apollo
		os=-bsd
		;;
	aros)
		basic_machine=i386-pc
		os=-aros
		;;
	aux)
		basic_machine=m68k-apple
		os=-aux
		;;
	balance)
		basic_machine=ns32k-sequent
		os=-dynix
		;;
	blackfin)
		basic_machine=bfin-unknown
		os=-linux
		;;
	blackfin-*)
		basic_machine=bfin-`echo $basic_machine | sed 's/^[^-]*-//'`
		os=-linux
		;;
	bluegene*)
		basic_machine=powerpc-ibm
		os=-cnk
		;;
	c90)
		basic_machine=c90-cray
		os=-unicos
		;;
        cegcc)
		basic_machine=arm-unknown
		os=-cegcc
		;;
	convex-c1)
		basic_machine=c1-convex
		os=-bsd
		;;
	convex-c2)
		basic_machine=c2-convex
		os=-bsd
		;;
	convex-c32)
		basic_machine=c32-convex
		os=-bsd
		;;
	convex-c34)
		basic_machine=c34-convex
		os=-bsd
		;;
	convex-c38)
		basic_machine=c38-convex
		os=-bsd
		;;
	cray | j90)
		basic_machine=j90-cray
		os=-unicos
		;;
	craynv)
		basic_machine=craynv-cray
		os=-unicosmp
		;;
	cr16)
		basic_machine=cr16-unknown
		os=-elf
		;;
	crds | unos)
		basic_machine=m68k-crds
		;;
	crisv32 | crisv32-* | etraxfs*)
		basic_machine=crisv32-axis
		;;
	cris | cris-* | etrax*)
		basic_machine=cris-axis
		;;
	crx)
		basic_machine=crx-unknown
		os=-elf
		;;
	da30 | da30-*)
		basic_machine=m68k-da30
		;;
	decstation | decstation-3100 | pmax | pmax-* | pmin | dec3100 | decstatn)
		basic_machine=mips-dec
		;;
	decsystem10* | dec10*)
		basic_machine=pdp10-dec
		os=-tops10
		;;
	decsystem20* | dec20*)
		basic_machine=pdp10-dec
		os=-tops20
		;;
	delta | 3300 | motorola-3300 | motorola-delta \
	      | 3300-motorola | delta-motorola)
		basic_machine=m68k-motorola
		;;
	delta88)
		basic_machine=m88k-motorola
		os=-sysv3
		;;
	dicos)
		basic_machine=i686-pc
		os=-dicos
		;;
	djgpp)
		basic_machine=i586-pc
		os=-msdosdjgpp
		;;
	dpx20 | dpx20-*)
		basic_machine=rs6000-bull
		os=-bosx
		;;
	dpx2* | dpx2*-bull)
		basic_machine=m68k-bull
		os=-sysv3
		;;
	ebmon29k)
		basic_machine=a29k-amd
		os=-ebmon
		;;
	elxsi)
		basic_machine=elxsi-elxsi
		os=-bsd
		;;
	encore | umax | mmax)
		basic_machine=ns32k-encore
		;;
	es1800 | OSE68k | ose68k | ose | OSE)
		basic_machine=m68k-ericsson
		os=-ose
		;;
	fx2800)
		basic_machine=i860-alliant
		;;
	genix)
		basic_machine=ns32k-ns
		;;
	gmicro)
		basic_machine=tron-gmicro
		os=-sysv
		;;
	go32)
		basic_machine=i386-pc
		os=-go32
		;;
	h3050r* | hiux*)
		basic_machine=hppa1.1-hitachi
		os=-hiuxwe2
		;;
	h8300hms)
		basic_machine=h8300-hitachi
		os=-hms
		;;
	h8300xray)
		basic_machine=h8300-hitachi
		os=-xray
		;;
	h8500hms)
		basic_machine=h8500-hitachi
		os=-hms
		;;
	harris)
		basic_machine=m88k-harris
		os=-sysv3
		;;
	hp300-*)
		basic_machine=m68k-hp
		;;
	hp300bsd)
		basic_machine=m68k-hp
		os=-bsd
		;;
	hp300hpux)
		basic_machine=m68k-hp
		os=-hpux
		;;
	hp3k9[0-9][0-9] | hp9[0-9][0-9])
		basic_machine=hppa1.0-hp
		;;
	hp9k2[0-9][0-9] | hp9k31[0-9])
		basic_machine=m68000-hp
		;;
	hp9k3[2-9][0-9])
		basic_machine=m68k-hp
		;;
	hp9k6[0-9][0-9] | hp6[0-9][0-9])
		basic_machine=hppa1.0-hp
		;;
	hp9k7[0-79][0-9] | hp7[0-79][0-9])
		basic_machine=hppa1.1-hp
		;;
	hp9k78[0-9] | hp78[0-9])
		# FIXME: really hppa2.0-hp
		basic_machine=hppa1.1-hp
		;;
	hp9k8[67]1 | hp8[67]1 | hp9k80[24] | hp80[24] | hp9k8[78]9 | hp8[78]9 | hp9k893 | hp893)
		# FIXME: really hppa2.0-hp
		basic_machine=hppa1.1-hp
		;;
	hp9k8[0-9][13679] | hp8[0-9][13679])
		basic_machine=hppa1.1-hp
		;;
	hp9k8[0-9][0-9] | hp8[0-9][0-9])
		basic_machine=hppa1.0-hp
		;;
	hppa-next)
		os=-nextstep3
		;;
	hppaosf)
		basic_machine=hppa1.1-hp
		os=-osf
		;;
	hppro)
		basic_machine=hppa1.1-hp
		os=-proelf
		;;
	i370-ibm* | ibm*)
		basic_machine=i370-ibm
		;;
# I'm not sure what "Sysv32" means.  Should this be sysv3.2?
	i*86v32)
		basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'`
		os=-sysv32
		;;
	i*86v4*)
		basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'`
		os=-sysv4
		;;
	i*86v)
		basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'`
		os=-sysv
		;;
	i*86sol2)
		basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'`
		os=-solaris2
		;;
	i386mach)
		basic_machine=i386-mach
		os=-mach
		;;
	i386-vsta | vsta)
		basic_machine=i386-unknown
		os=-vsta
		;;
	iris | iris4d)
		basic_machine=mips-sgi
		case $os in
		    -irix*)
			;;
		    *)
			os=-irix4
			;;
		esac
		;;
	isi68 | isi)
		basic_machine=m68k-isi
		os=-sysv
		;;
	m68knommu)
		basic_machine=m68k-unknown
		os=-linux
		;;
	m68knommu-*)
		basic_machine=m68k-`echo $basic_machine | sed 's/^[^-]*-//'`
		os=-linux
		;;
	m88k-omron*)
		basic_machine=m88k-omron
		;;
	magnum | m3230)
		basic_machine=mips-mips
		os=-sysv
		;;
	merlin)
		basic_machine=ns32k-utek
		os=-sysv
		;;
        microblaze)
		basic_machine=microblaze-xilinx
		;;
	mingw32)
		basic_machine=i386-pc
		os=-mingw32
		;;
	mingw32ce)
		basic_machine=arm-unknown
		os=-mingw32ce
		;;
	miniframe)
		basic_machine=m68000-convergent
		;;
	*mint | -mint[0-9]* | *MiNT | *MiNT[0-9]*)
		basic_machine=m68k-atari
		os=-mint
		;;
	mips3*-*)
		basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'`
		;;
	mips3*)
		basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'`-unknown
		;;
	monitor)
		basic_machine=m68k-rom68k
		os=-coff
		;;
	morphos)
		basic_machine=powerpc-unknown
		os=-morphos
		;;
	msdos)
		basic_machine=i386-pc
		os=-msdos
		;;
	ms1-*)
		basic_machine=`echo $basic_machine | sed -e 's/ms1-/mt-/'`
		;;
	mvs)
		basic_machine=i370-ibm
		os=-mvs
		;;
	ncr3000)
		basic_machine=i486-ncr
		os=-sysv4
		;;
	netbsd386)
		basic_machine=i386-unknown
		os=-netbsd
		;;
	netwinder)
		basic_machine=armv4l-rebel
		os=-linux
		;;
	news | news700 | news800 | news900)
		basic_machine=m68k-sony
		os=-newsos
		;;
	news1000)
		basic_machine=m68030-sony
		os=-newsos
		;;
	news-3600 | risc-news)
		basic_machine=mips-sony
		os=-newsos
		;;
	necv70)
		basic_machine=v70-nec
		os=-sysv
		;;
	next | m*-next )
		basic_machine=m68k-next
		case $os in
		    -nextstep* )
			;;
		    -ns2*)
		      os=-nextstep2
			;;
		    *)
		      os=-nextstep3
			;;
		esac
		;;
	nh3000)
		basic_machine=m68k-harris
		os=-cxux
		;;
	nh[45]000)
		basic_machine=m88k-harris
		os=-cxux
		;;
	nindy960)
		basic_machine=i960-intel
		os=-nindy
		;;
	mon960)
		basic_machine=i960-intel
		os=-mon960
		;;
	nonstopux)
		basic_machine=mips-compaq
		os=-nonstopux
		;;
	np1)
		basic_machine=np1-gould
		;;
	nsr-tandem)
		basic_machine=nsr-tandem
		;;
	op50n-* | op60c-*)
		basic_machine=hppa1.1-oki
		os=-proelf
		;;
	openrisc | openrisc-*)
		basic_machine=or32-unknown
		;;
	os400)
		basic_machine=powerpc-ibm
		os=-os400
		;;
	OSE68000 | ose68000)
		basic_machine=m68000-ericsson
		os=-ose
		;;
	os68k)
		basic_machine=m68k-none
		os=-os68k
		;;
	pa-hitachi)
		basic_machine=hppa1.1-hitachi
		os=-hiuxwe2
		;;
	paragon)
		basic_machine=i860-intel
		os=-osf
		;;
	parisc)
		basic_machine=hppa-unknown
		os=-linux
		;;
	parisc-*)
		basic_machine=hppa-`echo $basic_machine | sed 's/^[^-]*-//'`
		os=-linux
		;;
	pbd)
		basic_machine=sparc-tti
		;;
	pbb)
		basic_machine=m68k-tti
		;;
	pc532 | pc532-*)
		basic_machine=ns32k-pc532
		;;
	pc98)
		basic_machine=i386-pc
		;;
	pc98-*)
		basic_machine=i386-`echo $basic_machine | sed 's/^[^-]*-//'`
		;;
	pentium | p5 | k5 | k6 | nexgen | viac3)
		basic_machine=i586-pc
		;;
	pentiumpro | p6 | 6x86 | athlon | athlon_*)
		basic_machine=i686-pc
		;;
	pentiumii | pentium2 | pentiumiii | pentium3)
		basic_machine=i686-pc
		;;
	pentium4)
		basic_machine=i786-pc
		;;
	pentium-* | p5-* | k5-* | k6-* | nexgen-* | viac3-*)
		basic_machine=i586-`echo $basic_machine | sed 's/^[^-]*-//'`
		;;
	pentiumpro-* | p6-* | 6x86-* | athlon-*)
		basic_machine=i686-`echo $basic_machine | sed 's/^[^-]*-//'`
		;;
	pentiumii-* | pentium2-* | pentiumiii-* | pentium3-*)
		basic_machine=i686-`echo $basic_machine | sed 's/^[^-]*-//'`
		;;
	pentium4-*)
		basic_machine=i786-`echo $basic_machine | sed 's/^[^-]*-//'`
		;;
	pn)
		basic_machine=pn-gould
		;;
	power)	basic_machine=power-ibm
		;;
	ppc)	basic_machine=powerpc-unknown
		;;
	ppc-*)	basic_machine=powerpc-`echo $basic_machine | sed 's/^[^-]*-//'`
		;;
	ppcle | powerpclittle | ppc-le | powerpc-little)
		basic_machine=powerpcle-unknown
		;;
	ppcle-* | powerpclittle-*)
		basic_machine=powerpcle-`echo $basic_machine | sed 's/^[^-]*-//'`
		;;
	ppc64)	basic_machine=powerpc64-unknown
		;;
	ppc64-*) basic_machine=powerpc64-`echo $basic_machine | sed 's/^[^-]*-//'`
		;;
	ppc64le | powerpc64little | ppc64-le | powerpc64-little)
		basic_machine=powerpc64le-unknown
		;;
	ppc64le-* | powerpc64little-*)
		basic_machine=powerpc64le-`echo $basic_machine | sed 's/^[^-]*-//'`
		;;
	ps2)
		basic_machine=i386-ibm
		;;
	pw32)
		basic_machine=i586-unknown
		os=-pw32
		;;
	rdos)
		basic_machine=i386-pc
		os=-rdos
		;;
	rom68k)
		basic_machine=m68k-rom68k
		os=-coff
		;;
	rm[46]00)
		basic_machine=mips-siemens
		;;
	rtpc | rtpc-*)
		basic_machine=romp-ibm
		;;
	s390 | s390-*)
		basic_machine=s390-ibm
		;;
	s390x | s390x-*)
		basic_machine=s390x-ibm
		;;
	sa29200)
		basic_machine=a29k-amd
		os=-udi
		;;
	sb1)
		basic_machine=mipsisa64sb1-unknown
		;;
	sb1el)
		basic_machine=mipsisa64sb1el-unknown
		;;
	sde)
		basic_machine=mipsisa32-sde
		os=-elf
		;;
	sei)
		basic_machine=mips-sei
		os=-seiux
		;;
	sequent)
		basic_machine=i386-sequent
		;;
	sh)
		basic_machine=sh-hitachi
		os=-hms
		;;
	sh5el)
		basic_machine=sh5le-unknown
		;;
	sh64)
		basic_machine=sh64-unknown
		;;
	sparclite-wrs | simso-wrs)
		basic_machine=sparclite-wrs
		os=-vxworks
		;;
	sps7)
		basic_machine=m68k-bull
		os=-sysv2
		;;
	spur)
		basic_machine=spur-unknown
		;;
	st2000)
		basic_machine=m68k-tandem
		;;
	stratus)
		basic_machine=i860-stratus
		os=-sysv4
		;;
	sun2)
		basic_machine=m68000-sun
		;;
	sun2os3)
		basic_machine=m68000-sun
		os=-sunos3
		;;
	sun2os4)
		basic_machine=m68000-sun
		os=-sunos4
		;;
	sun3os3)
		basic_machine=m68k-sun
		os=-sunos3
		;;
	sun3os4)
		basic_machine=m68k-sun
		os=-sunos4
		;;
	sun4os3)
		basic_machine=sparc-sun
		os=-sunos3
		;;
	sun4os4)
		basic_machine=sparc-sun
		os=-sunos4
		;;
	sun4sol2)
		basic_machine=sparc-sun
		os=-solaris2
		;;
	sun3 | sun3-*)
		basic_machine=m68k-sun
		;;
	sun4)
		basic_machine=sparc-sun
		;;
	sun386 | sun386i | roadrunner)
		basic_machine=i386-sun
		;;
	sv1)
		basic_machine=sv1-cray
		os=-unicos
		;;
	symmetry)
		basic_machine=i386-sequent
		os=-dynix
		;;
	t3e)
		basic_machine=alphaev5-cray
		os=-unicos
		;;
	t90)
		basic_machine=t90-cray
		os=-unicos
		;;
	tic54x | c54x*)
		basic_machine=tic54x-unknown
		os=-coff
		;;
	tic55x | c55x*)
		basic_machine=tic55x-unknown
		os=-coff
		;;
	tic6x | c6x*)
		basic_machine=tic6x-unknown
		os=-coff
		;;
	tile*)
		basic_machine=tile-unknown
		os=-linux-gnu
		;;
	tx39)
		basic_machine=mipstx39-unknown
		;;
	tx39el)
		basic_machine=mipstx39el-unknown
		;;
	toad1)
		basic_machine=pdp10-xkl
		os=-tops20
		;;
	tower | tower-32)
		basic_machine=m68k-ncr
		;;
	tpf)
		basic_machine=s390x-ibm
		os=-tpf
		;;
	udi29k)
		basic_machine=a29k-amd
		os=-udi
		;;
	ultra3)
		basic_machine=a29k-nyu
		os=-sym1
		;;
	v810 | necv810)
		basic_machine=v810-nec
		os=-none
		;;
	vaxv)
		basic_machine=vax-dec
		os=-sysv
		;;
	vms)
		basic_machine=vax-dec
		os=-vms
		;;
	vpp*|vx|vx-*)
		basic_machine=f301-fujitsu
		;;
	vxworks960)
		basic_machine=i960-wrs
		os=-vxworks
		;;
	vxworks68)
		basic_machine=m68k-wrs
		os=-vxworks
		;;
	vxworks29k)
		basic_machine=a29k-wrs
		os=-vxworks
		;;
	w65*)
		basic_machine=w65-wdc
		os=-none
		;;
	w89k-*)
		basic_machine=hppa1.1-winbond
		os=-proelf
		;;
	xbox)
		basic_machine=i686-pc
		os=-mingw32
		;;
	xps | xps100)
		basic_machine=xps100-honeywell
		;;
	ymp)
		basic_machine=ymp-cray
		os=-unicos
		;;
	z8k-*-coff)
		basic_machine=z8k-unknown
		os=-sim
		;;
	z80-*-coff)
		basic_machine=z80-unknown
		os=-sim
		;;
	none)
		basic_machine=none-none
		os=-none
		;;

# Here we handle the default manufacturer of certain CPU types.  It is in
# some cases the only manufacturer, in others, it is the most popular.
	w89k)
		basic_machine=hppa1.1-winbond
		;;
	op50n)
		basic_machine=hppa1.1-oki
		;;
	op60c)
		basic_machine=hppa1.1-oki
		;;
	romp)
		basic_machine=romp-ibm
		;;
	mmix)
		basic_machine=mmix-knuth
		;;
	rs6000)
		basic_machine=rs6000-ibm
		;;
	vax)
		basic_machine=vax-dec
		;;
	pdp10)
		# there are many clones, so DEC is not a safe bet
		basic_machine=pdp10-unknown
		;;
	pdp11)
		basic_machine=pdp11-dec
		;;
	we32k)
		basic_machine=we32k-att
		;;
	sh[1234] | sh[24]a | sh[24]aeb | sh[34]eb | sh[1234]le | sh[23]ele)
		basic_machine=sh-unknown
		;;
	sparc | sparcv8 | sparcv9 | sparcv9b | sparcv9v)
		basic_machine=sparc-sun
		;;
	cydra)
		basic_machine=cydra-cydrome
		;;
	orion)
		basic_machine=orion-highlevel
		;;
	orion105)
		basic_machine=clipper-highlevel
		;;
	mac | mpw | mac-mpw)
		basic_machine=m68k-apple
		;;
	pmac | pmac-mpw)
		basic_machine=powerpc-apple
		;;
	*-unknown)
		# Make sure to match an already-canonicalized machine name.
		;;
	*)
		echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2
		exit 1
		;;
esac

# Here we canonicalize certain aliases for manufacturers.
case $basic_machine in
	*-digital*)
		basic_machine=`echo $basic_machine | sed 's/digital.*/dec/'`
		;;
	*-commodore*)
		basic_machine=`echo $basic_machine | sed 's/commodore.*/cbm/'`
		;;
	*)
		;;
esac

# Decode manufacturer-specific aliases for certain operating systems.

if [ x"$os" != x"" ]
then
case $os in
        # First match some system type aliases
        # that might get confused with valid system types.
	# -solaris* is a basic system type, with this one exception.
	-solaris1 | -solaris1.*)
		os=`echo $os | sed -e 's|solaris1|sunos4|'`
		;;
	-solaris)
		os=-solaris2
		;;
	-svr4*)
		os=-sysv4
		;;
	-unixware*)
		os=-sysv4.2uw
		;;
	-gnu/linux*)
		os=`echo $os | sed -e 's|gnu/linux|linux-gnu|'`
		;;
	# First accept the basic system types.
	# The portable systems comes first.
	# Each alternative MUST END IN A *, to match a version number.
	# -sysv* is not here because it comes later, after sysvr4.
	-gnu* | -bsd* | -mach* | -minix* | -genix* | -ultrix* | -irix* \
	      | -*vms* | -sco* | -esix* | -isc* | -aix* | -cnk* | -sunos | -sunos[34]*\
	      | -hpux* | -unos* | -osf* | -luna* | -dgux* | -solaris* | -sym* \
	      | -kopensolaris* \
	      | -amigaos* | -amigados* | -msdos* | -newsos* | -unicos* | -aof* \
	      | -aos* | -aros* \
	      | -nindy* | -vxsim* | -vxworks* | -ebmon* | -hms* | -mvs* \
	      | -clix* | -riscos* | -uniplus* | -iris* | -rtu* | -xenix* \
	      | -hiux* | -386bsd* | -knetbsd* | -mirbsd* | -netbsd* \
	      | -openbsd* | -solidbsd* \
	      | -ekkobsd* | -kfreebsd* | -freebsd* | -riscix* | -lynxos* \
	      | -bosx* | -nextstep* | -cxux* | -aout* | -elf* | -oabi* \
	      | -ptx* | -coff* | -ecoff* | -winnt* | -domain* | -vsta* \
	      | -udi* | -eabi* | -lites* | -ieee* | -go32* | -aux* \
	      | -chorusos* | -chorusrdb* | -cegcc* \
	      | -cygwin* | -pe* | -psos* | -moss* | -proelf* | -rtems* \
	      | -mingw32* | -linux-gnu* | -linux-newlib* | -linux-uclibc* \
	      | -uxpv* | -beos* | -mpeix* | -udk* \
	      | -interix* | -uwin* | -mks* | -rhapsody* | -darwin* | -opened* \
	      | -openstep* | -oskit* | -conix* | -pw32* | -nonstopux* \
	      | -storm-chaos* | -tops10* | -tenex* | -tops20* | -its* \
	      | -os2* | -vos* | -palmos* | -uclinux* | -nucleus* \
	      | -morphos* | -superux* | -rtmk* | -rtmk-nova* | -windiss* \
	      | -powermax* | -dnix* | -nx6 | -nx7 | -sei* | -dragonfly* \
	      | -skyos* | -haiku* | -rdos* | -toppers* | -drops*)
	# Remember, each alternative MUST END IN *, to match a version number.
		;;
	-qnx*)
		case $basic_machine in
		    x86-* | i*86-*)
			;;
		    *)
			os=-nto$os
			;;
		esac
		;;
	-nto-qnx*)
		;;
	-nto*)
		os=`echo $os | sed -e 's|nto|nto-qnx|'`
		;;
	-sim | -es1800* | -hms* | -xray | -os68k* | -none* | -v88r* \
	      | -windows* | -osx | -abug | -netware* | -os9* | -beos* | -haiku* \
	      | -macos* | -mpw* | -magic* | -mmixware* | -mon960* | -lnews*)
		;;
	-mac*)
		os=`echo $os | sed -e 's|mac|macos|'`
		;;
	-linux-dietlibc)
		os=-linux-dietlibc
		;;
	-linux*)
		os=`echo $os | sed -e 's|linux|linux-gnu|'`
		;;
	-sunos5*)
		os=`echo $os | sed -e 's|sunos5|solaris2|'`
		;;
	-sunos6*)
		os=`echo $os | sed -e 's|sunos6|solaris3|'`
		;;
	-opened*)
		os=-openedition
		;;
        -os400*)
		os=-os400
		;;
	-wince*)
		os=-wince
		;;
	-osfrose*)
		os=-osfrose
		;;
	-osf*)
		os=-osf
		;;
	-utek*)
		os=-bsd
		;;
	-dynix*)
		os=-bsd
		;;
	-acis*)
		os=-aos
		;;
	-atheos*)
		os=-atheos
		;;
	-syllable*)
		os=-syllable
		;;
	-386bsd)
		os=-bsd
		;;
	-ctix* | -uts*)
		os=-sysv
		;;
	-nova*)
		os=-rtmk-nova
		;;
	-ns2 )
		os=-nextstep2
		;;
	-nsk*)
		os=-nsk
		;;
	# Preserve the version number of sinix5.
	-sinix5.*)
		os=`echo $os | sed -e 's|sinix|sysv|'`
		;;
	-sinix*)
		os=-sysv4
		;;
        -tpf*)
		os=-tpf
		;;
	-triton*)
		os=-sysv3
		;;
	-oss*)
		os=-sysv3
		;;
	-svr4)
		os=-sysv4
		;;
	-svr3)
		os=-sysv3
		;;
	-sysvr4)
		os=-sysv4
		;;
	# This must come after -sysvr4.
	-sysv*)
		;;
	-ose*)
		os=-ose
		;;
	-es1800*)
		os=-ose
		;;
	-xenix)
		os=-xenix
		;;
	-*mint | -mint[0-9]* | -*MiNT | -MiNT[0-9]*)
		os=-mint
		;;
	-aros*)
		os=-aros
		;;
	-kaos*)
		os=-kaos
		;;
	-zvmoe)
		os=-zvmoe
		;;
	-dicos*)
		os=-dicos
		;;
	-none)
		;;
	*)
		# Get rid of the `-' at the beginning of $os.
		os=`echo $os | sed 's/[^-]*-//'`
		echo Invalid configuration \`$1\': system \`$os\' not recognized 1>&2
		exit 1
		;;
esac
else

# Here we handle the default operating systems that come with various machines.
# The value should be what the vendor currently ships out the door with their
# machine or put another way, the most popular os provided with the machine.

# Note that if you're going to try to match "-MANUFACTURER" here (say,
# "-sun"), then you have to tell the case statement up towards the top
# that MANUFACTURER isn't an operating system.  Otherwise, code above
# will signal an error saying that MANUFACTURER isn't an operating
# system, and we'll never get to this point.

case $basic_machine in
        score-*)
		os=-elf
		;;
        spu-*)
		os=-elf
		;;
	*-acorn)
		os=-riscix1.2
		;;
	arm*-rebel)
		os=-linux
		;;
	arm*-semi)
		os=-aout
		;;
        c4x-* | tic4x-*)
        	os=-coff
		;;
	# This must come before the *-dec entry.
	pdp10-*)
		os=-tops20
		;;
	pdp11-*)
		os=-none
		;;
	*-dec | vax-*)
		os=-ultrix4.2
		;;
	m68*-apollo)
		os=-domain
		;;
	i386-sun)
		os=-sunos4.0.2
		;;
	m68000-sun)
		os=-sunos3
		# This also exists in the configure program, but was not the
		# default.
		# os=-sunos4
		;;
	m68*-cisco)
		os=-aout
		;;
        mep-*)
		os=-elf
		;;
	mips*-cisco)
		os=-elf
		;;
	mips*-*)
		os=-elf
		;;
	or32-*)
		os=-coff
		;;
	*-tti)	# must be before sparc entry or we get the wrong os.
		os=-sysv3
		;;
	sparc-* | *-sun)
		os=-sunos4.1.1
		;;
	*-be)
		os=-beos
		;;
	*-haiku)
		os=-haiku
		;;
	*-ibm)
		os=-aix
		;;
    	*-knuth)
		os=-mmixware
		;;
	*-wec)
		os=-proelf
		;;
	*-winbond)
		os=-proelf
		;;
	*-oki)
		os=-proelf
		;;
	*-hp)
		os=-hpux
		;;
	*-hitachi)
		os=-hiux
		;;
	i860-* | *-att | *-ncr | *-altos | *-motorola | *-convergent)
		os=-sysv
		;;
	*-cbm)
		os=-amigaos
		;;
	*-dg)
		os=-dgux
		;;
	*-dolphin)
		os=-sysv3
		;;
	m68k-ccur)
		os=-rtu
		;;
	m88k-omron*)
		os=-luna
		;;
	*-next )
		os=-nextstep
		;;
	*-sequent)
		os=-ptx
		;;
	*-crds)
		os=-unos
		;;
	*-ns)
		os=-genix
		;;
	i370-*)
		os=-mvs
		;;
	*-next)
		os=-nextstep3
		;;
	*-gould)
		os=-sysv
		;;
	*-highlevel)
		os=-bsd
		;;
	*-encore)
		os=-bsd
		;;
	*-sgi)
		os=-irix
		;;
	*-siemens)
		os=-sysv4
		;;
	*-masscomp)
		os=-rtu
		;;
	f30[01]-fujitsu | f700-fujitsu)
		os=-uxpv
		;;
	*-rom68k)
		os=-coff
		;;
	*-*bug)
		os=-coff
		;;
	*-apple)
		os=-macos
		;;
	*-atari*)
		os=-mint
		;;
	*)
		os=-none
		;;
esac
fi

# Here we handle the case where we know the os, and the CPU type, but not the
# manufacturer.  We pick the logical manufacturer.
vendor=unknown
case $basic_machine in
	*-unknown)
		case $os in
			-riscix*)
				vendor=acorn
				;;
			-sunos*)
				vendor=sun
				;;
			-cnk*|-aix*)
				vendor=ibm
				;;
			-beos*)
				vendor=be
				;;
			-hpux*)
				vendor=hp
				;;
			-mpeix*)
				vendor=hp
				;;
			-hiux*)
				vendor=hitachi
				;;
			-unos*)
				vendor=crds
				;;
			-dgux*)
				vendor=dg
				;;
			-luna*)
				vendor=omron
				;;
			-genix*)
				vendor=ns
				;;
			-mvs* | -opened*)
				vendor=ibm
				;;
			-os400*)
				vendor=ibm
				;;
			-ptx*)
				vendor=sequent
				;;
			-tpf*)
				vendor=ibm
				;;
			-vxsim* | -vxworks* | -windiss*)
				vendor=wrs
				;;
			-aux*)
				vendor=apple
				;;
			-hms*)
				vendor=hitachi
				;;
			-mpw* | -macos*)
				vendor=apple
				;;
			-*mint | -mint[0-9]* | -*MiNT | -MiNT[0-9]*)
				vendor=atari
				;;
			-vos*)
				vendor=stratus
				;;
		esac
		basic_machine=`echo $basic_machine | sed "s/unknown/$vendor/"`
		;;
esac

echo $basic_machine$os
exit

# Local variables:
# eval: (add-hook 'write-file-hooks 'time-stamp)
# time-stamp-start: "timestamp='"
# time-stamp-format: "%:y-%02m-%02d"
# time-stamp-end: "'"
# End:
x264-snapshot-20120103-2245-stable/config.guess
#! /bin/sh
# Attempt to guess a canonical system name.
#   Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
#   2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
#   Free Software Foundation, Inc.

timestamp='2009-09-18'

# This file is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
# 02110-1301, USA.
#
# As a special exception to the GNU General Public License, if you
# distribute this file as part of a program that contains a
# configuration script generated by Autoconf, you may include it under
# the same distribution terms that you use for the rest of that program.


# Originally written by Per Bothner.  Please send patches (context
# diff format) to <config-patches@gnu.org> and include a ChangeLog
# entry.
#
# This script attempts to guess a canonical system name similar to
# config.sub.  If it succeeds, it prints the system name on stdout, and
# exits with 0.  Otherwise, it exits with 1.
#
# You can get the latest version of this script from:
# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess;hb=HEAD

me=`echo "$0" | sed -e 's,.*/,,'`

usage="\
Usage: $0 [OPTION]

Output the configuration name of the system \`$me' is run on.

Operation modes:
  -h, --help         print this help, then exit
  -t, --time-stamp   print date of last modification, then exit
  -v, --version      print version number, then exit

Report bugs and patches to <config-patches@gnu.org>."

version="\
GNU config.guess ($timestamp)

Originally written by Per Bothner.
Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
2002, 2003, 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc.

This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."

help="
Try \`$me --help' for more information."

# Parse command line
while test $# -gt 0 ; do
  case $1 in
    --time-stamp | --time* | -t )
       echo "$timestamp" ; exit ;;
    --version | -v )
       echo "$version" ; exit ;;
    --help | --h* | -h )
       echo "$usage"; exit ;;
    -- )     # Stop option processing
       shift; break ;;
    - )	# Use stdin as input.
       break ;;
    -* )
       echo "$me: invalid option $1$help" >&2
       exit 1 ;;
    * )
       break ;;
  esac
done

if test $# != 0; then
  echo "$me: too many arguments$help" >&2
  exit 1
fi

trap 'exit 1' 1 2 15

# CC_FOR_BUILD -- compiler used by this script. Note that the use of a
# compiler to aid in system detection is discouraged as it requires
# temporary files to be created and, as you can see below, it is a
# headache to deal with in a portable fashion.

# Historically, `CC_FOR_BUILD' used to be named `HOST_CC'. We still
# use `HOST_CC' if defined, but it is deprecated.

# Portable tmp directory creation inspired by the Autoconf team.

set_cc_for_build='
trap "exitcode=\$?; (rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null) && exit \$exitcode" 0 ;
trap "rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null; exit 1" 1 2 13 15 ;
: ${TMPDIR=/tmp} ;
 { tmp=`(umask 077 && mktemp -d "$TMPDIR/cgXXXXXX") 2>/dev/null` && test -n "$tmp" && test -d "$tmp" ; } ||
 { test -n "$RANDOM" && tmp=$TMPDIR/cg$$-$RANDOM && (umask 077 && mkdir $tmp) ; } ||
 { tmp=$TMPDIR/cg-$$ && (umask 077 && mkdir $tmp) && echo "Warning: creating insecure temp directory" >&2 ; } ||
 { echo "$me: cannot create a temporary directory in $TMPDIR" >&2 ; exit 1 ; } ;
dummy=$tmp/dummy ;
tmpfiles="$dummy.c $dummy.o $dummy.rel $dummy" ;
case $CC_FOR_BUILD,$HOST_CC,$CC in
 ,,)    echo "int x;" > $dummy.c ;
	for c in cc gcc c89 c99 ; do
	  if ($c -c -o $dummy.o $dummy.c) >/dev/null 2>&1 ; then
	     CC_FOR_BUILD="$c"; break ;
	  fi ;
	done ;
	if test x"$CC_FOR_BUILD" = x ; then
	  CC_FOR_BUILD=no_compiler_found ;
	fi
	;;
 ,,*)   CC_FOR_BUILD=$CC ;;
 ,*,*)  CC_FOR_BUILD=$HOST_CC ;;
esac ; set_cc_for_build= ;'

# This is needed to find uname on a Pyramid OSx when run in the BSD universe.
# (ghazi@noc.rutgers.edu 1994-08-24)
if (test -f /.attbin/uname) >/dev/null 2>&1 ; then
	PATH=$PATH:/.attbin ; export PATH
fi

UNAME_MACHINE=`(uname -m) 2>/dev/null` || UNAME_MACHINE=unknown
UNAME_RELEASE=`(uname -r) 2>/dev/null` || UNAME_RELEASE=unknown
UNAME_SYSTEM=`(uname -s) 2>/dev/null`  || UNAME_SYSTEM=unknown
UNAME_VERSION=`(uname -v) 2>/dev/null` || UNAME_VERSION=unknown

# Note: order is significant - the case branches are not exclusive.

case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
    *:NetBSD:*:*)
	# NetBSD (nbsd) targets should (where applicable) match one or
	# more of the tupples: *-*-netbsdelf*, *-*-netbsdaout*,
	# *-*-netbsdecoff* and *-*-netbsd*.  For targets that recently
	# switched to ELF, *-*-netbsd* would select the old
	# object file format.  This provides both forward
	# compatibility and a consistent mechanism for selecting the
	# object file format.
	#
	# Note: NetBSD doesn't particularly care about the vendor
	# portion of the name.  We always set it to "unknown".
	sysctl="sysctl -n hw.machine_arch"
	UNAME_MACHINE_ARCH=`(/sbin/$sysctl 2>/dev/null || \
	    /usr/sbin/$sysctl 2>/dev/null || echo unknown)`
	case "${UNAME_MACHINE_ARCH}" in
	    armeb) machine=armeb-unknown ;;
	    arm*) machine=arm-unknown ;;
	    sh3el) machine=shl-unknown ;;
	    sh3eb) machine=sh-unknown ;;
	    sh5el) machine=sh5le-unknown ;;
	    *) machine=${UNAME_MACHINE_ARCH}-unknown ;;
	esac
	# The Operating System including object format, if it has switched
	# to ELF recently, or will in the future.
	case "${UNAME_MACHINE_ARCH}" in
	    arm*|i386|m68k|ns32k|sh3*|sparc|vax)
		eval $set_cc_for_build
		if echo __ELF__ | $CC_FOR_BUILD -E - 2>/dev/null \
			| grep -q __ELF__
		then
		    # Either ECOFF (netbsdecoff) or a.out (netbsdaout);
		    # return netbsd for either.  FIX?
		    os=netbsd
		else
		    os=netbsdelf
		fi
		;;
	    *)
	        os=netbsd
		;;
	esac
	# The OS release
	# Debian GNU/NetBSD machines have a different userland, and
	# thus, need a distinct triplet. However, they do not need
	# kernel version information, so it can be replaced with a
	# suitable tag, in the style of linux-gnu.
	case "${UNAME_VERSION}" in
	    Debian*)
		release='-gnu'
		;;
	    *)
		release=`echo ${UNAME_RELEASE}|sed -e 's/[-_].*/\./'`
		;;
	esac
	# Since CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM:
	# contains redundant information, the shorter form:
	# CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM is used.
	echo "${machine}-${os}${release}"
	exit ;;
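    # Worked example (illustrative): an ELF i386 machine with UNAME_RELEASE
    # "5.1" takes the netbsdelf branch and prints "i386-unknown-netbsdelf5.1";
    # a Debian userland substitutes "-gnu" for the kernel version instead.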
    *:OpenBSD:*:*)
	UNAME_MACHINE_ARCH=`arch | sed 's/OpenBSD.//'`
	echo ${UNAME_MACHINE_ARCH}-unknown-openbsd${UNAME_RELEASE}
	exit ;;
    *:ekkoBSD:*:*)
	echo ${UNAME_MACHINE}-unknown-ekkobsd${UNAME_RELEASE}
	exit ;;
    *:SolidBSD:*:*)
	echo ${UNAME_MACHINE}-unknown-solidbsd${UNAME_RELEASE}
	exit ;;
    macppc:MirBSD:*:*)
	echo powerpc-unknown-mirbsd${UNAME_RELEASE}
	exit ;;
    *:MirBSD:*:*)
	echo ${UNAME_MACHINE}-unknown-mirbsd${UNAME_RELEASE}
	exit ;;
    alpha:OSF1:*:*)
	case $UNAME_RELEASE in
	*4.0)
		UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $3}'`
		;;
	*5.*)
	        UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $4}'`
		;;
	esac
	# According to Compaq, /usr/sbin/psrinfo has been available on
	# OSF/1 and Tru64 systems produced since 1995.  I hope that
	# covers most systems running today.  This code pipes the CPU
	# types through head -n 1, so we only detect the type of CPU 0.
	ALPHA_CPU_TYPE=`/usr/sbin/psrinfo -v | sed -n -e 's/^  The alpha \(.*\) processor.*$/\1/p' | head -n 1`
	case "$ALPHA_CPU_TYPE" in
	    "EV4 (21064)")
		UNAME_MACHINE="alpha" ;;
	    "EV4.5 (21064)")
		UNAME_MACHINE="alpha" ;;
	    "LCA4 (21066/21068)")
		UNAME_MACHINE="alpha" ;;
	    "EV5 (21164)")
		UNAME_MACHINE="alphaev5" ;;
	    "EV5.6 (21164A)")
		UNAME_MACHINE="alphaev56" ;;
	    "EV5.6 (21164PC)")
		UNAME_MACHINE="alphapca56" ;;
	    "EV5.7 (21164PC)")
		UNAME_MACHINE="alphapca57" ;;
	    "EV6 (21264)")
		UNAME_MACHINE="alphaev6" ;;
	    "EV6.7 (21264A)")
		UNAME_MACHINE="alphaev67" ;;
	    "EV6.8CB (21264C)")
		UNAME_MACHINE="alphaev68" ;;
	    "EV6.8AL (21264B)")
		UNAME_MACHINE="alphaev68" ;;
	    "EV6.8CX (21264D)")
		UNAME_MACHINE="alphaev68" ;;
	    "EV6.9A (21264/EV69A)")
		UNAME_MACHINE="alphaev69" ;;
	    "EV7 (21364)")
		UNAME_MACHINE="alphaev7" ;;
	    "EV7.9 (21364A)")
		UNAME_MACHINE="alphaev79" ;;
	esac
	# A Pn.n version is a patched version.
	# A Vn.n version is a released version.
	# A Tn.n version is a released field test version.
	# A Xn.n version is an unreleased experimental baselevel.
	# 1.2 uses "1.2" for uname -r.
	echo ${UNAME_MACHINE}-dec-osf`echo ${UNAME_RELEASE} | sed -e 's/^[PVTX]//' | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'`
	exit ;;
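    # e.g. a sizer-reported release of "V5.1" on an EV6 machine is printed as
    # "alphaev6-dec-osf5.1": the leading status letter is stripped and the
    # remainder lowercased (illustrative).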
    Alpha\ *:Windows_NT*:*)
	# How do we know it's Interix rather than the generic POSIX subsystem?
	# Should we change UNAME_MACHINE based on the output of uname instead
	# of the specific Alpha model?
	echo alpha-pc-interix
	exit ;;
    21064:Windows_NT:50:3)
	echo alpha-dec-winnt3.5
	exit ;;
    Amiga*:UNIX_System_V:4.0:*)
	echo m68k-unknown-sysv4
	exit ;;
    *:[Aa]miga[Oo][Ss]:*:*)
	echo ${UNAME_MACHINE}-unknown-amigaos
	exit ;;
    *:[Mm]orph[Oo][Ss]:*:*)
	echo ${UNAME_MACHINE}-unknown-morphos
	exit ;;
    *:OS/390:*:*)
	echo i370-ibm-openedition
	exit ;;
    *:z/VM:*:*)
	echo s390-ibm-zvmoe
	exit ;;
    *:OS400:*:*)
        echo powerpc-ibm-os400
	exit ;;
    arm:RISC*:1.[012]*:*|arm:riscix:1.[012]*:*)
	echo arm-acorn-riscix${UNAME_RELEASE}
	exit ;;
    arm:riscos:*:*|arm:RISCOS:*:*)
	echo arm-unknown-riscos
	exit ;;
    SR2?01:HI-UX/MPP:*:* | SR8000:HI-UX/MPP:*:*)
	echo hppa1.1-hitachi-hiuxmpp
	exit ;;
    Pyramid*:OSx*:*:* | MIS*:OSx*:*:* | MIS*:SMP_DC-OSx*:*:*)
	# akee@wpdis03.wpafb.af.mil (Earle F. Ake) contributed MIS and NILE.
	if test "`(/bin/universe) 2>/dev/null`" = att ; then
		echo pyramid-pyramid-sysv3
	else
		echo pyramid-pyramid-bsd
	fi
	exit ;;
    NILE*:*:*:dcosx)
	echo pyramid-pyramid-svr4
	exit ;;
    DRS?6000:unix:4.0:6*)
	echo sparc-icl-nx6
	exit ;;
    DRS?6000:UNIX_SV:4.2*:7* | DRS?6000:isis:4.2*:7*)
	case `/usr/bin/uname -p` in
	    sparc) echo sparc-icl-nx7; exit ;;
	esac ;;
    s390x:SunOS:*:*)
	echo ${UNAME_MACHINE}-ibm-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
	exit ;;
    sun4H:SunOS:5.*:*)
	echo sparc-hal-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
	exit ;;
    sun4*:SunOS:5.*:* | tadpole*:SunOS:5.*:*)
	echo sparc-sun-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
	exit ;;
    i86pc:SunOS:5.*:* | i86xen:SunOS:5.*:*)
	eval $set_cc_for_build
	SUN_ARCH="i386"
	# If there is a compiler, see if it is configured for 64-bit objects.
	# Note that the Sun cc does not turn __LP64__ into 1 like gcc does.
	# This test works for both compilers.
	if [ "$CC_FOR_BUILD" != 'no_compiler_found' ]; then
	    if (echo '#ifdef __amd64'; echo IS_64BIT_ARCH; echo '#endif') | \
		(CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | \
		grep IS_64BIT_ARCH >/dev/null
	    then
		SUN_ARCH="x86_64"
	    fi
	fi
	echo ${SUN_ARCH}-pc-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
	exit ;;
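    # The 64-bit probe above reduces to this pipeline (sketch):
    #
    #   printf '#ifdef __amd64\nIS_64BIT_ARCH\n#endif\n' | $CC_FOR_BUILD -E - \
    #       | grep IS_64BIT_ARCH
    #
    # Only the preprocessor runs, so it behaves the same for gcc and Sun cc
    # even though only gcc turns __LP64__ into 1.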
    sun4*:SunOS:6*:*)
	# According to config.sub, this is the proper way to canonicalize
	# SunOS6.  Hard to guess exactly what SunOS6 will be like, but
	# it's likely to be more like Solaris than SunOS4.
	echo sparc-sun-solaris3`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
	exit ;;
    sun4*:SunOS:*:*)
	case "`/usr/bin/arch -k`" in
	    Series*|S4*)
		UNAME_RELEASE=`uname -v`
		;;
	esac
	# Japanese Language versions have a version number like `4.1.3-JL'.
	echo sparc-sun-sunos`echo ${UNAME_RELEASE}|sed -e 's/-/_/'`
	exit ;;
    sun3*:SunOS:*:*)
	echo m68k-sun-sunos${UNAME_RELEASE}
	exit ;;
    sun*:*:4.2BSD:*)
	UNAME_RELEASE=`(sed 1q /etc/motd | awk '{print substr($5,1,3)}') 2>/dev/null`
	test "x${UNAME_RELEASE}" = "x" && UNAME_RELEASE=3
	case "`/bin/arch`" in
	    sun3)
		echo m68k-sun-sunos${UNAME_RELEASE}
		;;
	    sun4)
		echo sparc-sun-sunos${UNAME_RELEASE}
		;;
	esac
	exit ;;
    aushp:SunOS:*:*)
	echo sparc-auspex-sunos${UNAME_RELEASE}
	exit ;;
    # The situation for MiNT is a little confusing.  The machine name
    # can be virtually anything (anything that is not "atarist" or
    # "atariste" should at least have a processor > m68000).  The system
    # name ranges from "MiNT" through "FreeMiNT" to the lowercase
    # variants "mint" and "freemint".  Finally, the system name "TOS"
    # denotes a system which is actually not MiNT; but MiNT is backward
    # compatible with TOS, so this should be no problem.
    atarist[e]:*MiNT:*:* | atarist[e]:*mint:*:* | atarist[e]:*TOS:*:*)
        echo m68k-atari-mint${UNAME_RELEASE}
	exit ;;
    atari*:*MiNT:*:* | atari*:*mint:*:* | atarist[e]:*TOS:*:*)
	echo m68k-atari-mint${UNAME_RELEASE}
        exit ;;
    *falcon*:*MiNT:*:* | *falcon*:*mint:*:* | *falcon*:*TOS:*:*)
        echo m68k-atari-mint${UNAME_RELEASE}
	exit ;;
    milan*:*MiNT:*:* | milan*:*mint:*:* | *milan*:*TOS:*:*)
        echo m68k-milan-mint${UNAME_RELEASE}
        exit ;;
    hades*:*MiNT:*:* | hades*:*mint:*:* | *hades*:*TOS:*:*)
        echo m68k-hades-mint${UNAME_RELEASE}
        exit ;;
    *:*MiNT:*:* | *:*mint:*:* | *:*TOS:*:*)
        echo m68k-unknown-mint${UNAME_RELEASE}
        exit ;;
    m68k:machten:*:*)
	echo m68k-apple-machten${UNAME_RELEASE}
	exit ;;
    powerpc:machten:*:*)
	echo powerpc-apple-machten${UNAME_RELEASE}
	exit ;;
    RISC*:Mach:*:*)
	echo mips-dec-mach_bsd4.3
	exit ;;
    RISC*:ULTRIX:*:*)
	echo mips-dec-ultrix${UNAME_RELEASE}
	exit ;;
    VAX*:ULTRIX*:*:*)
	echo vax-dec-ultrix${UNAME_RELEASE}
	exit ;;
    2020:CLIX:*:* | 2430:CLIX:*:*)
	echo clipper-intergraph-clix${UNAME_RELEASE}
	exit ;;
    mips:*:*:UMIPS | mips:*:*:RISCos)
	eval $set_cc_for_build
	sed 's/^	//' << EOF >$dummy.c
#ifdef __cplusplus
#include <stdio.h>  /* for printf() prototype */
	int main (int argc, char *argv[]) {
#else
	int main (argc, argv) int argc; char *argv[]; {
#endif
	#if defined (host_mips) && defined (MIPSEB)
	#if defined (SYSTYPE_SYSV)
	  printf ("mips-mips-riscos%ssysv\n", argv[1]); exit (0);
	#endif
	#if defined (SYSTYPE_SVR4)
	  printf ("mips-mips-riscos%ssvr4\n", argv[1]); exit (0);
	#endif
	#if defined (SYSTYPE_BSD43) || defined(SYSTYPE_BSD)
	  printf ("mips-mips-riscos%sbsd\n", argv[1]); exit (0);
	#endif
	#endif
	  exit (-1);
	}
EOF
	$CC_FOR_BUILD -o $dummy $dummy.c &&
	  dummyarg=`echo "${UNAME_RELEASE}" | sed -n 's/\([0-9]*\).*/\1/p'` &&
	  SYSTEM_NAME=`$dummy $dummyarg` &&
	    { echo "$SYSTEM_NAME"; exit; }
	echo mips-mips-riscos${UNAME_RELEASE}
	exit ;;
    Motorola:PowerMAX_OS:*:*)
	echo powerpc-motorola-powermax
	exit ;;
    Motorola:*:4.3:PL8-*)
	echo powerpc-harris-powermax
	exit ;;
    Night_Hawk:*:*:PowerMAX_OS | Synergy:PowerMAX_OS:*:*)
	echo powerpc-harris-powermax
	exit ;;
    Night_Hawk:Power_UNIX:*:*)
	echo powerpc-harris-powerunix
	exit ;;
    m88k:CX/UX:7*:*)
	echo m88k-harris-cxux7
	exit ;;
    m88k:*:4*:R4*)
	echo m88k-motorola-sysv4
	exit ;;
    m88k:*:3*:R3*)
	echo m88k-motorola-sysv3
	exit ;;
    AViiON:dgux:*:*)
        # DG/UX returns AViiON for all architectures
        UNAME_PROCESSOR=`/usr/bin/uname -p`
	if [ $UNAME_PROCESSOR = mc88100 ] || [ $UNAME_PROCESSOR = mc88110 ]
	then
	    if [ ${TARGET_BINARY_INTERFACE}x = m88kdguxelfx ] || \
	       [ ${TARGET_BINARY_INTERFACE}x = x ]
	    then
		echo m88k-dg-dgux${UNAME_RELEASE}
	    else
		echo m88k-dg-dguxbcs${UNAME_RELEASE}
	    fi
	else
	    echo i586-dg-dgux${UNAME_RELEASE}
	fi
 	exit ;;
    M88*:DolphinOS:*:*)	# DolphinOS (SVR3)
	echo m88k-dolphin-sysv3
	exit ;;
    M88*:*:R3*:*)
	# Delta 88k system running SVR3
	echo m88k-motorola-sysv3
	exit ;;
    XD88*:*:*:*) # Tektronix XD88 system running UTekV (SVR3)
	echo m88k-tektronix-sysv3
	exit ;;
    Tek43[0-9][0-9]:UTek:*:*) # Tektronix 4300 system running UTek (BSD)
	echo m68k-tektronix-bsd
	exit ;;
    *:IRIX*:*:*)
	echo mips-sgi-irix`echo ${UNAME_RELEASE}|sed -e 's/-/_/g'`
	exit ;;
    ????????:AIX?:[12].1:2)   # AIX 2.2.1 or AIX 2.1.1 is RT/PC AIX.
	echo romp-ibm-aix     # uname -m gives an 8 hex-code CPU id
	exit ;;               # Note that: echo "'`uname -s`'" gives 'AIX '
    i*86:AIX:*:*)
	echo i386-ibm-aix
	exit ;;
    ia64:AIX:*:*)
	if [ -x /usr/bin/oslevel ] ; then
		IBM_REV=`/usr/bin/oslevel`
	else
		IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE}
	fi
	echo ${UNAME_MACHINE}-ibm-aix${IBM_REV}
	exit ;;
    *:AIX:2:3)
	if grep bos325 /usr/include/stdio.h >/dev/null 2>&1; then
		eval $set_cc_for_build
		sed 's/^		//' << EOF >$dummy.c
		#include <sys/systemcfg.h>

		main()
			{
			if (!__power_pc())
				exit(1);
			puts("powerpc-ibm-aix3.2.5");
			exit(0);
			}
EOF
		if $CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy`
		then
			echo "$SYSTEM_NAME"
		else
			echo rs6000-ibm-aix3.2.5
		fi
	elif grep bos324 /usr/include/stdio.h >/dev/null 2>&1; then
		echo rs6000-ibm-aix3.2.4
	else
		echo rs6000-ibm-aix3.2
	fi
	exit ;;
    *:AIX:*:[456])
	IBM_CPU_ID=`/usr/sbin/lsdev -C -c processor -S available | sed 1q | awk '{ print $1 }'`
	if /usr/sbin/lsattr -El ${IBM_CPU_ID} | grep ' POWER' >/dev/null 2>&1; then
		IBM_ARCH=rs6000
	else
		IBM_ARCH=powerpc
	fi
	if [ -x /usr/bin/oslevel ] ; then
		IBM_REV=`/usr/bin/oslevel`
	else
		IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE}
	fi
	echo ${IBM_ARCH}-ibm-aix${IBM_REV}
	exit ;;
    *:AIX:*:*)
	echo rs6000-ibm-aix
	exit ;;
    ibmrt:4.4BSD:*|romp-ibm:BSD:*)
	echo romp-ibm-bsd4.4
	exit ;;
    ibmrt:*BSD:*|romp-ibm:BSD:*)            # covers RT/PC BSD and
	echo romp-ibm-bsd${UNAME_RELEASE}   # 4.3 with uname added to
	exit ;;                             # report: romp-ibm BSD 4.3
    *:BOSX:*:*)
	echo rs6000-bull-bosx
	exit ;;
    DPX/2?00:B.O.S.:*:*)
	echo m68k-bull-sysv3
	exit ;;
    9000/[34]??:4.3bsd:1.*:*)
	echo m68k-hp-bsd
	exit ;;
    hp300:4.4BSD:*:* | 9000/[34]??:4.3bsd:2.*:*)
	echo m68k-hp-bsd4.4
	exit ;;
    9000/[34678]??:HP-UX:*:*)
	HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'`
	case "${UNAME_MACHINE}" in
	    9000/31? )            HP_ARCH=m68000 ;;
	    9000/[34]?? )         HP_ARCH=m68k ;;
	    9000/[678][0-9][0-9])
		if [ -x /usr/bin/getconf ]; then
		    sc_cpu_version=`/usr/bin/getconf SC_CPU_VERSION 2>/dev/null`
                    sc_kernel_bits=`/usr/bin/getconf SC_KERNEL_BITS 2>/dev/null`
                    case "${sc_cpu_version}" in
                      523) HP_ARCH="hppa1.0" ;; # CPU_PA_RISC1_0
                      528) HP_ARCH="hppa1.1" ;; # CPU_PA_RISC1_1
                      532)                      # CPU_PA_RISC2_0
                        case "${sc_kernel_bits}" in
                          32) HP_ARCH="hppa2.0n" ;;
                          64) HP_ARCH="hppa2.0w" ;;
			  '') HP_ARCH="hppa2.0" ;;   # HP-UX 10.20
                        esac ;;
                    esac
		fi
		if [ "${HP_ARCH}" = "" ]; then
		    eval $set_cc_for_build
		    sed 's/^              //' << EOF >$dummy.c

              #define _HPUX_SOURCE
              #include <stdlib.h>
              #include <unistd.h>

              int main ()
              {
              #if defined(_SC_KERNEL_BITS)
                  long bits = sysconf(_SC_KERNEL_BITS);
              #endif
                  long cpu  = sysconf (_SC_CPU_VERSION);

                  switch (cpu)
              	{
              	case CPU_PA_RISC1_0: puts ("hppa1.0"); break;
              	case CPU_PA_RISC1_1: puts ("hppa1.1"); break;
              	case CPU_PA_RISC2_0:
              #if defined(_SC_KERNEL_BITS)
              	    switch (bits)
              		{
              		case 64: puts ("hppa2.0w"); break;
              		case 32: puts ("hppa2.0n"); break;
              		default: puts ("hppa2.0"); break;
              		} break;
              #else  /* !defined(_SC_KERNEL_BITS) */
              	    puts ("hppa2.0"); break;
              #endif
              	default: puts ("hppa1.0"); break;
              	}
                  exit (0);
              }
EOF
		    (CCOPTS= $CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null) && HP_ARCH=`$dummy`
		    test -z "$HP_ARCH" && HP_ARCH=hppa
		fi ;;
	esac
	if [ ${HP_ARCH} = "hppa2.0w" ]
	then
	    eval $set_cc_for_build

	    # hppa2.0w-hp-hpux* has a 64-bit kernel and a compiler generating
	    # 32-bit code.  hppa64-hp-hpux* has the same kernel and a compiler
	    # generating 64-bit code.  GNU and HP use different nomenclature:
	    #
	    # $ CC_FOR_BUILD=cc ./config.guess
	    # => hppa2.0w-hp-hpux11.23
	    # $ CC_FOR_BUILD="cc +DA2.0w" ./config.guess
	    # => hppa64-hp-hpux11.23

	    if echo __LP64__ | (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) |
		grep -q __LP64__
	    then
		HP_ARCH="hppa2.0w"
	    else
		HP_ARCH="hppa64"
	    fi
	fi
	echo ${HP_ARCH}-hp-hpux${HPUX_REV}
	exit ;;
    ia64:HP-UX:*:*)
	HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'`
	echo ia64-hp-hpux${HPUX_REV}
	exit ;;
    3050*:HI-UX:*:*)
	eval $set_cc_for_build
	sed 's/^	//' << EOF >$dummy.c
	#include <unistd.h>
	int
	main ()
	{
	  long cpu = sysconf (_SC_CPU_VERSION);
	  /* The order matters, because CPU_IS_HP_MC68K erroneously returns
	     true for CPU_PA_RISC1_0.  CPU_IS_PA_RISC returns correct
	     results, however.  */
	  if (CPU_IS_PA_RISC (cpu))
	    {
	      switch (cpu)
		{
		  case CPU_PA_RISC1_0: puts ("hppa1.0-hitachi-hiuxwe2"); break;
		  case CPU_PA_RISC1_1: puts ("hppa1.1-hitachi-hiuxwe2"); break;
		  case CPU_PA_RISC2_0: puts ("hppa2.0-hitachi-hiuxwe2"); break;
		  default: puts ("hppa-hitachi-hiuxwe2"); break;
		}
	    }
	  else if (CPU_IS_HP_MC68K (cpu))
	    puts ("m68k-hitachi-hiuxwe2");
	  else puts ("unknown-hitachi-hiuxwe2");
	  exit (0);
	}
EOF
	$CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy` &&
		{ echo "$SYSTEM_NAME"; exit; }
	echo unknown-hitachi-hiuxwe2
	exit ;;
    9000/7??:4.3bsd:*:* | 9000/8?[79]:4.3bsd:*:* )
	echo hppa1.1-hp-bsd
	exit ;;
    9000/8??:4.3bsd:*:*)
	echo hppa1.0-hp-bsd
	exit ;;
    *9??*:MPE/iX:*:* | *3000*:MPE/iX:*:*)
	echo hppa1.0-hp-mpeix
	exit ;;
    hp7??:OSF1:*:* | hp8?[79]:OSF1:*:* )
	echo hppa1.1-hp-osf
	exit ;;
    hp8??:OSF1:*:*)
	echo hppa1.0-hp-osf
	exit ;;
    i*86:OSF1:*:*)
	if [ -x /usr/sbin/sysversion ] ; then
	    echo ${UNAME_MACHINE}-unknown-osf1mk
	else
	    echo ${UNAME_MACHINE}-unknown-osf1
	fi
	exit ;;
    parisc*:Lites*:*:*)
	echo hppa1.1-hp-lites
	exit ;;
    C1*:ConvexOS:*:* | convex:ConvexOS:C1*:*)
	echo c1-convex-bsd
        exit ;;
    C2*:ConvexOS:*:* | convex:ConvexOS:C2*:*)
	if getsysinfo -f scalar_acc
	then echo c32-convex-bsd
	else echo c2-convex-bsd
	fi
        exit ;;
    C34*:ConvexOS:*:* | convex:ConvexOS:C34*:*)
	echo c34-convex-bsd
        exit ;;
    C38*:ConvexOS:*:* | convex:ConvexOS:C38*:*)
	echo c38-convex-bsd
        exit ;;
    C4*:ConvexOS:*:* | convex:ConvexOS:C4*:*)
	echo c4-convex-bsd
        exit ;;
    CRAY*Y-MP:*:*:*)
	echo ymp-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
	exit ;;
    CRAY*[A-Z]90:*:*:*)
	echo ${UNAME_MACHINE}-cray-unicos${UNAME_RELEASE} \
	| sed -e 's/CRAY.*\([A-Z]90\)/\1/' \
	      -e y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/ \
	      -e 's/\.[^.]*$/.X/'
	exit ;;
    CRAY*TS:*:*:*)
	echo t90-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
	exit ;;
    CRAY*T3E:*:*:*)
	echo alphaev5-cray-unicosmk${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
	exit ;;
    CRAY*SV1:*:*:*)
	echo sv1-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
	exit ;;
    *:UNICOS/mp:*:*)
	echo craynv-cray-unicosmp${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
	exit ;;
    F30[01]:UNIX_System_V:*:* | F700:UNIX_System_V:*:*)
	FUJITSU_PROC=`uname -m | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'`
        FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'`
        FUJITSU_REL=`echo ${UNAME_RELEASE} | sed -e 's/ /_/'`
        echo "${FUJITSU_PROC}-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}"
        exit ;;
    5000:UNIX_System_V:4.*:*)
        FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'`
        FUJITSU_REL=`echo ${UNAME_RELEASE} | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/ /_/'`
        echo "sparc-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}"
	exit ;;
    i*86:BSD/386:*:* | i*86:BSD/OS:*:* | *:Ascend\ Embedded/OS:*:*)
	echo ${UNAME_MACHINE}-pc-bsdi${UNAME_RELEASE}
	exit ;;
    sparc*:BSD/OS:*:*)
	echo sparc-unknown-bsdi${UNAME_RELEASE}
	exit ;;
    *:BSD/OS:*:*)
	echo ${UNAME_MACHINE}-unknown-bsdi${UNAME_RELEASE}
	exit ;;
    *:FreeBSD:*:*)
	case ${UNAME_MACHINE} in
	    pc98)
		echo i386-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;;
	    amd64)
		echo x86_64-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;;
	    *)
		echo ${UNAME_MACHINE}-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;;
	esac
	exit ;;
    i*:CYGWIN*:*)
	echo ${UNAME_MACHINE}-pc-cygwin
	exit ;;
    *:MINGW*:*)
	echo ${UNAME_MACHINE}-pc-mingw32
	exit ;;
    i*:windows32*:*)
    	# uname -m includes "-pc" on this system.
    	echo ${UNAME_MACHINE}-mingw32
	exit ;;
    i*:PW*:*)
	echo ${UNAME_MACHINE}-pc-pw32
	exit ;;
    *:Interix*:[3456]*)
    	case ${UNAME_MACHINE} in
	    x86)
		echo i586-pc-interix${UNAME_RELEASE}
		exit ;;
	    EM64T | authenticamd | genuineintel)
		echo x86_64-unknown-interix${UNAME_RELEASE}
		exit ;;
	    IA64)
		echo ia64-unknown-interix${UNAME_RELEASE}
		exit ;;
	esac ;;
    [345]86:Windows_95:* | [345]86:Windows_98:* | [345]86:Windows_NT:*)
	echo i${UNAME_MACHINE}-pc-mks
	exit ;;
    8664:Windows_NT:*)
	echo x86_64-pc-mks
	exit ;;
    i*:Windows_NT*:* | Pentium*:Windows_NT*:*)
	# How do we know it's Interix rather than the generic POSIX subsystem?
	# It also conflicts with pre-2.0 versions of AT&T UWIN. Should we
	# change UNAME_MACHINE based on the output of uname instead of i386?
	echo i586-pc-interix
	exit ;;
    i*:UWIN*:*)
	echo ${UNAME_MACHINE}-pc-uwin
	exit ;;
    amd64:CYGWIN*:*:* | x86_64:CYGWIN*:*:*)
	echo x86_64-unknown-cygwin
	exit ;;
    p*:CYGWIN*:*)
	echo powerpcle-unknown-cygwin
	exit ;;
    prep*:SunOS:5.*:*)
	echo powerpcle-unknown-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
	exit ;;
    *:GNU:*:*)
	# the GNU system
	echo `echo ${UNAME_MACHINE}|sed -e 's,[-/].*$,,'`-unknown-gnu`echo ${UNAME_RELEASE}|sed -e 's,/.*$,,'`
	exit ;;
    *:GNU/*:*:*)
	# other systems with GNU libc and userland
	echo ${UNAME_MACHINE}-unknown-`echo ${UNAME_SYSTEM} | sed 's,^[^/]*/,,' | tr '[A-Z]' '[a-z]'``echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`-gnu
	exit ;;
    i*86:Minix:*:*)
	echo ${UNAME_MACHINE}-pc-minix
	exit ;;
    alpha:Linux:*:*)
	case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' < /proc/cpuinfo` in
	  EV5)   UNAME_MACHINE=alphaev5 ;;
	  EV56)  UNAME_MACHINE=alphaev56 ;;
	  PCA56) UNAME_MACHINE=alphapca56 ;;
	  PCA57) UNAME_MACHINE=alphapca56 ;;
	  EV6)   UNAME_MACHINE=alphaev6 ;;
	  EV67)  UNAME_MACHINE=alphaev67 ;;
	  EV68*) UNAME_MACHINE=alphaev68 ;;
        esac
	objdump --private-headers /bin/sh | grep -q ld.so.1
	if test "$?" = 0 ; then LIBC="libc1" ; else LIBC="" ; fi
	echo ${UNAME_MACHINE}-unknown-linux-gnu${LIBC}
	exit ;;
    arm*:Linux:*:*)
	eval $set_cc_for_build
	if echo __ARM_EABI__ | $CC_FOR_BUILD -E - 2>/dev/null \
	    | grep -q __ARM_EABI__
	then
	    echo ${UNAME_MACHINE}-unknown-linux-gnu
	else
	    echo ${UNAME_MACHINE}-unknown-linux-gnueabi
	fi
	exit ;;
    avr32*:Linux:*:*)
	echo ${UNAME_MACHINE}-unknown-linux-gnu
	exit ;;
    cris:Linux:*:*)
	echo cris-axis-linux-gnu
	exit ;;
    crisv32:Linux:*:*)
	echo crisv32-axis-linux-gnu
	exit ;;
    frv:Linux:*:*)
    	echo frv-unknown-linux-gnu
	exit ;;
    i*86:Linux:*:*)
	echo ${UNAME_MACHINE}-pc-linux-gnu
	exit ;;
    ia64:Linux:*:*)
	echo ${UNAME_MACHINE}-unknown-linux-gnu
	exit ;;
    m32r*:Linux:*:*)
	echo ${UNAME_MACHINE}-unknown-linux-gnu
	exit ;;
    m68*:Linux:*:*)
	echo ${UNAME_MACHINE}-unknown-linux-gnu
	exit ;;
    mips:Linux:*:* | mips64:Linux:*:*)
	eval $set_cc_for_build
	sed 's/^	//' << EOF >$dummy.c
	#undef CPU
	#undef ${UNAME_MACHINE}
	#undef ${UNAME_MACHINE}el
	#if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL)
	CPU=${UNAME_MACHINE}el
	#else
	#if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB)
	CPU=${UNAME_MACHINE}
	#else
	CPU=
	#endif
	#endif
EOF
	eval "`$CC_FOR_BUILD -E $dummy.c 2>/dev/null | sed -n '
	    /^CPU/{
		s: ::g
		p
	    }'`"
	test x"${CPU}" != x && { echo "${CPU}-unknown-linux-gnu"; exit; }
	;;
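    # Sketch of the trick above: $dummy.c is preprocessor input rather than
    # compilable C.  cpp discards the #if branches that do not match its
    # predefined endianness macros, so on a little-endian mips the pipeline
    # reduces to:
    #
    #   $CC_FOR_BUILD -E $dummy.c | sed -n '/^CPU/{s: ::g;p;}'   # -> CPU=mipsel
    #
    # which the surrounding eval turns into a shell assignment.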
    or32:Linux:*:*)
	echo or32-unknown-linux-gnu
	exit ;;
    padre:Linux:*:*)
	echo sparc-unknown-linux-gnu
	exit ;;
    parisc64:Linux:*:* | hppa64:Linux:*:*)
	echo hppa64-unknown-linux-gnu
	exit ;;
    parisc:Linux:*:* | hppa:Linux:*:*)
	# Look for CPU level
	case `grep '^cpu[^a-z]*:' /proc/cpuinfo 2>/dev/null | cut -d' ' -f2` in
	  PA7*) echo hppa1.1-unknown-linux-gnu ;;
	  PA8*) echo hppa2.0-unknown-linux-gnu ;;
	  *)    echo hppa-unknown-linux-gnu ;;
	esac
	exit ;;
    ppc64:Linux:*:*)
	echo powerpc64-unknown-linux-gnu
	exit ;;
    ppc:Linux:*:*)
	echo powerpc-unknown-linux-gnu
	exit ;;
    s390:Linux:*:* | s390x:Linux:*:*)
	echo ${UNAME_MACHINE}-ibm-linux
	exit ;;
    sh64*:Linux:*:*)
    	echo ${UNAME_MACHINE}-unknown-linux-gnu
	exit ;;
    sh*:Linux:*:*)
	echo ${UNAME_MACHINE}-unknown-linux-gnu
	exit ;;
    sparc:Linux:*:* | sparc64:Linux:*:*)
	echo ${UNAME_MACHINE}-unknown-linux-gnu
	exit ;;
    vax:Linux:*:*)
	echo ${UNAME_MACHINE}-dec-linux-gnu
	exit ;;
    x86_64:Linux:*:*)
	echo x86_64-unknown-linux-gnu
	exit ;;
    xtensa*:Linux:*:*)
    	echo ${UNAME_MACHINE}-unknown-linux-gnu
	exit ;;
    i*86:DYNIX/ptx:4*:*)
	# ptx 4.0 does uname -s correctly, with DYNIX/ptx in there.
	# Earlier versions are messed up and put the nodename in both
	# sysname and nodename.
	echo i386-sequent-sysv4
	exit ;;
    i*86:UNIX_SV:4.2MP:2.*)
        # Unixware is an offshoot of SVR4, but it has its own version
        # number series starting with 2...
        # I am not positive that other SVR4 systems won't match this,
	# I just have to hope.  -- rms.
        # Use sysv4.2uw... so that sysv4* matches it.
	echo ${UNAME_MACHINE}-pc-sysv4.2uw${UNAME_VERSION}
	exit ;;
    i*86:OS/2:*:*)
	# If we were able to find `uname', then EMX Unix compatibility
	# is probably installed.
	echo ${UNAME_MACHINE}-pc-os2-emx
	exit ;;
    i*86:XTS-300:*:STOP)
	echo ${UNAME_MACHINE}-unknown-stop
	exit ;;
    i*86:atheos:*:*)
	echo ${UNAME_MACHINE}-unknown-atheos
	exit ;;
    i*86:syllable:*:*)
	echo ${UNAME_MACHINE}-pc-syllable
	exit ;;
    i*86:LynxOS:2.*:* | i*86:LynxOS:3.[01]*:* | i*86:LynxOS:4.[02]*:*)
	echo i386-unknown-lynxos${UNAME_RELEASE}
	exit ;;
    i*86:*DOS:*:*)
	echo ${UNAME_MACHINE}-pc-msdosdjgpp
	exit ;;
    i*86:*:4.*:* | i*86:SYSTEM_V:4.*:*)
	UNAME_REL=`echo ${UNAME_RELEASE} | sed 's/\/MP$//'`
	if grep Novell /usr/include/link.h >/dev/null 2>/dev/null; then
		echo ${UNAME_MACHINE}-univel-sysv${UNAME_REL}
	else
		echo ${UNAME_MACHINE}-pc-sysv${UNAME_REL}
	fi
	exit ;;
    i*86:*:5:[678]*)
    	# UnixWare 7.x, OpenUNIX and OpenServer 6.
	case `/bin/uname -X | grep "^Machine"` in
	    *486*)	     UNAME_MACHINE=i486 ;;
	    *Pentium)	     UNAME_MACHINE=i586 ;;
	    *Pent*|*Celeron) UNAME_MACHINE=i686 ;;
	esac
	echo ${UNAME_MACHINE}-unknown-sysv${UNAME_RELEASE}${UNAME_SYSTEM}${UNAME_VERSION}
	exit ;;
    i*86:*:3.2:*)
	if test -f /usr/options/cb.name; then
		UNAME_REL=`sed -n 's/.*Version //p' </usr/options/cb.name`
		echo ${UNAME_MACHINE}-pc-isc$UNAME_REL
	elif /bin/uname -X 2>/dev/null >/dev/null ; then
		UNAME_REL=`(/bin/uname -X|grep Release|sed -e 's/.*= //')`
		(/bin/uname -X|grep i80486 >/dev/null) && UNAME_MACHINE=i486
		(/bin/uname -X|grep '^Machine.*Pentium' >/dev/null) \
			&& UNAME_MACHINE=i586
		(/bin/uname -X|grep '^Machine.*Pent *II' >/dev/null) \
			&& UNAME_MACHINE=i686
		(/bin/uname -X|grep '^Machine.*Pentium Pro' >/dev/null) \
			&& UNAME_MACHINE=i686
		echo ${UNAME_MACHINE}-pc-sco$UNAME_REL
	else
		echo ${UNAME_MACHINE}-pc-sysv32
	fi
	exit ;;
    pc:*:*:*)
	# Left here for compatibility:
        # For DJGPP, uname -m always prints 'pc' and nothing about the
        # processor, so we play it safe by assuming i586.
	# Note: whatever this is, it MUST be the same as what config.sub
	# prints for the "djgpp" host, or else GDB configury will decide that
	# this is a cross-build.
	echo i586-pc-msdosdjgpp
        exit ;;
    Intel:Mach:3*:*)
	echo i386-pc-mach3
	exit ;;
    paragon:*:*:*)
	echo i860-intel-osf1
	exit ;;
    i860:*:4.*:*) # i860-SVR4
	if grep Stardent /usr/include/sys/uadmin.h >/dev/null 2>&1 ; then
	  echo i860-stardent-sysv${UNAME_RELEASE} # Stardent Vistra i860-SVR4
	else # Add other i860-SVR4 vendors below as they are discovered.
	  echo i860-unknown-sysv${UNAME_RELEASE}  # Unknown i860-SVR4
	fi
	exit ;;
    mini*:CTIX:SYS*5:*)
	# "miniframe"
	echo m68010-convergent-sysv
	exit ;;
    mc68k:UNIX:SYSTEM5:3.51m)
	echo m68k-convergent-sysv
	exit ;;
    M680?0:D-NIX:5.3:*)
	echo m68k-diab-dnix
	exit ;;
    M68*:*:R3V[5678]*:*)
	test -r /sysV68 && { echo 'm68k-motorola-sysv'; exit; } ;;
    3[345]??:*:4.0:3.0 | 3[34]??A:*:4.0:3.0 | 3[34]??,*:*:4.0:3.0 | 3[34]??/*:*:4.0:3.0 | 4400:*:4.0:3.0 | 4850:*:4.0:3.0 | SKA40:*:4.0:3.0 | SDS2:*:4.0:3.0 | SHG2:*:4.0:3.0 | S7501*:*:4.0:3.0)
	OS_REL=''
	test -r /etc/.relid \
	&& OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid`
	/bin/uname -p 2>/dev/null | grep 86 >/dev/null \
	  && { echo i486-ncr-sysv4.3${OS_REL}; exit; }
	/bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \
	  && { echo i586-ncr-sysv4.3${OS_REL}; exit; } ;;
    3[34]??:*:4.0:* | 3[34]??,*:*:4.0:*)
        /bin/uname -p 2>/dev/null | grep 86 >/dev/null \
          && { echo i486-ncr-sysv4; exit; } ;;
    NCR*:*:4.2:* | MPRAS*:*:4.2:*)
	OS_REL='.3'
	test -r /etc/.relid \
	    && OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid`
	/bin/uname -p 2>/dev/null | grep 86 >/dev/null \
	    && { echo i486-ncr-sysv4.3${OS_REL}; exit; }
	/bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \
	    && { echo i586-ncr-sysv4.3${OS_REL}; exit; }
	/bin/uname -p 2>/dev/null | /bin/grep pteron >/dev/null \
	    && { echo i586-ncr-sysv4.3${OS_REL}; exit; } ;;
    m68*:LynxOS:2.*:* | m68*:LynxOS:3.0*:*)
	echo m68k-unknown-lynxos${UNAME_RELEASE}
	exit ;;
    mc68030:UNIX_System_V:4.*:*)
	echo m68k-atari-sysv4
	exit ;;
    TSUNAMI:LynxOS:2.*:*)
	echo sparc-unknown-lynxos${UNAME_RELEASE}
	exit ;;
    rs6000:LynxOS:2.*:*)
	echo rs6000-unknown-lynxos${UNAME_RELEASE}
	exit ;;
    PowerPC:LynxOS:2.*:* | PowerPC:LynxOS:3.[01]*:* | PowerPC:LynxOS:4.[02]*:*)
	echo powerpc-unknown-lynxos${UNAME_RELEASE}
	exit ;;
    SM[BE]S:UNIX_SV:*:*)
	echo mips-dde-sysv${UNAME_RELEASE}
	exit ;;
    RM*:ReliantUNIX-*:*:*)
	echo mips-sni-sysv4
	exit ;;
    RM*:SINIX-*:*:*)
	echo mips-sni-sysv4
	exit ;;
    *:SINIX-*:*:*)
	if uname -p 2>/dev/null >/dev/null ; then
		UNAME_MACHINE=`(uname -p) 2>/dev/null`
		echo ${UNAME_MACHINE}-sni-sysv4
	else
		echo ns32k-sni-sysv
	fi
	exit ;;
    PENTIUM:*:4.0*:*) # Unisys `ClearPath HMP IX 4000' SVR4/MP effort
                      # says <Richard.M.Bartel@ccMail.Census.GOV>
        echo i586-unisys-sysv4
        exit ;;
    *:UNIX_System_V:4*:FTX*)
	# From Gerald Hewes <hewes@openmarket.com>.
	# How about differentiating between stratus architectures? -djm
	echo hppa1.1-stratus-sysv4
	exit ;;
    *:*:*:FTX*)
	# From seanf@swdc.stratus.com.
	echo i860-stratus-sysv4
	exit ;;
    i*86:VOS:*:*)
	# From Paul.Green@stratus.com.
	echo ${UNAME_MACHINE}-stratus-vos
	exit ;;
    *:VOS:*:*)
	# From Paul.Green@stratus.com.
	echo hppa1.1-stratus-vos
	exit ;;
    mc68*:A/UX:*:*)
	echo m68k-apple-aux${UNAME_RELEASE}
	exit ;;
    news*:NEWS-OS:6*:*)
	echo mips-sony-newsos6
	exit ;;
    R[34]000:*System_V*:*:* | R4000:UNIX_SYSV:*:* | R*000:UNIX_SV:*:*)
	if [ -d /usr/nec ]; then
	        echo mips-nec-sysv${UNAME_RELEASE}
	else
	        echo mips-unknown-sysv${UNAME_RELEASE}
	fi
        exit ;;
    BeBox:BeOS:*:*)	# BeOS running on hardware made by Be, PPC only.
	echo powerpc-be-beos
	exit ;;
    BeMac:BeOS:*:*)	# BeOS running on Mac or Mac clone, PPC only.
	echo powerpc-apple-beos
	exit ;;
    BePC:BeOS:*:*)	# BeOS running on Intel PC compatible.
	echo i586-pc-beos
	exit ;;
    BePC:Haiku:*:*)	# Haiku running on Intel PC compatible.
	echo i586-pc-haiku
	exit ;;
    SX-4:SUPER-UX:*:*)
	echo sx4-nec-superux${UNAME_RELEASE}
	exit ;;
    SX-5:SUPER-UX:*:*)
	echo sx5-nec-superux${UNAME_RELEASE}
	exit ;;
    SX-6:SUPER-UX:*:*)
	echo sx6-nec-superux${UNAME_RELEASE}
	exit ;;
    SX-7:SUPER-UX:*:*)
	echo sx7-nec-superux${UNAME_RELEASE}
	exit ;;
    SX-8:SUPER-UX:*:*)
	echo sx8-nec-superux${UNAME_RELEASE}
	exit ;;
    SX-8R:SUPER-UX:*:*)
	echo sx8r-nec-superux${UNAME_RELEASE}
	exit ;;
    Power*:Rhapsody:*:*)
	echo powerpc-apple-rhapsody${UNAME_RELEASE}
	exit ;;
    *:Rhapsody:*:*)
	echo ${UNAME_MACHINE}-apple-rhapsody${UNAME_RELEASE}
	exit ;;
    *:Darwin:*:*)
	UNAME_PROCESSOR=`uname -p` || UNAME_PROCESSOR=unknown
	case $UNAME_PROCESSOR in
	    i386)
		eval $set_cc_for_build
		if [ "$CC_FOR_BUILD" != 'no_compiler_found' ]; then
		  if (echo '#ifdef __LP64__'; echo IS_64BIT_ARCH; echo '#endif') | \
		      (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | \
		      grep IS_64BIT_ARCH >/dev/null
		  then
		      UNAME_PROCESSOR="x86_64"
		  fi
		fi ;;
	    unknown) UNAME_PROCESSOR=powerpc ;;
	esac
	echo ${UNAME_PROCESSOR}-apple-darwin${UNAME_RELEASE}
	exit ;;
    *:procnto*:*:* | *:QNX:[0123456789]*:*)
	UNAME_PROCESSOR=`uname -p`
	if test "$UNAME_PROCESSOR" = "x86"; then
		UNAME_PROCESSOR=i386
		UNAME_MACHINE=pc
	fi
	echo ${UNAME_PROCESSOR}-${UNAME_MACHINE}-nto-qnx${UNAME_RELEASE}
	exit ;;
    *:QNX:*:4*)
	echo i386-pc-qnx
	exit ;;
    NSE-?:NONSTOP_KERNEL:*:*)
	echo nse-tandem-nsk${UNAME_RELEASE}
	exit ;;
    NSR-?:NONSTOP_KERNEL:*:*)
	echo nsr-tandem-nsk${UNAME_RELEASE}
	exit ;;
    *:NonStop-UX:*:*)
	echo mips-compaq-nonstopux
	exit ;;
    BS2000:POSIX*:*:*)
	echo bs2000-siemens-sysv
	exit ;;
    DS/*:UNIX_System_V:*:*)
	echo ${UNAME_MACHINE}-${UNAME_SYSTEM}-${UNAME_RELEASE}
	exit ;;
    *:Plan9:*:*)
	# "uname -m" is not consistent, so use $cputype instead. 386
	# is converted to i386 for consistency with other x86
	# operating systems.
	if test "$cputype" = "386"; then
	    UNAME_MACHINE=i386
	else
	    UNAME_MACHINE="$cputype"
	fi
	echo ${UNAME_MACHINE}-unknown-plan9
	exit ;;
    *:TOPS-10:*:*)
	echo pdp10-unknown-tops10
	exit ;;
    *:TENEX:*:*)
	echo pdp10-unknown-tenex
	exit ;;
    KS10:TOPS-20:*:* | KL10:TOPS-20:*:* | TYPE4:TOPS-20:*:*)
	echo pdp10-dec-tops20
	exit ;;
    XKL-1:TOPS-20:*:* | TYPE5:TOPS-20:*:*)
	echo pdp10-xkl-tops20
	exit ;;
    *:TOPS-20:*:*)
	echo pdp10-unknown-tops20
	exit ;;
    *:ITS:*:*)
	echo pdp10-unknown-its
	exit ;;
    SEI:*:*:SEIUX)
        echo mips-sei-seiux${UNAME_RELEASE}
	exit ;;
    *:DragonFly:*:*)
	echo ${UNAME_MACHINE}-unknown-dragonfly`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`
	exit ;;
    *:*VMS:*:*)
    	UNAME_MACHINE=`(uname -p) 2>/dev/null`
	case "${UNAME_MACHINE}" in
	    A*) echo alpha-dec-vms ; exit ;;
	    I*) echo ia64-dec-vms ; exit ;;
	    V*) echo vax-dec-vms ; exit ;;
	esac ;;
    *:XENIX:*:SysV)
	echo i386-pc-xenix
	exit ;;
    i*86:skyos:*:*)
	echo ${UNAME_MACHINE}-pc-skyos`echo ${UNAME_RELEASE}` | sed -e 's/ .*$//'
	exit ;;
    i*86:rdos:*:*)
	echo ${UNAME_MACHINE}-pc-rdos
	exit ;;
    i*86:AROS:*:*)
	echo ${UNAME_MACHINE}-pc-aros
	exit ;;
esac

#echo '(No uname command or uname output not recognized.)' 1>&2
#echo "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" 1>&2

eval $set_cc_for_build
cat >$dummy.c <<EOF
#ifdef _SEQUENT_
# include <sys/types.h>
# include <sys/utsname.h>
#endif
main ()
{
#if defined (sony)
#if defined (MIPSEB)
  /* BFD wants "bsd" instead of "newsos".  Perhaps BFD should be changed,
     I don't know....  */
  printf ("mips-sony-bsd\n"); exit (0);
#else
#include <sys/param.h>
  printf ("m68k-sony-newsos%s\n",
#ifdef NEWSOS4
          "4"
#else
	  ""
#endif
         ); exit (0);
#endif
#endif

#if defined (__arm) && defined (__acorn) && defined (__unix)
  printf ("arm-acorn-riscix\n"); exit (0);
#endif

#if defined (hp300) && !defined (hpux)
  printf ("m68k-hp-bsd\n"); exit (0);
#endif

#if defined (NeXT)
#if !defined (__ARCHITECTURE__)
#define __ARCHITECTURE__ "m68k"
#endif
  int version;
  version=`(hostinfo | sed -n 's/.*NeXT Mach \([0-9]*\).*/\1/p') 2>/dev/null`;
  if (version < 4)
    printf ("%s-next-nextstep%d\n", __ARCHITECTURE__, version);
  else
    printf ("%s-next-openstep%d\n", __ARCHITECTURE__, version);
  exit (0);
#endif

#if defined (MULTIMAX) || defined (n16)
#if defined (UMAXV)
  printf ("ns32k-encore-sysv\n"); exit (0);
#else
#if defined (CMU)
  printf ("ns32k-encore-mach\n"); exit (0);
#else
  printf ("ns32k-encore-bsd\n"); exit (0);
#endif
#endif
#endif

#if defined (__386BSD__)
  printf ("i386-pc-bsd\n"); exit (0);
#endif

#if defined (sequent)
#if defined (i386)
  printf ("i386-sequent-dynix\n"); exit (0);
#endif
#if defined (ns32000)
  printf ("ns32k-sequent-dynix\n"); exit (0);
#endif
#endif

#if defined (_SEQUENT_)
    struct utsname un;

    uname(&un);

    if (strncmp(un.version, "V2", 2) == 0) {
	printf ("i386-sequent-ptx2\n"); exit (0);
    }
    if (strncmp(un.version, "V1", 2) == 0) { /* XXX is V1 correct? */
	printf ("i386-sequent-ptx1\n"); exit (0);
    }
    printf ("i386-sequent-ptx\n"); exit (0);

#endif

#if defined (vax)
# if !defined (ultrix)
#  include <sys/param.h>
#  if defined (BSD)
#   if BSD == 43
      printf ("vax-dec-bsd4.3\n"); exit (0);
#   else
#    if BSD == 199006
      printf ("vax-dec-bsd4.3reno\n"); exit (0);
#    else
      printf ("vax-dec-bsd\n"); exit (0);
#    endif
#   endif
#  else
    printf ("vax-dec-bsd\n"); exit (0);
#  endif
# else
    printf ("vax-dec-ultrix\n"); exit (0);
# endif
#endif

#if defined (alliant) && defined (i860)
  printf ("i860-alliant-bsd\n"); exit (0);
#endif

  exit (1);
}
EOF

$CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null && SYSTEM_NAME=`$dummy` &&
	{ echo "$SYSTEM_NAME"; exit; }

# Apollos put the system type in the environment.

test -d /usr/apollo && { echo ${ISP}-apollo-${SYSTYPE}; exit; }

# Convex versions that predate uname can use getsysinfo(1)

if [ -x /usr/convex/getsysinfo ]
then
    case `getsysinfo -f cpu_type` in
    c1*)
	echo c1-convex-bsd
	exit ;;
    c2*)
	if getsysinfo -f scalar_acc
	then echo c32-convex-bsd
	else echo c2-convex-bsd
	fi
	exit ;;
    c34*)
	echo c34-convex-bsd
	exit ;;
    c38*)
	echo c38-convex-bsd
	exit ;;
    c4*)
	echo c4-convex-bsd
	exit ;;
    esac
fi

cat >&2 <<EOF
$0: unable to guess system type

This script, last modified $timestamp, has failed to recognize
the operating system you are using. It is advised that you
download the most up-to-date version of the config scripts from

  http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess;hb=HEAD
and
  http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub;hb=HEAD

If the version you run ($0) is already up to date, please
send the following data and any information you think might be
pertinent to <config-patches@gnu.org> in order to provide the needed
information to handle your system.

config.guess timestamp = $timestamp

uname -m = `(uname -m) 2>/dev/null || echo unknown`
uname -r = `(uname -r) 2>/dev/null || echo unknown`
uname -s = `(uname -s) 2>/dev/null || echo unknown`
uname -v = `(uname -v) 2>/dev/null || echo unknown`

/usr/bin/uname -p = `(/usr/bin/uname -p) 2>/dev/null`
/bin/uname -X     = `(/bin/uname -X) 2>/dev/null`

hostinfo               = `(hostinfo) 2>/dev/null`
/bin/universe          = `(/bin/universe) 2>/dev/null`
/usr/bin/arch -k       = `(/usr/bin/arch -k) 2>/dev/null`
/bin/arch              = `(/bin/arch) 2>/dev/null`
/usr/bin/oslevel       = `(/usr/bin/oslevel) 2>/dev/null`
/usr/convex/getsysinfo = `(/usr/convex/getsysinfo) 2>/dev/null`

UNAME_MACHINE = ${UNAME_MACHINE}
UNAME_RELEASE = ${UNAME_RELEASE}
UNAME_SYSTEM  = ${UNAME_SYSTEM}
UNAME_VERSION = ${UNAME_VERSION}
EOF

exit 1

# Local variables:
# eval: (add-hook 'write-file-hooks 'time-stamp)
# time-stamp-start: "timestamp='"
# time-stamp-format: "%:y-%02m-%02d"
# time-stamp-end: "'"
# End:
x264-snapshot-20120103-2245-stable/common/0000755000175000001440000000000011700673342017161 5ustar  videolanusersx264-snapshot-20120103-2245-stable/common/set.c0000644000175000001440000003202211700673342020117 0ustar  videolanusers/*****************************************************************************
 * set.c: quantization init
 *****************************************************************************
 * Copyright (C) 2005-2011 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#define _ISOC99_SOURCE
#include "common.h"

#define SHIFT(x,s) ((s)<=0 ? (x)<<-(s) : ((x)+(1<<((s)-1)))>>(s))
#define DIV(n,d) (((n) + ((d)>>1)) / (d))
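
/* Illustrative spot checks of the rounding behavior:
 *   DIV(7,2)    == (7 + 1) / 2       == 4    round-to-nearest division
 *   SHIFT(5,1)  == (5 + (1<<0)) >> 1 == 3    rounded right shift
 *   SHIFT(5,-2) == 5 << 2            == 20   s <= 0 shifts left instead */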

static const uint8_t dequant4_scale[6][3] =
{
    { 10, 13, 16 },
    { 11, 14, 18 },
    { 13, 16, 20 },
    { 14, 18, 23 },
    { 16, 20, 25 },
    { 18, 23, 29 }
};
static const uint16_t quant4_scale[6][3] =
{
    { 13107, 8066, 5243 },
    { 11916, 7490, 4660 },
    { 10082, 6554, 4194 },
    {  9362, 5825, 3647 },
    {  8192, 5243, 3355 },
    {  7282, 4559, 2893 },
};

static const uint8_t quant8_scan[16] =
{
    0,3,4,3, 3,1,5,1, 4,5,2,5, 3,1,5,1
};
static const uint8_t dequant8_scale[6][6] =
{
    { 20, 18, 32, 19, 25, 24 },
    { 22, 19, 35, 21, 28, 26 },
    { 26, 23, 42, 24, 33, 31 },
    { 28, 25, 45, 26, 35, 33 },
    { 32, 28, 51, 30, 40, 38 },
    { 36, 32, 58, 34, 46, 43 },
};
static const uint16_t quant8_scale[6][6] =
{
    { 13107, 11428, 20972, 12222, 16777, 15481 },
    { 11916, 10826, 19174, 11058, 14980, 14290 },
    { 10082,  8943, 15978,  9675, 12710, 11985 },
    {  9362,  8228, 14913,  8931, 11984, 11259 },
    {  8192,  7346, 13159,  7740, 10486,  9777 },
    {  7282,  6428, 11570,  6830,  9118,  8640 }
};
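
/* Sanity check on the tables above: each quant/dequant pair multiplies out
 * to roughly one fixed constant -- about 1<<17 for the 4x4 tables
 * (e.g. 13107*10 = 131070, 8192*16 = 131072) and about 1<<18 for the 8x8
 * tables (e.g. 13107*20 = 262140) -- which is what makes quantization
 * followed by dequantization approximately scale-neutral. */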

int x264_cqm_init( x264_t *h )
{
    int def_quant4[6][16];
    int def_quant8[6][64];
    int def_dequant4[6][16];
    int def_dequant8[6][64];
    int quant4_mf[4][6][16];
    int quant8_mf[4][6][64];
    int deadzone[4] = { 32 - h->param.analyse.i_luma_deadzone[1],
                        32 - h->param.analyse.i_luma_deadzone[0],
                        32 - 11, 32 - 21 };
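    /* deadzone[] is indexed by 4x4 CQM list (4IY, 4PY, 4IC, 4PC) and stores
     * 32 minus the deadzone, i.e. the rounding offset in 1/32nd units that
     * feeds the quant*_bias tables below; e.g. with the default intra luma
     * deadzone of 11 the intra rounding bias is 21/32 (sketch, assuming the
     * library defaults). */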
    int max_qp_err = -1;
    int max_chroma_qp_err = -1;
    int min_qp_err = QP_MAX+1;
    int num_8x8_lists = h->sps->i_chroma_format_idc == CHROMA_444 ? 4 : 2; /* Checkasm may segfault if optimized out by --chroma-format */

    for( int i = 0; i < 4 + num_8x8_lists; i++ )
    {
        int size = i<4 ? 16 : 64;
        int j;
        for( j = (i<4 ? 0 : 4); j < i; j++ )
            if( !memcmp( h->pps->scaling_list[i], h->pps->scaling_list[j], size*sizeof(uint8_t) ) )
                break;
        if( j < i )
        {
            h->  quant4_mf[i] = h->  quant4_mf[j];
            h->dequant4_mf[i] = h->dequant4_mf[j];
            h->unquant4_mf[i] = h->unquant4_mf[j];
        }
        else
        {
            CHECKED_MALLOC( h->  quant4_mf[i], (QP_MAX+1)*size*sizeof(udctcoef) );
            CHECKED_MALLOC( h->dequant4_mf[i],  6*size*sizeof(int) );
            CHECKED_MALLOC( h->unquant4_mf[i], (QP_MAX+1)*size*sizeof(int) );
        }

        for( j = (i<4 ? 0 : 4); j < i; j++ )
            if( deadzone[j&3] == deadzone[i&3] &&
                !memcmp( h->pps->scaling_list[i], h->pps->scaling_list[j], size*sizeof(uint8_t) ) )
                break;
        if( j < i )
            h->quant4_bias[i] = h->quant4_bias[j];
        else
            CHECKED_MALLOC( h->quant4_bias[i], (QP_MAX+1)*size*sizeof(udctcoef) );
    }
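
    /* Note on the loop above: lists whose scaling matrices compare equal
     * share one allocation instead of getting their own; with the default
     * flat matrices all four 4x4 lists alias a single buffer.
     * x264_cqm_delete() repeats the same pointer comparison so each buffer
     * is freed exactly once. */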

    for( int q = 0; q < 6; q++ )
    {
        for( int i = 0; i < 16; i++ )
        {
            int j = (i&1) + ((i>>2)&1);
            def_dequant4[q][i] = dequant4_scale[q][j];
            def_quant4[q][i]   =   quant4_scale[q][j];
        }
        for( int i = 0; i < 64; i++ )
        {
            int j = quant8_scan[((i>>1)&12) | (i&3)];
            def_dequant8[q][i] = dequant8_scale[q][j];
            def_quant8[q][i]   =   quant8_scale[q][j];
        }
    }
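
    /* The index j above classifies each 4x4 coefficient position by parity:
     * j = (x&1) + (y&1) is 0 when both coordinates are even, 2 when both
     * are odd and 1 otherwise, selecting among the three distinct values of
     * the H.264 default 4x4 quant matrices (the 8x8 case uses quant8_scan
     * to pick among six values). */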

    for( int q = 0; q < 6; q++ )
    {
        for( int i_list = 0; i_list < 4; i_list++ )
            for( int i = 0; i < 16; i++ )
            {
                h->dequant4_mf[i_list][q][i] = def_dequant4[q][i] * h->pps->scaling_list[i_list][i];
                     quant4_mf[i_list][q][i] = DIV(def_quant4[q][i] * 16, h->pps->scaling_list[i_list][i]);
            }
        for( int i_list = 0; i_list < num_8x8_lists; i_list++ )
            for( int i = 0; i < 64; i++ )
            {
                h->dequant8_mf[i_list][q][i] = def_dequant8[q][i] * h->pps->scaling_list[4+i_list][i];
                     quant8_mf[i_list][q][i] = DIV(def_quant8[q][i] * 16, h->pps->scaling_list[4+i_list][i]);
            }
    }
    for( int q = 0; q < QP_MAX+1; q++ )
    {
        int j;
        for( int i_list = 0; i_list < 4; i_list++ )
            for( int i = 0; i < 16; i++ )
            {
                h->unquant4_mf[i_list][q][i] = (1ULL << (q/6 + 15 + 8)) / quant4_mf[i_list][q%6][i];
                h->quant4_mf[i_list][q][i] = j = SHIFT(quant4_mf[i_list][q%6][i], q/6 - 1);
                if( !j )
                {
                    min_qp_err = X264_MIN( min_qp_err, q );
                    continue;
                }
                // round to nearest, unless that would cause the deadzone to be negative
                h->quant4_bias[i_list][q][i] = X264_MIN( DIV(deadzone[i_list]<<10, j), (1<<15)/j );
                if( j > 0xffff && q > max_qp_err && (i_list == CQM_4IY || i_list == CQM_4PY) )
                    max_qp_err = q;
                if( j > 0xffff && q > max_chroma_qp_err && (i_list == CQM_4IC || i_list == CQM_4PC) )
                    max_chroma_qp_err = q;
            }
        if( h->param.analyse.b_transform_8x8 )
            for( int i_list = 0; i_list < num_8x8_lists; i_list++ )
                for( int i = 0; i < 64; i++ )
                {
                    h->unquant8_mf[i_list][q][i] = (1ULL << (q/6 + 16 + 8)) / quant8_mf[i_list][q%6][i];
                    j = SHIFT(quant8_mf[i_list][q%6][i], q/6);
                    h->quant8_mf[i_list][q][i] = (uint16_t)j;

                    if( !j )
                    {
                        min_qp_err = X264_MIN( min_qp_err, q );
                        continue;
                    }
                    h->quant8_bias[i_list][q][i] = X264_MIN( DIV(deadzone[i_list]<<10, j), (1<<15)/j );
                    if( j > 0xffff && q > max_qp_err && (i_list == CQM_8IY || i_list == CQM_8PY) )
                        max_qp_err = q;
                    if( j > 0xffff && q > max_chroma_qp_err && (i_list == CQM_8IC || i_list == CQM_8PC) )
                        max_chroma_qp_err = q;
                }
    }

    /* Emergency mode denoising. */
    x264_emms();
    CHECKED_MALLOC( h->nr_offset_emergency, sizeof(*h->nr_offset_emergency)*(QP_MAX-QP_MAX_SPEC) );
    for( int q = 0; q < QP_MAX - QP_MAX_SPEC; q++ )
        for( int cat = 0; cat < 3 + CHROMA444; cat++ )
        {
            int dct8x8 = cat&1;
            int size = dct8x8 ? 64 : 16;
            udctcoef *nr_offset = h->nr_offset_emergency[q][cat];
            /* Denoise chroma first (due to h264's chroma QP offset), then luma, then DC. */
            int dc_threshold =    (QP_MAX-QP_MAX_SPEC)*2/3;
            int luma_threshold =  (QP_MAX-QP_MAX_SPEC)*2/3;
            int chroma_threshold = 0;

            for( int i = 0; i < size; i++ )
            {
                int max = (1 << (7 + BIT_DEPTH)) - 1;
                /* True "emergency mode": remove all DCT coefficients */
                if( q == QP_MAX - QP_MAX_SPEC - 1 )
                {
                    nr_offset[i] = max;
                    continue;
                }

                int thresh = i == 0 ? dc_threshold : cat >= 2 ? chroma_threshold : luma_threshold;
                if( q < thresh )
                {
                    nr_offset[i] = 0;
                    continue;
                }
                double pos = (double)(q-thresh+1) / (QP_MAX - QP_MAX_SPEC - thresh);

                /* XXX: this math is largely tuned for /dev/random input. */
                double start = dct8x8 ? h->unquant8_mf[CQM_8PY][QP_MAX_SPEC][i]
                                      : h->unquant4_mf[CQM_4PY][QP_MAX_SPEC][i];
                /* Formula chosen as an exponential scale to vaguely mimic the effects
                 * of a higher quantizer. */
                double bias = (pow( 2, pos*(QP_MAX - QP_MAX_SPEC)/10. )*0.003-0.003) * start;
                nr_offset[i] = X264_MIN( bias + 0.5, max );
            }
        }
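
    /* Rough shape of the ramp above: pos runs linearly from ~0 to 1 across
     * the emergency QP range, so with QP_MAX - QP_MAX_SPEC == 18 the bias
     * grows exponentially from 0 up to about (2^1.8 - 1)*0.003 =~ 0.0074 of
     * the unquant scale before the final QP clamps everything to max
     * (a sketch of the intent, not a normative bound). */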

    if( !h->mb.b_lossless )
    {
        while( h->chroma_qp_table[h->param.rc.i_qp_min] <= max_chroma_qp_err )
            h->param.rc.i_qp_min++;
        if( min_qp_err <= h->param.rc.i_qp_max )
            h->param.rc.i_qp_max = min_qp_err-1;
        if( max_qp_err >= h->param.rc.i_qp_min )
            h->param.rc.i_qp_min = max_qp_err+1;
        /* If long level-codes aren't allowed, we need to allow QP high enough to avoid them. */
        if( !h->param.b_cabac && h->sps->i_profile_idc < PROFILE_HIGH )
            while( h->chroma_qp_table[SPEC_QP(h->param.rc.i_qp_max)] <= 12 || h->param.rc.i_qp_max <= 12 )
                h->param.rc.i_qp_max++;
        if( h->param.rc.i_qp_min > h->param.rc.i_qp_max )
        {
            x264_log( h, X264_LOG_ERROR, "Impossible QP constraints for CQM (min=%d, max=%d)\n", h->param.rc.i_qp_min, h->param.rc.i_qp_max );
            return -1;
        }
    }
    return 0;
fail:
    x264_cqm_delete( h );
    return -1;
}

#define CQM_DELETE( n, max )\
    for( int i = 0; i < (max); i++ )\
    {\
        int j;\
        for( j = 0; j < i; j++ )\
            if( h->quant##n##_mf[i] == h->quant##n##_mf[j] )\
                break;\
        if( j == i )\
        {\
            x264_free( h->  quant##n##_mf[i] );\
            x264_free( h->dequant##n##_mf[i] );\
            x264_free( h->unquant##n##_mf[i] );\
        }\
        for( j = 0; j < i; j++ )\
            if( h->quant##n##_bias[i] == h->quant##n##_bias[j] )\
                break;\
        if( j == i )\
            x264_free( h->quant##n##_bias[i] );\
    }
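
/* For instance, CQM_DELETE( 4, 4 ) expands to a loop over the four 4x4
 * lists that frees h->quant4_mf / h->dequant4_mf / h->unquant4_mf (and
 * h->quant4_bias) only for list indices that are not aliases of an earlier
 * list, mirroring the sharing set up in x264_cqm_init(). */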

void x264_cqm_delete( x264_t *h )
{
    CQM_DELETE( 4, 4 );
    CQM_DELETE( 8, CHROMA444 ? 4 : 2 );
    x264_free( h->nr_offset_emergency );
}

static int x264_cqm_parse_jmlist( x264_t *h, const char *buf, const char *name,
                                  uint8_t *cqm, const uint8_t *jvt, int length )
{
    int i;

    char *p = strstr( buf, name );
    if( !p )
    {
        memset( cqm, 16, length );
        return 0;
    }

    p += strlen( name );
    if( *p == 'U' || *p == 'V' )
        p++;

    char *nextvar = strstr( p, "INT" );

    for( i = 0; i < length && (p = strpbrk( p, " \t\n," )) && (p = strpbrk( p, "0123456789" )); i++ )
    {
        int coef = -1;
        sscanf( p, "%d", &coef );
        if( i == 0 && coef == 0 )
        {
            memcpy( cqm, jvt, length );
            return 0;
        }
        if( coef < 1 || coef > 255 )
        {
            x264_log( h, X264_LOG_ERROR, "bad coefficient in list '%s'\n", name );
            return -1;
        }
        cqm[i] = coef;
    }

    if( (nextvar && p > nextvar) || i != length )
    {
        x264_log( h, X264_LOG_ERROR, "not enough coefficients in list '%s'\n", name );
        return -1;
    }

    return 0;
}
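
/* The JM-style input this parser expects looks roughly like (sketch):
 *
 *   INTRA4X4_LUMA =
 *     16,16,16,16,
 *     16,16,16,16,
 *     16,16,16,16,
 *     16,16,16,16
 *
 * A first coefficient of 0 selects the JVT default matrix, and chroma list
 * names may carry a U/V suffix (e.g. INTRA4X4_CHROMAU), which is skipped
 * above. */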

int x264_cqm_parse_file( x264_t *h, const char *filename )
{
    char *p;
    int b_error = 0;

    h->param.i_cqm_preset = X264_CQM_CUSTOM;

    char *buf = x264_slurp_file( filename );
    if( !buf )
    {
        x264_log( h, X264_LOG_ERROR, "can't open file '%s'\n", filename );
        return -1;
    }

    while( (p = strchr( buf, '#' )) != NULL )
        memset( p, ' ', strcspn( p, "\n" ) );

    b_error |= x264_cqm_parse_jmlist( h, buf, "INTRA4X4_LUMA",   h->param.cqm_4iy, x264_cqm_jvt4i, 16 );
    b_error |= x264_cqm_parse_jmlist( h, buf, "INTER4X4_LUMA",   h->param.cqm_4py, x264_cqm_jvt4p, 16 );
    b_error |= x264_cqm_parse_jmlist( h, buf, "INTRA4X4_CHROMA", h->param.cqm_4ic, x264_cqm_jvt4i, 16 );
    b_error |= x264_cqm_parse_jmlist( h, buf, "INTER4X4_CHROMA", h->param.cqm_4pc, x264_cqm_jvt4p, 16 );
    b_error |= x264_cqm_parse_jmlist( h, buf, "INTRA8X8_LUMA",   h->param.cqm_8iy, x264_cqm_jvt8i, 64 );
    b_error |= x264_cqm_parse_jmlist( h, buf, "INTER8X8_LUMA",   h->param.cqm_8py, x264_cqm_jvt8p, 64 );
    if( CHROMA444 )
    {
        b_error |= x264_cqm_parse_jmlist( h, buf, "INTRA8X8_CHROMA", h->param.cqm_8iy, x264_cqm_jvt8i, 64 );
        b_error |= x264_cqm_parse_jmlist( h, buf, "INTER8X8_CHROMA", h->param.cqm_8py, x264_cqm_jvt8p, 64 );
    }

    x264_free( buf );
    return b_error;
}

x264-snapshot-20120103-2245-stable/common/frame.h0000644000175000001440000002354411700673342020434 0ustar  videolanusers/*****************************************************************************
 * frame.h: frame handling
 *****************************************************************************
 * Copyright (C) 2003-2011 x264 project
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
 *          Jason Garrett-Glaser <darkshikari@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#ifndef X264_FRAME_H
#define X264_FRAME_H

/* number of pixels past the edge of the frame, for motion estimation/compensation */
#define PADH 32
#define PADV 32
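
/* e.g. a 1920-pixel-wide luma plane is padded to at least
 * 1920 + 2*PADH = 1984 pixels per row, with plane[] pointing past the left
 * pad so offsets up to 32 pixels beyond either edge stay in bounds
 * (sketch; the real stride is also rounded up for alignment in frame.c). */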

typedef struct x264_frame
{
    /* */
    int     i_poc;
    int     i_delta_poc[2];
    int     i_type;
    int     i_qpplus1;
    int64_t i_pts;
    int64_t i_dts;
    int64_t i_reordered_pts;
    int64_t i_duration;  /* in SPS time_scale units (i.e. 2 * timebase units) used for vfr */
    float   f_duration;  /* in seconds */
    int64_t i_cpb_duration;
    int64_t i_cpb_delay; /* in SPS time_scale units (i.e. 2 * timebase units) */
    int64_t i_dpb_output_delay;
    x264_param_t *param;

    int     i_frame;     /* Presentation frame number */
    int     i_coded;     /* Coded frame number */
    int64_t i_field_cnt; /* Presentation field count */
    int     i_frame_num; /* 7.4.3 frame_num */
    int     b_kept_as_ref;
    int     i_pic_struct;
    int     b_keyframe;
    uint8_t b_fdec;
    uint8_t b_last_minigop_bframe; /* this frame is the last b in a sequence of bframes */
    uint8_t i_bframes;   /* number of bframes following this nonb in coded order */
    float   f_qp_avg_rc; /* QPs as decided by ratecontrol */
    float   f_qp_avg_aq; /* QPs as decided by AQ in addition to ratecontrol */
    int     i_poc_l0ref0; /* poc of first refframe in L0, used to check if direct temporal is possible */

    /* YUV buffer */
    int     i_csp; /* Internal csp */
    int     i_plane;
    int     i_stride[3];
    int     i_width[3];
    int     i_lines[3];
    int     i_stride_lowres;
    int     i_width_lowres;
    int     i_lines_lowres;
    pixel *plane[3];
    pixel *plane_fld[3];
    pixel *filtered[3][4]; /* plane[0], H, V, HV */
    pixel *filtered_fld[3][4];
    pixel *lowres[4]; /* half-size copy of input frame: Orig, H, V, HV */
    uint16_t *integral;

    /* for unrestricted mv we allocate more data than needed
     * allocated data are stored in buffer */
    pixel *buffer[4];
    pixel *buffer_fld[4];
    pixel *buffer_lowres[4];

    x264_weight_t weight[X264_REF_MAX][3]; /* [ref_index][plane] */
    pixel *weighted[X264_REF_MAX]; /* plane[0] weighted of the reference frames */
    int b_duplicate;
    struct x264_frame *orig;

    /* motion data */
    int8_t  *mb_type;
    uint8_t *mb_partition;
    int16_t (*mv[2])[2];
    int16_t (*mv16x16)[2];
    int16_t (*lowres_mvs[2][X264_BFRAME_MAX+1])[2];
    uint8_t *field;

    /* Stored as (lists_used << LOWRES_COST_SHIFT) + (cost).
     * Doesn't need special addressing for intra cost because
     * lists_used is guaranteed to be zero in that case. */
    uint16_t (*lowres_costs[X264_BFRAME_MAX+2][X264_BFRAME_MAX+2]);
    #define LOWRES_COST_MASK ((1<<14)-1)
    #define LOWRES_COST_SHIFT 14
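
    /* Packing sketch: a cost of 1000 referenced only by list 0 is stored as
     * (1 << LOWRES_COST_SHIFT) + 1000; consumers unpack it with
     *   cost  = v & LOWRES_COST_MASK;
     *   lists = v >> LOWRES_COST_SHIFT;   (bit 0 = L0, bit 1 = L1) */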

    int     *lowres_mv_costs[2][X264_BFRAME_MAX+1];
    int8_t  *ref[2];
    int     i_ref[2];
    int     ref_poc[2][X264_REF_MAX];
    int16_t inv_ref_poc[2]; // inverse values of ref0 poc to avoid divisions in temporal MV prediction

    /* for adaptive B-frame decision.
     * contains the SATD cost of the lowres frame encoded in various modes
     * FIXME: how big an array do we need? */
    int     i_cost_est[X264_BFRAME_MAX+2][X264_BFRAME_MAX+2];
    int     i_cost_est_aq[X264_BFRAME_MAX+2][X264_BFRAME_MAX+2];
    int     i_satd; // the i_cost_est of the selected frametype
    int     i_intra_mbs[X264_BFRAME_MAX+2];
    int     *i_row_satds[X264_BFRAME_MAX+2][X264_BFRAME_MAX+2];
    int     *i_row_satd;
    int     *i_row_bits;
    float   *f_row_qp;
    float   *f_row_qscale;
    float   *f_qp_offset;
    float   *f_qp_offset_aq;
    int     b_intra_calculated;
    uint16_t *i_intra_cost;
    uint16_t *i_propagate_cost;
    uint16_t *i_inv_qscale_factor;
    int     b_scenecut; /* Set to zero if the frame cannot possibly be part of a real scenecut. */
    float   f_weighted_cost_delta[X264_BFRAME_MAX+2];
    uint32_t i_pixel_sum[3];
    uint64_t i_pixel_ssd[3];

    /* hrd */
    x264_hrd_t hrd_timing;

    /* vbv */
    uint8_t i_planned_type[X264_LOOKAHEAD_MAX+1];
    int i_planned_satd[X264_LOOKAHEAD_MAX+1];
    double f_planned_cpb_duration[X264_LOOKAHEAD_MAX+1];
    int64_t i_coded_fields_lookahead;
    int64_t i_cpb_delay_lookahead;

    /* threading */
    int     i_lines_completed; /* in pixels */
    int     i_lines_weighted; /* FIXME: this only supports weighting of one reference frame */
    int     i_reference_count; /* number of threads using this frame (not necessarily the number of pointers) */
    x264_pthread_mutex_t mutex;
    x264_pthread_cond_t  cv;

    /* periodic intra refresh */
    float   f_pir_position;
    int     i_pir_start_col;
    int     i_pir_end_col;
    int     i_frames_since_pir;

    /* interactive encoder control */
    int     b_corrupt;

    /* user sei */
    x264_sei_t extra_sei;
} x264_frame_t;
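
/* (Editor's illustrative sketch, not part of upstream x264.) Accessors for
 * the packed lowres_costs entries described above: the low LOWRES_COST_SHIFT
 * bits hold the SATD cost, and the bits above hold the bitmask of prediction
 * lists used (zero for intra). */
static inline int x264_lowres_cost_satd( uint16_t packed )
{
    return packed & LOWRES_COST_MASK;
}
static inline int x264_lowres_cost_lists( uint16_t packed )
{
    return packed >> LOWRES_COST_SHIFT;
}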

/* synchronized frame list */
typedef struct
{
    x264_frame_t **list;
    int i_max_size;
    int i_size;
    x264_pthread_mutex_t     mutex;
    x264_pthread_cond_t      cv_fill;  /* signaled when a frame is added to the list */
    x264_pthread_cond_t      cv_empty; /* signaled when a frame is removed from the list */
} x264_sync_frame_list_t;
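
/* (Editor's note) x264_sync_frame_list_t is a fixed-capacity, thread-safe
 * frame container: push appends at i_size and blocks while the list is full;
 * pop removes the most recently pushed frame and blocks while it is empty.
 * cv_fill is broadcast after each push and cv_empty after each pop. */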

typedef void (*x264_deblock_inter_t)( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
typedef void (*x264_deblock_intra_t)( pixel *pix, int stride, int alpha, int beta );
typedef struct
{
    x264_deblock_inter_t deblock_luma[2];
    x264_deblock_inter_t deblock_chroma[2];
    x264_deblock_inter_t deblock_h_chroma_420;
    x264_deblock_inter_t deblock_h_chroma_422;
    x264_deblock_intra_t deblock_luma_intra[2];
    x264_deblock_intra_t deblock_chroma_intra[2];
    x264_deblock_intra_t deblock_h_chroma_420_intra;
    x264_deblock_intra_t deblock_h_chroma_422_intra;
    x264_deblock_inter_t deblock_luma_mbaff;
    x264_deblock_inter_t deblock_chroma_mbaff;
    x264_deblock_inter_t deblock_chroma_420_mbaff;
    x264_deblock_inter_t deblock_chroma_422_mbaff;
    x264_deblock_intra_t deblock_luma_intra_mbaff;
    x264_deblock_intra_t deblock_chroma_intra_mbaff;
    x264_deblock_intra_t deblock_chroma_420_intra_mbaff;
    x264_deblock_intra_t deblock_chroma_422_intra_mbaff;
    void (*deblock_strength) ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
                               int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], int mvy_limit,
                               int bframe );
} x264_deblock_function_t;

void          x264_frame_delete( x264_frame_t *frame );

int           x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src );

void          x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y, int b_end );
void          x264_frame_expand_border_filtered( x264_t *h, x264_frame_t *frame, int mb_y, int b_end );
void          x264_frame_expand_border_lowres( x264_frame_t *frame );
void          x264_frame_expand_border_chroma( x264_t *h, x264_frame_t *frame, int plane );
void          x264_frame_expand_border_mod16( x264_t *h, x264_frame_t *frame );
void          x264_expand_border_mbpair( x264_t *h, int mb_x, int mb_y );

void          x264_frame_deblock_row( x264_t *h, int mb_y );
void          x264_macroblock_deblock( x264_t *h );

void          x264_frame_filter( x264_t *h, x264_frame_t *frame, int mb_y, int b_end );
void          x264_frame_init_lowres( x264_t *h, x264_frame_t *frame );

void          x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff );

void          x264_frame_cond_broadcast( x264_frame_t *frame, int i_lines_completed );
void          x264_frame_cond_wait( x264_frame_t *frame, int i_lines_completed );

void          x264_frame_push( x264_frame_t **list, x264_frame_t *frame );
x264_frame_t *x264_frame_pop( x264_frame_t **list );
void          x264_frame_unshift( x264_frame_t **list, x264_frame_t *frame );
x264_frame_t *x264_frame_shift( x264_frame_t **list );
void          x264_frame_push_unused( x264_t *h, x264_frame_t *frame );
void          x264_frame_push_blank_unused( x264_t *h, x264_frame_t *frame );
x264_frame_t *x264_frame_pop_blank_unused( x264_t *h );
void x264_weight_scale_plane( x264_t *h, pixel *dst, int i_dst_stride, pixel *src, int i_src_stride,
                              int i_width, int i_height, x264_weight_t *w );
x264_frame_t *x264_frame_pop_unused( x264_t *h, int b_fdec );
void          x264_frame_delete_list( x264_frame_t **list );

int           x264_sync_frame_list_init( x264_sync_frame_list_t *slist, int nelem );
void          x264_sync_frame_list_delete( x264_sync_frame_list_t *slist );
void          x264_sync_frame_list_push( x264_sync_frame_list_t *slist, x264_frame_t *frame );
x264_frame_t *x264_sync_frame_list_pop( x264_sync_frame_list_t *slist );

#endif
x264-snapshot-20120103-2245-stable/common/frame.c
/*****************************************************************************
 * frame.c: frame handling
 *****************************************************************************
 * Copyright (C) 2003-2011 x264 project
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
 *          Jason Garrett-Glaser <darkshikari@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "common.h"

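/* Round a stride up to 'align', then nudge it off any multiple of 'disalign'.
 * (Editor's note: keeping strides away from large power-of-two multiples
 * presumably avoids cache aliasing between rows and planes;
 * align_plane_size applies the same disalignment trick to whole plane sizes.)
 * e.g. align_stride( 2048, 64, 1024 ) == 2112: 2048 is already 64-aligned,
 * but it is a multiple of 1024, so one extra 'align' step is added. */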
static int align_stride( int x, int align, int disalign )
{
    x = ALIGN( x, align );
    if( !(x&(disalign-1)) )
        x += align;
    return x;
}

static int align_plane_size( int x, int disalign )
{
    if( !(x&(disalign-1)) )
        x += 128;
    return x;
}

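/* Map the user-supplied colorspace to one of the three layouts the encoder
 * actually works in: all 4:2:0 variants become NV12, 4:2:2 variants become
 * NV16, and 4:4:4/RGB inputs become planar I444. */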
static int x264_frame_internal_csp( int external_csp )
{
    switch( external_csp & X264_CSP_MASK )
    {
        case X264_CSP_NV12:
        case X264_CSP_I420:
        case X264_CSP_YV12:
            return X264_CSP_NV12;
        case X264_CSP_NV16:
        case X264_CSP_I422:
        case X264_CSP_YV16:
            return X264_CSP_NV16;
        case X264_CSP_I444:
        case X264_CSP_YV24:
        case X264_CSP_BGR:
        case X264_CSP_BGRA:
        case X264_CSP_RGB:
            return X264_CSP_I444;
        default:
            return X264_CSP_NONE;
    }
}

static x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
{
    x264_frame_t *frame;
    int i_csp = x264_frame_internal_csp( h->param.i_csp );
    int i_mb_count = h->mb.i_mb_count;
    int i_stride, i_width, i_lines, luma_plane_count;
    int i_padv = PADV << PARAM_INTERLACED;
    int align = h->param.cpu&X264_CPU_CACHELINE_64 ? 64 : h->param.cpu&X264_CPU_CACHELINE_32 ? 32 : 16;
    int disalign = h->param.cpu&X264_CPU_ALTIVEC ? 1<<9 : 1<<10;

    CHECKED_MALLOCZERO( frame, sizeof(x264_frame_t) );

    /* allocate frame data (+64 for extra data for motion estimation) */
    i_width  = h->mb.i_mb_width*16;
    i_lines  = h->mb.i_mb_height*16;
    i_stride = align_stride( i_width + 2*PADH, align, disalign );

    if( i_csp == X264_CSP_NV12 || i_csp == X264_CSP_NV16 )
    {
        luma_plane_count = 1;
        frame->i_plane = 2;
        for( int i = 0; i < 2; i++ )
        {
            frame->i_width[i] = i_width >> i;
            frame->i_lines[i] = i_lines >> (i && i_csp == X264_CSP_NV12);
            frame->i_stride[i] = i_stride;
        }
    }
    else if( i_csp == X264_CSP_I444 )
    {
        luma_plane_count = 3;
        frame->i_plane = 3;
        for( int i = 0; i < 3; i++ )
        {
            frame->i_width[i] = i_width;
            frame->i_lines[i] = i_lines;
            frame->i_stride[i] = i_stride;
        }
    }
    else
        goto fail;

    frame->i_csp = i_csp;
    frame->i_width_lowres = frame->i_width[0]/2;
    frame->i_lines_lowres = frame->i_lines[0]/2;
    frame->i_stride_lowres = align_stride( frame->i_width_lowres + 2*PADH, align, disalign<<1 );

    for( int i = 0; i < h->param.i_bframe + 2; i++ )
        for( int j = 0; j < h->param.i_bframe + 2; j++ )
            CHECKED_MALLOC( frame->i_row_satds[i][j], i_lines/16 * sizeof(int) );

    frame->i_poc = -1;
    frame->i_type = X264_TYPE_AUTO;
    frame->i_qpplus1 = X264_QP_AUTO;
    frame->i_pts = -1;
    frame->i_frame = -1;
    frame->i_frame_num = -1;
    frame->i_lines_completed = -1;
    frame->b_fdec = b_fdec;
    frame->i_pic_struct = PIC_STRUCT_AUTO;
    frame->i_field_cnt = -1;
    frame->i_duration =
    frame->i_cpb_duration =
    frame->i_dpb_output_delay =
    frame->i_cpb_delay = 0;
    frame->i_coded_fields_lookahead =
    frame->i_cpb_delay_lookahead = -1;

    frame->orig = frame;

    if( i_csp == X264_CSP_NV12 || i_csp == X264_CSP_NV16 )
    {
        int chroma_padv = i_padv >> (i_csp == X264_CSP_NV12);
        int chroma_plane_size = (frame->i_stride[1] * (frame->i_lines[1] + 2*chroma_padv));
        CHECKED_MALLOC( frame->buffer[1], chroma_plane_size * sizeof(pixel) );
        frame->plane[1] = frame->buffer[1] + frame->i_stride[1] * chroma_padv + PADH;
        if( PARAM_INTERLACED )
        {
            CHECKED_MALLOC( frame->buffer_fld[1], chroma_plane_size * sizeof(pixel) );
            frame->plane_fld[1] = frame->buffer_fld[1] + frame->i_stride[1] * chroma_padv + PADH;
        }
    }

    /* all 4 luma planes are allocated together, since the cacheline split code
     * requires them to be in-phase with respect to cacheline alignment. */

    for( int p = 0; p < luma_plane_count; p++ )
    {
        int luma_plane_size = align_plane_size( frame->i_stride[p] * (frame->i_lines[p] + 2*i_padv), disalign );
        if( h->param.analyse.i_subpel_refine && b_fdec )
        {
            /* FIXME: Don't allocate both buffers in non-adaptive MBAFF. */
            CHECKED_MALLOC( frame->buffer[p], 4*luma_plane_size * sizeof(pixel) );
            if( PARAM_INTERLACED )
                CHECKED_MALLOC( frame->buffer_fld[p], 4*luma_plane_size * sizeof(pixel) );
            for( int i = 0; i < 4; i++ )
            {
                frame->filtered[p][i] = frame->buffer[p] + i*luma_plane_size + frame->i_stride[p] * i_padv + PADH;
                frame->filtered_fld[p][i] = frame->buffer_fld[p] + i*luma_plane_size + frame->i_stride[p] * i_padv + PADH;
            }
            frame->plane[p] = frame->filtered[p][0];
            frame->plane_fld[p] = frame->filtered_fld[p][0];
        }
        else
        {
            CHECKED_MALLOC( frame->buffer[p], luma_plane_size * sizeof(pixel) );
            if( PARAM_INTERLACED )
                CHECKED_MALLOC( frame->buffer_fld[p], luma_plane_size * sizeof(pixel) );
            frame->filtered[p][0] = frame->plane[p] = frame->buffer[p] + frame->i_stride[p] * i_padv + PADH;
            frame->filtered_fld[p][0] = frame->plane_fld[p] = frame->buffer_fld[p] + frame->i_stride[p] * i_padv + PADH;
        }
    }

    frame->b_duplicate = 0;

    if( b_fdec ) /* fdec frame */
    {
        CHECKED_MALLOC( frame->mb_type, i_mb_count * sizeof(int8_t));
        CHECKED_MALLOC( frame->mb_partition, i_mb_count * sizeof(uint8_t));
        CHECKED_MALLOC( frame->mv[0], 2*16 * i_mb_count * sizeof(int16_t) );
        CHECKED_MALLOC( frame->mv16x16, 2*(i_mb_count+1) * sizeof(int16_t) );
        M32( frame->mv16x16[0] ) = 0;
        frame->mv16x16++;
        CHECKED_MALLOC( frame->ref[0], 4 * i_mb_count * sizeof(int8_t) );
        if( h->param.i_bframe )
        {
            CHECKED_MALLOC( frame->mv[1], 2*16 * i_mb_count * sizeof(int16_t) );
            CHECKED_MALLOC( frame->ref[1], 4 * i_mb_count * sizeof(int8_t) );
        }
        else
        {
            frame->mv[1]  = NULL;
            frame->ref[1] = NULL;
        }
        CHECKED_MALLOC( frame->i_row_bits, i_lines/16 * sizeof(int) );
        CHECKED_MALLOC( frame->f_row_qp, i_lines/16 * sizeof(float) );
        CHECKED_MALLOC( frame->f_row_qscale, i_lines/16 * sizeof(float) );
        if( h->param.analyse.i_me_method >= X264_ME_ESA )
        {
            CHECKED_MALLOC( frame->buffer[3],
                            frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) << h->frames.b_have_sub8x8_esa );
            frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH;
        }
        if( PARAM_INTERLACED )
            CHECKED_MALLOC( frame->field, i_mb_count * sizeof(uint8_t) );
    }
    else /* fenc frame */
    {
        if( h->frames.b_have_lowres )
        {
            int luma_plane_size = align_plane_size( frame->i_stride_lowres * (frame->i_lines[0]/2 + 2*PADV), disalign );

            CHECKED_MALLOC( frame->buffer_lowres[0], 4 * luma_plane_size * sizeof(pixel) );
            for( int i = 0; i < 4; i++ )
                frame->lowres[i] = frame->buffer_lowres[0] + (frame->i_stride_lowres * PADV + PADH) + i * luma_plane_size;

            for( int j = 0; j <= !!h->param.i_bframe; j++ )
                for( int i = 0; i <= h->param.i_bframe; i++ )
                {
                    CHECKED_MALLOCZERO( frame->lowres_mvs[j][i], 2*h->mb.i_mb_count*sizeof(int16_t) );
                    CHECKED_MALLOC( frame->lowres_mv_costs[j][i], h->mb.i_mb_count*sizeof(int) );
                }
            CHECKED_MALLOC( frame->i_propagate_cost, (i_mb_count+7) * sizeof(uint16_t) );
            for( int j = 0; j <= h->param.i_bframe+1; j++ )
                for( int i = 0; i <= h->param.i_bframe+1; i++ )
                    CHECKED_MALLOC( frame->lowres_costs[j][i], (i_mb_count+3) * sizeof(uint16_t) );
            frame->i_intra_cost = frame->lowres_costs[0][0];
            memset( frame->i_intra_cost, -1, (i_mb_count+3) * sizeof(uint16_t) );
        }
        if( h->param.rc.i_aq_mode )
        {
            CHECKED_MALLOC( frame->f_qp_offset, h->mb.i_mb_count * sizeof(float) );
            CHECKED_MALLOC( frame->f_qp_offset_aq, h->mb.i_mb_count * sizeof(float) );
            if( h->frames.b_have_lowres )
                /* doesn't actually need to be initialized; this just silences a valgrind false-positive in x264_mbtree_propagate_cost_sse2 */
                CHECKED_MALLOCZERO( frame->i_inv_qscale_factor, (h->mb.i_mb_count+3) * sizeof(uint16_t) );
        }
    }

    if( x264_pthread_mutex_init( &frame->mutex, NULL ) )
        goto fail;
    if( x264_pthread_cond_init( &frame->cv, NULL ) )
        goto fail;

    return frame;

fail:
    x264_free( frame );
    return NULL;
}

void x264_frame_delete( x264_frame_t *frame )
{
    /* Duplicate frames are blank copies of real frames (including pointers),
     * so freeing those pointers would cause a double free later. */
    if( !frame->b_duplicate )
    {
        for( int i = 0; i < 4; i++ )
        {
            x264_free( frame->buffer[i] );
            x264_free( frame->buffer_fld[i] );
        }
        for( int i = 0; i < 4; i++ )
            x264_free( frame->buffer_lowres[i] );
        for( int i = 0; i < X264_BFRAME_MAX+2; i++ )
            for( int j = 0; j < X264_BFRAME_MAX+2; j++ )
                x264_free( frame->i_row_satds[i][j] );
        for( int j = 0; j < 2; j++ )
            for( int i = 0; i <= X264_BFRAME_MAX; i++ )
            {
                x264_free( frame->lowres_mvs[j][i] );
                x264_free( frame->lowres_mv_costs[j][i] );
            }
        x264_free( frame->i_propagate_cost );
        for( int j = 0; j <= X264_BFRAME_MAX+1; j++ )
            for( int i = 0; i <= X264_BFRAME_MAX+1; i++ )
                x264_free( frame->lowres_costs[j][i] );
        x264_free( frame->f_qp_offset );
        x264_free( frame->f_qp_offset_aq );
        x264_free( frame->i_inv_qscale_factor );
        x264_free( frame->i_row_bits );
        x264_free( frame->f_row_qp );
        x264_free( frame->f_row_qscale );
        x264_free( frame->field );
        x264_free( frame->mb_type );
        x264_free( frame->mb_partition );
        x264_free( frame->mv[0] );
        x264_free( frame->mv[1] );
        if( frame->mv16x16 )
            x264_free( frame->mv16x16-1 );
        x264_free( frame->ref[0] );
        x264_free( frame->ref[1] );
        x264_pthread_mutex_destroy( &frame->mutex );
        x264_pthread_cond_destroy( &frame->cv );
    }
    x264_free( frame );
}

static int get_plane_ptr( x264_t *h, x264_picture_t *src, uint8_t **pix, int *stride, int plane, int xshift, int yshift )
{
    int width = h->param.i_width >> xshift;
    int height = h->param.i_height >> yshift;
    *pix = src->img.plane[plane];
    *stride = src->img.i_stride[plane];
    if( src->img.i_csp & X264_CSP_VFLIP )
    {
        *pix += (height-1) * *stride;
        *stride = -*stride;
    }
    if( width > abs(*stride) )
    {
        x264_log( h, X264_LOG_ERROR, "Input picture width (%d) is greater than stride (%d)\n", width, *stride );
        return -1;
    }
    return 0;
}

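/* Shadow the function above with a same-named macro so that each call site
 * below automatically returns -1 when fetching a plane pointer fails. */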
#define get_plane_ptr(...) do{ if( get_plane_ptr(__VA_ARGS__) < 0 ) return -1; }while(0)

int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src )
{
    int i_csp = src->img.i_csp & X264_CSP_MASK;
    if( i_csp <= X264_CSP_NONE || i_csp >= X264_CSP_MAX )
    {
        x264_log( h, X264_LOG_ERROR, "Invalid input colorspace\n" );
        return -1;
    }

#if HIGH_BIT_DEPTH
    if( !(src->img.i_csp & X264_CSP_HIGH_DEPTH) )
    {
        x264_log( h, X264_LOG_ERROR, "This build of x264 requires high depth input. Rebuild to support 8-bit input.\n" );
        return -1;
    }
#else
    if( src->img.i_csp & X264_CSP_HIGH_DEPTH )
    {
        x264_log( h, X264_LOG_ERROR, "This build of x264 requires 8-bit input. Rebuild to support high depth input.\n" );
        return -1;
    }
#endif

    dst->i_type     = src->i_type;
    dst->i_qpplus1  = src->i_qpplus1;
    dst->i_pts      = dst->i_reordered_pts = src->i_pts;
    dst->param      = src->param;
    dst->i_pic_struct = src->i_pic_struct;
    dst->extra_sei  = src->extra_sei;

    uint8_t *pix[3];
    int stride[3];
    if( i_csp >= X264_CSP_BGR )
    {
        stride[0] = src->img.i_stride[0];
        pix[0] = src->img.plane[0];
        if( src->img.i_csp & X264_CSP_VFLIP )
        {
            pix[0] += (h->param.i_height-1) * stride[0];
            stride[0] = -stride[0];
        }
        int b = i_csp==X264_CSP_RGB ? 2 : 0;
        h->mc.plane_copy_deinterleave_rgb( dst->plane[1], dst->i_stride[1],
                                           dst->plane[b], dst->i_stride[b],
                                           dst->plane[2-b], dst->i_stride[2-b],
                                           (pixel*)pix[0], stride[0]/sizeof(pixel), i_csp==X264_CSP_BGRA ? 4 : 3, h->param.i_width, h->param.i_height );
    }
    }
    else
    {
        int v_shift = CHROMA_V_SHIFT;
        get_plane_ptr( h, src, &pix[0], &stride[0], 0, 0, 0 );
        h->mc.plane_copy( dst->plane[0], dst->i_stride[0], (pixel*)pix[0],
                          stride[0]/sizeof(pixel), h->param.i_width, h->param.i_height );
        if( i_csp == X264_CSP_NV12 || i_csp == X264_CSP_NV16 )
        {
            get_plane_ptr( h, src, &pix[1], &stride[1], 1, 0, v_shift );
            h->mc.plane_copy( dst->plane[1], dst->i_stride[1], (pixel*)pix[1],
                              stride[1]/sizeof(pixel), h->param.i_width, h->param.i_height>>v_shift );
        }
        else if( i_csp == X264_CSP_I420 || i_csp == X264_CSP_I422 || i_csp == X264_CSP_YV12 || i_csp == X264_CSP_YV16 )
        {
            int uv_swap = i_csp == X264_CSP_YV12 || i_csp == X264_CSP_YV16;
            get_plane_ptr( h, src, &pix[1], &stride[1], uv_swap ? 2 : 1, 1, v_shift );
            get_plane_ptr( h, src, &pix[2], &stride[2], uv_swap ? 1 : 2, 1, v_shift );
            h->mc.plane_copy_interleave( dst->plane[1], dst->i_stride[1],
                                         (pixel*)pix[1], stride[1]/sizeof(pixel),
                                         (pixel*)pix[2], stride[2]/sizeof(pixel),
                                         h->param.i_width>>1, h->param.i_height>>v_shift );
        }
        else //if( i_csp == X264_CSP_I444 || i_csp == X264_CSP_YV24 )
        {
            get_plane_ptr( h, src, &pix[1], &stride[1], i_csp==X264_CSP_I444 ? 1 : 2, 0, 0 );
            get_plane_ptr( h, src, &pix[2], &stride[2], i_csp==X264_CSP_I444 ? 2 : 1, 0, 0 );
            h->mc.plane_copy( dst->plane[1], dst->i_stride[1], (pixel*)pix[1],
                              stride[1]/sizeof(pixel), h->param.i_width, h->param.i_height );
            h->mc.plane_copy( dst->plane[2], dst->i_stride[2], (pixel*)pix[2],
                              stride[2]/sizeof(pixel), h->param.i_width, h->param.i_height );
        }
    }
    return 0;
}

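/* Fill 'len' elements of 'size' bytes each (1, 2 or 4: 8/16-bit pixels,
 * optionally doubled for interleaved chroma) with the pattern at 'src',
 * using word-sized stores wherever alignment allows. */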
static void ALWAYS_INLINE pixel_memset( pixel *dst, pixel *src, int len, int size )
{
    uint8_t *dstp = (uint8_t*)dst;
    uint32_t v1 = *src;
    uint32_t v2 = size == 1 ? v1 + (v1 <<  8) : M16( src );
    uint32_t v4 = size <= 2 ? v2 + (v2 << 16) : M32( src );
    int i = 0;
    len *= size;

    /* Align the input pointer if it isn't already */
    if( (intptr_t)dstp & (WORD_SIZE - 1) )
    {
        if( size <= 2 && ((intptr_t)dstp & 3) )
        {
            if( size == 1 && ((intptr_t)dstp & 1) )
                dstp[i++] = v1;
            if( (intptr_t)dstp & 2 )
            {
                M16( dstp+i ) = v2;
                i += 2;
            }
        }
        if( WORD_SIZE == 8 && (intptr_t)dstp & 4 )
        {
            M32( dstp+i ) = v4;
            i += 4;
        }
    }

    /* Main copy loop */
    if( WORD_SIZE == 8 )
    {
        uint64_t v8 = v4 + ((uint64_t)v4<<32);
        for( ; i < len - 7; i+=8 )
            M64( dstp+i ) = v8;
    }
    for( ; i < len - 3; i+=4 )
        M32( dstp+i ) = v4;

    /* Finish up the last few bytes */
    if( size <= 2 )
    {
        if( i < len - 1 )
        {
            M16( dstp+i ) = v2;
            i += 2;
        }
        if( size == 1 && i != len )
            dstp[i] = v1;
    }
}

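/* Replicate a plane's outermost pixels into the surrounding padding: each
 * row's first and last pixel (or chroma pixel pair) is smeared into the left
 * and right bands, then whole edge rows are copied into the top and bottom
 * bands. This gives motion estimation/compensation valid data for
 * unrestricted MVs that point outside the frame. */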
static void ALWAYS_INLINE plane_expand_border( pixel *pix, int i_stride, int i_width, int i_height, int i_padh, int i_padv, int b_pad_top, int b_pad_bottom, int b_chroma )
{
#define PPIXEL(x, y) ( pix + (x) + (y)*i_stride )
    for( int y = 0; y < i_height; y++ )
    {
        /* left band */
        pixel_memset( PPIXEL(-i_padh, y), PPIXEL(0, y), i_padh>>b_chroma, sizeof(pixel)<<b_chroma );
        /* right band */
        pixel_memset( PPIXEL(i_width, y), PPIXEL(i_width-1-b_chroma, y), i_padh>>b_chroma, sizeof(pixel)<<b_chroma );
    }
    /* upper band */
    if( b_pad_top )
        for( int y = 0; y < i_padv; y++ )
            memcpy( PPIXEL(-i_padh, -y-1), PPIXEL(-i_padh, 0), (i_width+2*i_padh) * sizeof(pixel) );
    /* lower band */
    if( b_pad_bottom )
        for( int y = 0; y < i_padv; y++ )
            memcpy( PPIXEL(-i_padh, i_height+y), PPIXEL(-i_padh, i_height-1), (i_width+2*i_padh) * sizeof(pixel) );
#undef PPIXEL
}

void x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
{
    int b_start = !mb_y;
    if( mb_y & SLICE_MBAFF )
        return;
    for( int i = 0; i < frame->i_plane; i++ )
    {
        int h_shift = i && CHROMA_H_SHIFT;
        int v_shift = i && CHROMA_V_SHIFT;
        int stride = frame->i_stride[i];
        int width = 16*h->mb.i_mb_width;
        int height = (b_end ? 16*(h->mb.i_mb_height - mb_y) >> SLICE_MBAFF : 16) >> v_shift;
        int padh = PADH;
        int padv = PADV >> v_shift;
        // buffer: 2 chroma, 3 luma (rounded to 4) because deblocking goes beyond the top of the mb
        if( b_end && !b_start )
            height += 4 >> (v_shift + SLICE_MBAFF);
        pixel *pix;
        if( SLICE_MBAFF )
        {
            // border samples for each field are extended separately
            pix = frame->plane_fld[i] + X264_MAX(0, (16*mb_y-4)*stride >> v_shift);
            plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end, h_shift );
            plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end, h_shift );

            height = (b_end ? 16*(h->mb.i_mb_height - mb_y) : 32) >> v_shift;
            if( b_end && !b_start )
                height += 4 >> v_shift;
            pix = frame->plane[i] + X264_MAX(0, (16*mb_y-4)*stride >> v_shift);
            plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end, h_shift );
        }
        else
        {
            pix = frame->plane[i] + X264_MAX(0, (16*mb_y-4)*stride >> v_shift);
            plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end, h_shift );
        }
    }
}

void x264_frame_expand_border_filtered( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
{
    /* during filtering, 8 extra pixels were filtered on each edge,
     * but up to 3 of the horizontal ones may be wrong.
     * We want to expand the border from the last filtered pixel. */
    int b_start = !mb_y;
    int width = 16*h->mb.i_mb_width + 8;
    int height = b_end ? (16*(h->mb.i_mb_height - mb_y) >> SLICE_MBAFF) + 16 : 16;
    int padh = PADH - 4;
    int padv = PADV - 8;
    for( int p = 0; p < (CHROMA444 ? 3 : 1); p++ )
        for( int i = 1; i < 4; i++ )
        {
            int stride = frame->i_stride[p];
            // buffer: 8 luma, to match the hpel filter
            pixel *pix;
            if( SLICE_MBAFF )
            {
                pix = frame->filtered_fld[p][i] + (16*mb_y - 16) * stride - 4;
                plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end, 0 );
                plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end, 0 );
            }

            pix = frame->filtered[p][i] + (16*mb_y - 8) * stride - 4;
            plane_expand_border( pix, stride, width, height << SLICE_MBAFF, padh, padv, b_start, b_end, 0 );
        }
}

void x264_frame_expand_border_lowres( x264_frame_t *frame )
{
    for( int i = 0; i < 4; i++ )
        plane_expand_border( frame->lowres[i], frame->i_stride_lowres, frame->i_width_lowres, frame->i_lines_lowres, PADH, PADV, 1, 1, 0 );
}

void x264_frame_expand_border_chroma( x264_t *h, x264_frame_t *frame, int plane )
{
    int v_shift = CHROMA_V_SHIFT;
    plane_expand_border( frame->plane[plane], frame->i_stride[plane], 16*h->mb.i_mb_width, 16*h->mb.i_mb_height>>v_shift,
                         PADH, PADV>>v_shift, 1, 1, CHROMA_H_SHIFT );
}

void x264_frame_expand_border_mod16( x264_t *h, x264_frame_t *frame )
{
    for( int i = 0; i < frame->i_plane; i++ )
    {
        int i_width = h->param.i_width;
        int h_shift = i && CHROMA_H_SHIFT;
        int v_shift = i && CHROMA_V_SHIFT;
        int i_height = h->param.i_height >> v_shift;
        int i_padx = (h->mb.i_mb_width * 16 - h->param.i_width);
        int i_pady = (h->mb.i_mb_height * 16 - h->param.i_height) >> v_shift;

        if( i_padx )
        {
            for( int y = 0; y < i_height; y++ )
                pixel_memset( &frame->plane[i][y*frame->i_stride[i] + i_width],
                              &frame->plane[i][y*frame->i_stride[i] + i_width - 1-h_shift],
                              i_padx>>h_shift, sizeof(pixel)<<h_shift );
        }
        if( i_pady )
        {
            for( int y = i_height; y < i_height + i_pady; y++ )
                memcpy( &frame->plane[i][y*frame->i_stride[i]],
                        &frame->plane[i][(i_height-(~y&PARAM_INTERLACED)-1)*frame->i_stride[i]],
                        (i_width + i_padx) * sizeof(pixel) );
        }
    }
}

void x264_expand_border_mbpair( x264_t *h, int mb_x, int mb_y )
{
    for( int i = 0; i < h->fenc->i_plane; i++ )
    {
        int v_shift = i && CHROMA_V_SHIFT;
        int stride = h->fenc->i_stride[i];
        int height = h->param.i_height >> v_shift;
        int pady = (h->mb.i_mb_height * 16 - h->param.i_height) >> v_shift;
        pixel *fenc = h->fenc->plane[i] + 16*mb_x;
        for( int y = height; y < height + pady; y++ )
            memcpy( fenc + y*stride, fenc + (height-1)*stride, 16*sizeof(pixel) );
    }
}

/* threading */
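/* (Editor's note) Per-frame progress signaling for threaded encoding: the
 * thread encoding this frame broadcasts i_lines_completed as rows finish,
 * and threads that read it as a reference wait below until the rows they
 * need are complete. */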
void x264_frame_cond_broadcast( x264_frame_t *frame, int i_lines_completed )
{
    x264_pthread_mutex_lock( &frame->mutex );
    frame->i_lines_completed = i_lines_completed;
    x264_pthread_cond_broadcast( &frame->cv );
    x264_pthread_mutex_unlock( &frame->mutex );
}

void x264_frame_cond_wait( x264_frame_t *frame, int i_lines_completed )
{
    x264_pthread_mutex_lock( &frame->mutex );
    while( frame->i_lines_completed < i_lines_completed )
        x264_pthread_cond_wait( &frame->cv, &frame->mutex );
    x264_pthread_mutex_unlock( &frame->mutex );
}

/* list operators: frame lists are NULL-terminated arrays;
 * push/pop act on the tail, unshift/shift on the head */

void x264_frame_push( x264_frame_t **list, x264_frame_t *frame )
{
    int i = 0;
    while( list[i] ) i++;
    list[i] = frame;
}

x264_frame_t *x264_frame_pop( x264_frame_t **list )
{
    x264_frame_t *frame;
    int i = 0;
    assert( list[0] );
    while( list[i+1] ) i++;
    frame = list[i];
    list[i] = NULL;
    return frame;
}

void x264_frame_unshift( x264_frame_t **list, x264_frame_t *frame )
{
    int i = 0;
    while( list[i] ) i++;
    while( i-- )
        list[i+1] = list[i];
    list[0] = frame;
}

x264_frame_t *x264_frame_shift( x264_frame_t **list )
{
    x264_frame_t *frame = list[0];
    int i;
    for( i = 0; list[i]; i++ )
        list[i] = list[i+1];
    assert(frame);
    return frame;
}

void x264_frame_push_unused( x264_t *h, x264_frame_t *frame )
{
    assert( frame->i_reference_count > 0 );
    frame->i_reference_count--;
    if( frame->i_reference_count == 0 )
        x264_frame_push( h->frames.unused[frame->b_fdec], frame );
}

x264_frame_t *x264_frame_pop_unused( x264_t *h, int b_fdec )
{
    x264_frame_t *frame;
    if( h->frames.unused[b_fdec][0] )
        frame = x264_frame_pop( h->frames.unused[b_fdec] );
    else
        frame = x264_frame_new( h, b_fdec );
    if( !frame )
        return NULL;
    frame->b_last_minigop_bframe = 0;
    frame->i_reference_count = 1;
    frame->b_intra_calculated = 0;
    frame->b_scenecut = 1;
    frame->b_keyframe = 0;
    frame->b_corrupt = 0;

    memset( frame->weight, 0, sizeof(frame->weight) );
    memset( frame->f_weighted_cost_delta, 0, sizeof(frame->f_weighted_cost_delta) );

    return frame;
}

void x264_frame_push_blank_unused( x264_t *h, x264_frame_t *frame )
{
    assert( frame->i_reference_count > 0 );
    frame->i_reference_count--;
    if( frame->i_reference_count == 0 )
        x264_frame_push( h->frames.blank_unused, frame );
}

x264_frame_t *x264_frame_pop_blank_unused( x264_t *h )
{
    x264_frame_t *frame;
    if( h->frames.blank_unused[0] )
        frame = x264_frame_pop( h->frames.blank_unused );
    else
        frame = x264_malloc( sizeof(x264_frame_t) );
    if( !frame )
        return NULL;
    frame->b_duplicate = 1;
    frame->i_reference_count = 1;
    return frame;
}

void x264_weight_scale_plane( x264_t *h, pixel *dst, int i_dst_stride, pixel *src, int i_src_stride,
                         int i_width, int i_height, x264_weight_t *w )
{
    /* Weight horizontal strips of height 16. This was found to be the optimal
     * height in terms of cache loads. */
    while( i_height > 0 )
    {
        int x;
        for( x = 0; x < i_width-8; x += 16 )
            w->weightfn[16>>2]( dst+x, i_dst_stride, src+x, i_src_stride, w, X264_MIN( i_height, 16 ) );
        if( x < i_width )
            w->weightfn[ 8>>2]( dst+x, i_dst_stride, src+x, i_src_stride, w, X264_MIN( i_height, 16 ) );
        i_height -= 16;
        dst += 16 * i_dst_stride;
        src += 16 * i_src_stride;
    }
}

void x264_frame_delete_list( x264_frame_t **list )
{
    int i = 0;
    if( !list )
        return;
    while( list[i] )
        x264_frame_delete( list[i++] );
    x264_free( list );
}

int x264_sync_frame_list_init( x264_sync_frame_list_t *slist, int max_size )
{
    if( max_size < 0 )
        return -1;
    slist->i_max_size = max_size;
    slist->i_size = 0;
    CHECKED_MALLOCZERO( slist->list, (max_size+1) * sizeof(x264_frame_t*) );
    if( x264_pthread_mutex_init( &slist->mutex, NULL ) ||
        x264_pthread_cond_init( &slist->cv_fill, NULL ) ||
        x264_pthread_cond_init( &slist->cv_empty, NULL ) )
        return -1;
    return 0;
fail:
    return -1;
}

void x264_sync_frame_list_delete( x264_sync_frame_list_t *slist )
{
    x264_pthread_mutex_destroy( &slist->mutex );
    x264_pthread_cond_destroy( &slist->cv_fill );
    x264_pthread_cond_destroy( &slist->cv_empty );
    x264_frame_delete_list( slist->list );
}

void x264_sync_frame_list_push( x264_sync_frame_list_t *slist, x264_frame_t *frame )
{
    x264_pthread_mutex_lock( &slist->mutex );
    while( slist->i_size == slist->i_max_size )
        x264_pthread_cond_wait( &slist->cv_empty, &slist->mutex );
    slist->list[ slist->i_size++ ] = frame;
    x264_pthread_mutex_unlock( &slist->mutex );
    x264_pthread_cond_broadcast( &slist->cv_fill );
}

x264_frame_t *x264_sync_frame_list_pop( x264_sync_frame_list_t *slist )
{
    x264_frame_t *frame;
    x264_pthread_mutex_lock( &slist->mutex );
    while( !slist->i_size )
        x264_pthread_cond_wait( &slist->cv_fill, &slist->mutex );
    frame = slist->list[ --slist->i_size ];
    slist->list[ slist->i_size ] = NULL;
    x264_pthread_cond_broadcast( &slist->cv_empty );
    x264_pthread_mutex_unlock( &slist->mutex );
    return frame;
}
x264-snapshot-20120103-2245-stable/common/deblock.c
/*****************************************************************************
 * deblock.c: deblocking
 *****************************************************************************
 * Copyright (C) 2003-2011 x264 project
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
 *          Jason Garrett-Glaser <darkshikari@gmail.com>
 *          Henrik Gramner <hengar-6@student.ltu.se>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "common.h"

/* Deblocking filter */
static const uint8_t i_alpha_table[52+12*3] =
{
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  4,  4,  5,  6,
     7,  8,  9, 10, 12, 13, 15, 17, 20, 22,
    25, 28, 32, 36, 40, 45, 50, 56, 63, 71,
    80, 90,101,113,127,144,162,182,203,226,
   255,255,
   255,255,255,255,255,255,255,255,255,255,255,255,
};
static const uint8_t i_beta_table[52+12*3] =
{
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  2,  2,  2,  3,
     3,  3,  3,  4,  4,  4,  6,  6,  7,  7,
     8,  8,  9,  9, 10, 10, 11, 11, 12, 12,
    13, 13, 14, 14, 15, 15, 16, 16, 17, 17,
    18, 18,
    18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
};
static const int8_t i_tc0_table[52+12*3][4] =
{
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 1 },
    {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 1, 1 }, {-1, 0, 1, 1 }, {-1, 1, 1, 1 },
    {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 },
    {-1, 1, 1, 2 }, {-1, 1, 2, 3 }, {-1, 1, 2, 3 }, {-1, 2, 2, 3 }, {-1, 2, 2, 4 }, {-1, 2, 3, 4 },
    {-1, 2, 3, 4 }, {-1, 3, 3, 5 }, {-1, 3, 4, 6 }, {-1, 3, 4, 6 }, {-1, 4, 5, 7 }, {-1, 4, 5, 8 },
    {-1, 4, 6, 9 }, {-1, 5, 7,10 }, {-1, 6, 8,11 }, {-1, 6, 8,13 }, {-1, 7,10,14 }, {-1, 8,11,16 },
    {-1, 9,12,18 }, {-1,10,13,20 }, {-1,11,15,23 }, {-1,13,17,25 },
    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
};
#define alpha_table(x) i_alpha_table[(x)+24]
#define beta_table(x)  i_beta_table[(x)+24]
#define tc0_table(x)   i_tc0_table[(x)+24]
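
/* (Editor's note) The lookup macros bias their index by +24 and the tables
 * carry 12*3 entries of padding, so indices pushed below 0 or above 51 by
 * the alpha/beta offsets and the high-bit-depth QP_BD_OFFSET still land
 * inside the tables. */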

/* From FFmpeg */
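/* The core H.264 inter luma filter: delta approximates
 * (q0-p0)/2 + (p1-q1)/8, clipped to +/-tc, and is applied with opposite
 * signs to p0 and q0; p1/q1 are additionally corrected (and tc widened)
 * when the secondary |p2-p0| / |q2-q0| beta checks pass. */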
static ALWAYS_INLINE void deblock_edge_luma_c( pixel *pix, int xstride, int alpha, int beta, int8_t tc0 )
{
    int p2 = pix[-3*xstride];
    int p1 = pix[-2*xstride];
    int p0 = pix[-1*xstride];
    int q0 = pix[ 0*xstride];
    int q1 = pix[ 1*xstride];
    int q2 = pix[ 2*xstride];

    if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
    {
        int tc = tc0;
        int delta;
        if( abs( p2 - p0 ) < beta )
        {
            if( tc0 )
                pix[-2*xstride] = p1 + x264_clip3( (( p2 + ((p0 + q0 + 1) >> 1)) >> 1) - p1, -tc0, tc0 );
            tc++;
        }
        if( abs( q2 - q0 ) < beta )
        {
            if( tc0 )
                pix[ 1*xstride] = q1 + x264_clip3( (( q2 + ((p0 + q0 + 1) >> 1)) >> 1) - q1, -tc0, tc0 );
            tc++;
        }

        delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
        pix[-1*xstride] = x264_clip_pixel( p0 + delta );    /* p0' */
        pix[ 0*xstride] = x264_clip_pixel( q0 - delta );    /* q0' */
    }
}
static inline void deblock_luma_c( pixel *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 )
{
    for( int i = 0; i < 4; i++ )
    {
        if( tc0[i] < 0 )
        {
            pix += 4*ystride;
            continue;
        }
        for( int d = 0; d < 4; d++, pix += ystride )
            deblock_edge_luma_c( pix, xstride, alpha, beta, tc0[i] );
    }
}
static void deblock_h_luma_mbaff_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
    for( int d = 0; d < 8; d++, pix += stride )
        deblock_edge_luma_c( pix, 1, alpha, beta, tc0[d>>1] );
}
static void deblock_v_luma_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
    deblock_luma_c( pix, stride, 1, alpha, beta, tc0 );
}
static void deblock_h_luma_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
    deblock_luma_c( pix, 1, stride, alpha, beta, tc0 );
}

static ALWAYS_INLINE void deblock_edge_chroma_c( pixel *pix, int xstride, int alpha, int beta, int8_t tc )
{
    int p1 = pix[-2*xstride];
    int p0 = pix[-1*xstride];
    int q0 = pix[ 0*xstride];
    int q1 = pix[ 1*xstride];

    if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
    {
        int delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
        pix[-1*xstride] = x264_clip_pixel( p0 + delta );    /* p0' */
        pix[ 0*xstride] = x264_clip_pixel( q0 - delta );    /* q0' */
    }
}
static ALWAYS_INLINE void deblock_chroma_c( pixel *pix, int height, int xstride, int ystride, int alpha, int beta, int8_t *tc0 )
{
    for( int i = 0; i < 4; i++ )
    {
        int tc = tc0[i];
        if( tc <= 0 )
        {
            pix += height*ystride;
            continue;
        }
        for( int d = 0; d < height; d++, pix += ystride-2 )
            for( int e = 0; e < 2; e++, pix++ )
                deblock_edge_chroma_c( pix, xstride, alpha, beta, tc0[i] );
    }
}
static void deblock_h_chroma_mbaff_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
    for( int i = 0; i < 4; i++, pix += stride )
        deblock_edge_chroma_c( pix, 2, alpha, beta, tc0[i] );
}
static void deblock_h_chroma_422_mbaff_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
    for( int i = 0; i < 8; i++, pix += stride )
        deblock_edge_chroma_c( pix, 2, alpha, beta, tc0[i>>1] );
}
static void deblock_v_chroma_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
    deblock_chroma_c( pix, 2, stride, 2, alpha, beta, tc0 );
}
static void deblock_h_chroma_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
    deblock_chroma_c( pix, 2, 2, stride, alpha, beta, tc0 );
}
static void deblock_h_chroma_422_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
    deblock_chroma_c( pix, 4, 2, stride, alpha, beta, tc0 );
}

static ALWAYS_INLINE void deblock_edge_luma_intra_c( pixel *pix, int xstride, int alpha, int beta )
{
    int p2 = pix[-3*xstride];
    int p1 = pix[-2*xstride];
    int p0 = pix[-1*xstride];
    int q0 = pix[ 0*xstride];
    int q1 = pix[ 1*xstride];
    int q2 = pix[ 2*xstride];

    if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
    {
        if( abs( p0 - q0 ) < ((alpha >> 2) + 2) )
        {
            if( abs( p2 - p0 ) < beta ) /* p0', p1', p2' */
            {
                const int p3 = pix[-4*xstride];
                pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
                pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
                pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
            }
            else /* p0' */
                pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
            if( abs( q2 - q0 ) < beta ) /* q0', q1', q2' */
            {
                const int q3 = pix[3*xstride];
                pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
                pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
                pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
            }
            else /* q0' */
                pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
        }
        else /* p0', q0' */
        {
            pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
            pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
        }
    }
}
static inline void deblock_luma_intra_c( pixel *pix, int xstride, int ystride, int alpha, int beta )
{
    for( int d = 0; d < 16; d++, pix += ystride )
        deblock_edge_luma_intra_c( pix, xstride, alpha, beta );
}
static void deblock_h_luma_intra_mbaff_c( pixel *pix, int ystride, int alpha, int beta )
{
    for( int d = 0; d < 8; d++, pix += ystride )
        deblock_edge_luma_intra_c( pix, 1, alpha, beta );
}
static void deblock_v_luma_intra_c( pixel *pix, int stride, int alpha, int beta )
{
    deblock_luma_intra_c( pix, stride, 1, alpha, beta );
}
static void deblock_h_luma_intra_c( pixel *pix, int stride, int alpha, int beta )
{
    deblock_luma_intra_c( pix, 1, stride, alpha, beta );
}

static ALWAYS_INLINE void deblock_edge_chroma_intra_c( pixel *pix, int xstride, int alpha, int beta )
{
    int p1 = pix[-2*xstride];
    int p0 = pix[-1*xstride];
    int q0 = pix[ 0*xstride];
    int q1 = pix[ 1*xstride];

    if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
    {
        pix[-1*xstride] = (2*p1 + p0 + q1 + 2) >> 2;   /* p0' */
        pix[ 0*xstride] = (2*q1 + q0 + p1 + 2) >> 2;   /* q0' */
    }
}
static ALWAYS_INLINE void deblock_chroma_intra_c( pixel *pix, int width, int height, int xstride, int ystride, int alpha, int beta )
{
    for( int d = 0; d < height; d++, pix += ystride-2 )
        for( int e = 0; e < width; e++, pix++ )
            deblock_edge_chroma_intra_c( pix, xstride, alpha, beta );
}
static void deblock_h_chroma_intra_mbaff_c( pixel *pix, int stride, int alpha, int beta )
{
    for( int i = 0; i < 4; i++, pix += stride )
        deblock_edge_chroma_intra_c( pix, 2, alpha, beta );
}
static void deblock_h_chroma_422_intra_mbaff_c( pixel *pix, int stride, int alpha, int beta )
{
    for( int i = 0; i < 8; i++, pix += stride )
        deblock_edge_chroma_intra_c( pix, 2, alpha, beta );
}
static void deblock_v_chroma_intra_c( pixel *pix, int stride, int alpha, int beta )
{
    deblock_chroma_intra_c( pix, 1, 16, stride, 2, alpha, beta );
}
static void deblock_h_chroma_intra_c( pixel *pix, int stride, int alpha, int beta )
{
    deblock_chroma_intra_c( pix, 2, 8, 2, stride, alpha, beta );
}
static void deblock_h_chroma_422_intra_c( pixel *pix, int stride, int alpha, int beta )
{
    deblock_chroma_intra_c( pix, 2, 16, 2, stride, alpha, beta );
}

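/* Compute boundary strengths for the 4x4 edges of one macroblock:
 * bs = 2 if either adjacent block has nonzero coefficients, bs = 1 if the
 * blocks differ in reference or their MVs differ by >= 4 quarter-pel units
 * horizontally (or >= mvy_limit vertically), else bs = 0. */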
static void deblock_strength_c( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
                                int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], int mvy_limit,
                                int bframe )
{
    for( int dir = 0; dir < 2; dir++ )
    {
        int s1 = dir ? 1 : 8;
        int s2 = dir ? 8 : 1;
        for( int edge = 0; edge < 4; edge++ )
            for( int i = 0, loc = X264_SCAN8_0+edge*s2; i < 4; i++, loc += s1 )
            {
                int locn = loc - s2;
                if( nnz[loc] || nnz[locn] )
                    bs[dir][edge][i] = 2;
                else if( ref[0][loc] != ref[0][locn] ||
                         abs( mv[0][loc][0] - mv[0][locn][0] ) >= 4 ||
                         abs( mv[0][loc][1] - mv[0][locn][1] ) >= mvy_limit ||
                        (bframe && (ref[1][loc] != ref[1][locn] ||
                         abs( mv[1][loc][0] - mv[1][locn][0] ) >= 4 ||
                         abs( mv[1][loc][1] - mv[1][locn][1] ) >= mvy_limit )))
                {
                    bs[dir][edge][i] = 1;
                }
                else
                    bs[dir][edge][i] = 0;
            }
    }
}

static ALWAYS_INLINE void deblock_edge( x264_t *h, pixel *pix, int i_stride, uint8_t bS[4], int i_qp, int a, int b, int b_chroma, x264_deblock_inter_t pf_inter )
{
    int index_a = i_qp + a;
    int index_b = i_qp + b;
    int alpha = alpha_table(index_a) << (BIT_DEPTH-8);
    int beta  = beta_table(index_b) << (BIT_DEPTH-8);
    int8_t tc[4];

    if( !M32(bS) || !alpha || !beta )
        return;

    tc[0] = (tc0_table(index_a)[bS[0]] << (BIT_DEPTH-8)) + b_chroma;
    tc[1] = (tc0_table(index_a)[bS[1]] << (BIT_DEPTH-8)) + b_chroma;
    tc[2] = (tc0_table(index_a)[bS[2]] << (BIT_DEPTH-8)) + b_chroma;
    tc[3] = (tc0_table(index_a)[bS[3]] << (BIT_DEPTH-8)) + b_chroma;

    pf_inter( pix, i_stride, alpha, beta, tc );
}

static ALWAYS_INLINE void deblock_edge_intra( x264_t *h, pixel *pix, int i_stride, uint8_t bS[4], int i_qp, int a, int b, int b_chroma, x264_deblock_intra_t pf_intra )
{
    int index_a = i_qp + a;
    int index_b = i_qp + b;
    int alpha = alpha_table(index_a) << (BIT_DEPTH-8);
    int beta  = beta_table(index_b) << (BIT_DEPTH-8);

    if( !alpha || !beta )
        return;

    pf_intra( pix, i_stride, alpha, beta );
}

static ALWAYS_INLINE void x264_macroblock_cache_load_neighbours_deblock( x264_t *h, int mb_x, int mb_y )
{
    int deblock_on_slice_edges = h->sh.i_disable_deblocking_filter_idc != 2;

    h->mb.i_neighbour = 0;
    h->mb.i_mb_xy = mb_y * h->mb.i_mb_stride + mb_x;
    h->mb.b_interlaced = PARAM_INTERLACED && h->mb.field[h->mb.i_mb_xy];
    h->mb.i_mb_top_y = mb_y - (1 << MB_INTERLACED);
    h->mb.i_mb_top_xy = mb_x + h->mb.i_mb_stride*h->mb.i_mb_top_y;
    h->mb.i_mb_left_xy[1] =
    h->mb.i_mb_left_xy[0] = h->mb.i_mb_xy - 1;
    if( SLICE_MBAFF )
    {
        if( mb_y&1 )
        {
            if( mb_x && h->mb.field[h->mb.i_mb_xy - 1] != MB_INTERLACED )
                h->mb.i_mb_left_xy[0] -= h->mb.i_mb_stride;
        }
        else
        {
            if( h->mb.i_mb_top_xy >= 0 && MB_INTERLACED && !h->mb.field[h->mb.i_mb_top_xy] )
            {
                h->mb.i_mb_top_xy += h->mb.i_mb_stride;
                h->mb.i_mb_top_y++;
            }
            if( mb_x && h->mb.field[h->mb.i_mb_xy - 1] != MB_INTERLACED )
                h->mb.i_mb_left_xy[1] += h->mb.i_mb_stride;
        }
    }

    if( mb_x > 0 && (deblock_on_slice_edges ||
        h->mb.slice_table[h->mb.i_mb_left_xy[0]] == h->mb.slice_table[h->mb.i_mb_xy]) )
        h->mb.i_neighbour |= MB_LEFT;
    if( mb_y > MB_INTERLACED && (deblock_on_slice_edges
        || h->mb.slice_table[h->mb.i_mb_top_xy] == h->mb.slice_table[h->mb.i_mb_xy]) )
        h->mb.i_neighbour |= MB_TOP;
}

void x264_frame_deblock_row( x264_t *h, int mb_y )
{
    int b_interlaced = SLICE_MBAFF;
    int a = h->sh.i_alpha_c0_offset - QP_BD_OFFSET;
    int b = h->sh.i_beta_offset - QP_BD_OFFSET;
    int qp_thresh = 15 - X264_MIN( a, b ) - X264_MAX( 0, h->pps->i_chroma_qp_index_offset );
    int stridey   = h->fdec->i_stride[0];
    int strideuv  = h->fdec->i_stride[1];
    int chroma444 = CHROMA444;
    int chroma_height = 16 >> CHROMA_V_SHIFT;
    intptr_t uvdiff = chroma444 ? h->fdec->plane[2] - h->fdec->plane[1] : 1;

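    /* (Editor's note) In MBAFF, each mb_x is visited twice: mb_y toggles
     * between the top and bottom macroblock of the vertical pair before
     * mb_x advances; in progressive frames mb_x advances every iteration. */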
    for( int mb_x = 0; mb_x < h->mb.i_mb_width; mb_x += (~b_interlaced | mb_y)&1, mb_y ^= b_interlaced )
    {
        x264_prefetch_fenc( h, h->fdec, mb_x, mb_y );
        x264_macroblock_cache_load_neighbours_deblock( h, mb_x, mb_y );

        int mb_xy = h->mb.i_mb_xy;
        int transform_8x8 = h->mb.mb_transform_size[h->mb.i_mb_xy];
        int intra_cur = IS_INTRA( h->mb.type[mb_xy] );
        uint8_t (*bs)[8][4] = h->deblock_strength[mb_y&1][mb_x];

        pixel *pixy = h->fdec->plane[0] + 16*mb_y*stridey  + 16*mb_x;
        pixel *pixuv = h->fdec->plane[1] + chroma_height*mb_y*strideuv + 16*mb_x;

        if( mb_y & MB_INTERLACED )
        {
            pixy -= 15*stridey;
            pixuv -= (chroma_height-1)*strideuv;
        }

        int stride2y  = stridey << MB_INTERLACED;
        int stride2uv = strideuv << MB_INTERLACED;
        int qp = h->mb.qp[mb_xy];
        int qpc = h->chroma_qp_table[qp];
        int first_edge_only = (h->mb.partition[mb_xy] == D_16x16 && !h->mb.cbp[mb_xy] && !intra_cur) || qp <= qp_thresh;

        #define FILTER( intra, dir, edge, qp, chroma_qp )\
        do\
        {\
            if( !(edge & 1) || !transform_8x8 )\
            {\
                deblock_edge##intra( h, pixy + 4*edge*(dir?stride2y:1),\
                                     stride2y, bs[dir][edge], qp, a, b, 0,\
                                     h->loopf.deblock_luma##intra[dir] );\
                if( CHROMA_FORMAT == CHROMA_444 )\
                {\
                    deblock_edge##intra( h, pixuv          + 4*edge*(dir?stride2uv:1),\
                                         stride2uv, bs[dir][edge], chroma_qp, a, b, 0,\
                                         h->loopf.deblock_luma##intra[dir] );\
                    deblock_edge##intra( h, pixuv + uvdiff + 4*edge*(dir?stride2uv:1),\
                                         stride2uv, bs[dir][edge], chroma_qp, a, b, 0,\
                                         h->loopf.deblock_luma##intra[dir] );\
                }\
                else if( CHROMA_FORMAT == CHROMA_420 && !(edge & 1) )\
                {\
                    deblock_edge##intra( h, pixuv + edge*(dir?2*stride2uv:4),\
                                         stride2uv, bs[dir][edge], chroma_qp, a, b, 1,\
                                         h->loopf.deblock_chroma##intra[dir] );\
                }\
            }\
            if( CHROMA_FORMAT == CHROMA_422 && (dir || !(edge & 1)) )\
            {\
                deblock_edge##intra( h, pixuv + edge*(dir?4*stride2uv:4),\
                                     stride2uv, bs[dir][edge], chroma_qp, a, b, 1,\
                                     h->loopf.deblock_chroma##intra[dir] );\
            }\
        } while(0)

        if( h->mb.i_neighbour & MB_LEFT )
        {
            if( b_interlaced && h->mb.field[h->mb.i_mb_left_xy[0]] != MB_INTERLACED )
            {
                int luma_qp[2];
                int chroma_qp[2];
                int left_qp[2];
                x264_deblock_inter_t luma_deblock = h->loopf.deblock_luma_mbaff;
                x264_deblock_inter_t chroma_deblock = h->loopf.deblock_chroma_mbaff;
                x264_deblock_intra_t luma_intra_deblock = h->loopf.deblock_luma_intra_mbaff;
                x264_deblock_intra_t chroma_intra_deblock = h->loopf.deblock_chroma_intra_mbaff;
                int c = chroma444 ? 0 : 1;

                left_qp[0] = h->mb.qp[h->mb.i_mb_left_xy[0]];
                luma_qp[0] = (qp + left_qp[0] + 1) >> 1;
                chroma_qp[0] = (qpc + h->chroma_qp_table[left_qp[0]] + 1) >> 1;
                if( intra_cur || IS_INTRA( h->mb.type[h->mb.i_mb_left_xy[0]] ) )
                {
                    deblock_edge_intra( h, pixy,           2*stridey,  bs[0][0], luma_qp[0],   a, b, 0, luma_intra_deblock );
                    deblock_edge_intra( h, pixuv,          2*strideuv, bs[0][0], chroma_qp[0], a, b, c, chroma_intra_deblock );
                    deblock_edge_intra( h, pixuv + uvdiff, 2*strideuv, bs[0][0], chroma_qp[0], a, b, c, chroma_intra_deblock );
                }
                else
                {
                    deblock_edge( h, pixy,           2*stridey,  bs[0][0], luma_qp[0],   a, b, 0, luma_deblock );
                    deblock_edge( h, pixuv,          2*strideuv, bs[0][0], chroma_qp[0], a, b, c, chroma_deblock );
                    deblock_edge( h, pixuv + uvdiff, 2*strideuv, bs[0][0], chroma_qp[0], a, b, c, chroma_deblock );
                }

                int offy = MB_INTERLACED ? 4 : 0;
                int offuv = MB_INTERLACED ? 4-CHROMA_V_SHIFT : 0;
                left_qp[1] = h->mb.qp[h->mb.i_mb_left_xy[1]];
                luma_qp[1] = (qp + left_qp[1] + 1) >> 1;
                chroma_qp[1] = (qpc + h->chroma_qp_table[left_qp[1]] + 1) >> 1;
                if( intra_cur || IS_INTRA( h->mb.type[h->mb.i_mb_left_xy[1]] ) )
                {
                    deblock_edge_intra( h, pixy           + (stridey<<offy),   2*stridey,  bs[0][4], luma_qp[1],   a, b, 0, luma_intra_deblock );
                    deblock_edge_intra( h, pixuv          + (strideuv<<offuv), 2*strideuv, bs[0][4], chroma_qp[1], a, b, c, chroma_intra_deblock );
                    deblock_edge_intra( h, pixuv + uvdiff + (strideuv<<offuv), 2*strideuv, bs[0][4], chroma_qp[1], a, b, c, chroma_intra_deblock );
                }
                else
                {
                    deblock_edge( h, pixy           + (stridey<<offy),   2*stridey,  bs[0][4], luma_qp[1],   a, b, 0, luma_deblock );
                    deblock_edge( h, pixuv          + (strideuv<<offuv), 2*strideuv, bs[0][4], chroma_qp[1], a, b, c, chroma_deblock );
                    deblock_edge( h, pixuv + uvdiff + (strideuv<<offuv), 2*strideuv, bs[0][4], chroma_qp[1], a, b, c, chroma_deblock );
                }
            }
            else
            {
                int qpl = h->mb.qp[h->mb.i_mb_xy-1];
                int qp_left = (qp + qpl + 1) >> 1;
                int qpc_left = (qpc + h->chroma_qp_table[qpl] + 1) >> 1;
                int intra_left = IS_INTRA( h->mb.type[h->mb.i_mb_xy-1] );

                if( intra_cur || intra_left )
                    FILTER( _intra, 0, 0, qp_left, qpc_left );
                else
                    FILTER(       , 0, 0, qp_left, qpc_left );
            }
        }
        if( !first_edge_only )
        {
            FILTER( , 0, 1, qp, qpc );
            FILTER( , 0, 2, qp, qpc );
            FILTER( , 0, 3, qp, qpc );
        }

        if( h->mb.i_neighbour & MB_TOP )
        {
            if( b_interlaced && !(mb_y&1) && !MB_INTERLACED && h->mb.field[h->mb.i_mb_top_xy] )
            {
                int mbn_xy = mb_xy - 2 * h->mb.i_mb_stride;

                for( int j = 0; j < 2; j++, mbn_xy += h->mb.i_mb_stride )
                {
                    int qpt = h->mb.qp[mbn_xy];
                    int qp_top = (qp + qpt + 1) >> 1;
                    int qpc_top = (qpc + h->chroma_qp_table[qpt] + 1) >> 1;
                    int intra_top = IS_INTRA( h->mb.type[mbn_xy] );
                    if( intra_cur || intra_top )
                        M32( bs[1][4*j] ) = 0x03030303;

                    // deblock the first horizontal edge of the even rows, then the first horizontal edge of the odd rows
                    deblock_edge( h, pixy      + j*stridey,  2* stridey, bs[1][4*j], qp_top, a, b, 0, h->loopf.deblock_luma[1] );
                    if( chroma444 )
                    {
                        deblock_edge( h, pixuv          + j*strideuv, 2*strideuv, bs[1][4*j], qpc_top, a, b, 0, h->loopf.deblock_luma[1] );
                        deblock_edge( h, pixuv + uvdiff + j*strideuv, 2*strideuv, bs[1][4*j], qpc_top, a, b, 0, h->loopf.deblock_luma[1] );
                    }
                    else
                        deblock_edge( h, pixuv          + j*strideuv, 2*strideuv, bs[1][4*j], qpc_top, a, b, 1, h->loopf.deblock_chroma[1] );
                }
            }
            else
            {
                int qpt = h->mb.qp[h->mb.i_mb_top_xy];
                int qp_top = (qp + qpt + 1) >> 1;
                int qpc_top = (qpc + h->chroma_qp_table[qpt] + 1) >> 1;
                int intra_top = IS_INTRA( h->mb.type[h->mb.i_mb_top_xy] );

                if( (!b_interlaced || (!MB_INTERLACED && !h->mb.field[h->mb.i_mb_top_xy]))
                    && (intra_cur || intra_top) )
                {
                    FILTER( _intra, 1, 0, qp_top, qpc_top );
                }
                else
                {
                    if( intra_cur || intra_top )
                        M32( bs[1][0] ) = 0x03030303;
                    FILTER(       , 1, 0, qp_top, qpc_top );
                }
            }
        }

        if( !first_edge_only )
        {
            FILTER( , 1, 1, qp, qpc );
            FILTER( , 1, 2, qp, qpc );
            FILTER( , 1, 3, qp, qpc );
        }

        #undef FILTER
    }
}

/* For deblock-aware RD.
 * TODO:
 *  deblock macroblock edges
 *  support analysis partitions smaller than 16x16
 *  deblock chroma for 4:2:0/4:2:2
 *  handle duplicate refs correctly
 *  handle cavlc+8x8dct correctly
 */
void x264_macroblock_deblock( x264_t *h )
{
    int a = h->sh.i_alpha_c0_offset - QP_BD_OFFSET;
    int b = h->sh.i_beta_offset - QP_BD_OFFSET;
    int qp_thresh = 15 - X264_MIN( a, b ) - X264_MAX( 0, h->pps->i_chroma_qp_index_offset );
    int intra_cur = IS_INTRA( h->mb.i_type );
    int qp = h->mb.i_qp;
    int qpc = h->mb.i_chroma_qp;
    if( (h->mb.i_partition == D_16x16 && !h->mb.i_cbp_luma && !intra_cur) || qp <= qp_thresh )
        return;

    uint8_t (*bs)[8][4] = h->deblock_strength[h->mb.i_mb_y&1][h->mb.i_mb_x];
    if( intra_cur )
    {
        memset( &bs[0][1], 3, 3*4*sizeof(uint8_t) );
        memset( &bs[1][1], 3, 3*4*sizeof(uint8_t) );
    }
    else
        h->loopf.deblock_strength( h->mb.cache.non_zero_count, h->mb.cache.ref, h->mb.cache.mv,
                                   bs, 4 >> MB_INTERLACED, h->sh.i_type == SLICE_TYPE_B );

    int transform_8x8 = h->mb.b_transform_8x8;

    #define FILTER( dir, edge )\
    do\
    {\
        deblock_edge( h, h->mb.pic.p_fdec[0] + 4*edge*(dir?FDEC_STRIDE:1),\
                      FDEC_STRIDE, bs[dir][edge], qp, a, b, 0,\
                      h->loopf.deblock_luma[dir] );\
        if( CHROMA444 )\
        {\
            deblock_edge( h, h->mb.pic.p_fdec[1] + 4*edge*(dir?FDEC_STRIDE:1),\
                          FDEC_STRIDE, bs[dir][edge], qpc, a, b, 0,\
                          h->loopf.deblock_luma[dir] );\
            deblock_edge( h, h->mb.pic.p_fdec[2] + 4*edge*(dir?FDEC_STRIDE:1),\
                          FDEC_STRIDE, bs[dir][edge], qpc, a, b, 0,\
                          h->loopf.deblock_luma[dir] );\
        }\
    } while(0)

    if( !transform_8x8 ) FILTER( 0, 1 );
                         FILTER( 0, 2 );
    if( !transform_8x8 ) FILTER( 0, 3 );

    if( !transform_8x8 ) FILTER( 1, 1 );
                         FILTER( 1, 2 );
    if( !transform_8x8 ) FILTER( 1, 3 );

    #undef FILTER
}

#if HAVE_MMX
void x264_deblock_v_luma_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v_luma_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_luma_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_luma_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v_chroma_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v_chroma_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_422_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_422_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_422_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v_luma_intra_sse2( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_v_luma_intra_avx ( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_h_luma_intra_sse2( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_h_luma_intra_avx ( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_v_chroma_intra_sse2( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_v_chroma_intra_avx ( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_h_chroma_intra_sse2( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_h_chroma_intra_avx ( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_strength_mmx2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
                                  int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
                                  int mvy_limit, int bframe );
void x264_deblock_strength_sse2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
                                  int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
                                  int mvy_limit, int bframe );
void x264_deblock_strength_ssse3( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
                                  int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
                                  int mvy_limit, int bframe );
void x264_deblock_strength_avx  ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
                                  int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
                                  int mvy_limit, int bframe );
#if ARCH_X86
void x264_deblock_h_luma_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v8_luma_mmx2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v_chroma_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_luma_intra_mmx2( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_v8_luma_intra_mmx2( uint8_t *pix, int stride, int alpha, int beta );
void x264_deblock_v_chroma_intra_mmx2( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_h_chroma_intra_mmx2( pixel *pix, int stride, int alpha, int beta );

#if HIGH_BIT_DEPTH
void x264_deblock_v_luma_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v_luma_intra_mmx2( pixel *pix, int stride, int alpha, int beta );
#else
// FIXME this wrapper has a significant cpu cost
static void x264_deblock_v_luma_mmx2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
    x264_deblock_v8_luma_mmx2( pix,   stride, alpha, beta, tc0   );
    x264_deblock_v8_luma_mmx2( pix+8, stride, alpha, beta, tc0+2 );
}
static void x264_deblock_v_luma_intra_mmx2( uint8_t *pix, int stride, int alpha, int beta )
{
    x264_deblock_v8_luma_intra_mmx2( pix,   stride, alpha, beta );
    x264_deblock_v8_luma_intra_mmx2( pix+8, stride, alpha, beta );
}
#endif // HIGH_BIT_DEPTH
#endif
#endif

#if ARCH_PPC
void x264_deblock_v_luma_altivec( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_luma_altivec( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
#endif // ARCH_PPC

#if HAVE_ARMV6
void x264_deblock_v_luma_neon( uint8_t *, int, int, int, int8_t * );
void x264_deblock_h_luma_neon( uint8_t *, int, int, int, int8_t * );
void x264_deblock_v_chroma_neon( uint8_t *, int, int, int, int8_t * );
void x264_deblock_h_chroma_neon( uint8_t *, int, int, int, int8_t * );
#endif

void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
{
    pf->deblock_luma[1] = deblock_v_luma_c;
    pf->deblock_luma[0] = deblock_h_luma_c;
    pf->deblock_chroma[1] = deblock_v_chroma_c;
    pf->deblock_h_chroma_420 = deblock_h_chroma_c;
    pf->deblock_h_chroma_422 = deblock_h_chroma_422_c;
    pf->deblock_luma_intra[1] = deblock_v_luma_intra_c;
    pf->deblock_luma_intra[0] = deblock_h_luma_intra_c;
    pf->deblock_chroma_intra[1] = deblock_v_chroma_intra_c;
    pf->deblock_h_chroma_420_intra = deblock_h_chroma_intra_c;
    pf->deblock_h_chroma_422_intra = deblock_h_chroma_422_intra_c;
    pf->deblock_luma_mbaff = deblock_h_luma_mbaff_c;
    pf->deblock_chroma_420_mbaff = deblock_h_chroma_mbaff_c;
    pf->deblock_chroma_422_mbaff = deblock_h_chroma_422_mbaff_c;
    pf->deblock_luma_intra_mbaff = deblock_h_luma_intra_mbaff_c;
    pf->deblock_chroma_420_intra_mbaff = deblock_h_chroma_intra_mbaff_c;
    pf->deblock_chroma_422_intra_mbaff = deblock_h_chroma_422_intra_mbaff_c;
    pf->deblock_strength = deblock_strength_c;

#if HAVE_MMX
    if( cpu&X264_CPU_MMX2 )
    {
#if ARCH_X86
        pf->deblock_luma[1] = x264_deblock_v_luma_mmx2;
        pf->deblock_luma[0] = x264_deblock_h_luma_mmx2;
        pf->deblock_chroma[1] = x264_deblock_v_chroma_mmx2;
        pf->deblock_h_chroma_420 = x264_deblock_h_chroma_mmx2;
#if !HIGH_BIT_DEPTH
        pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_mmx2;
#endif
        pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_mmx2;
        pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_mmx2;
        pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_mmx2;
        pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_mmx2;
#endif
        pf->deblock_strength = x264_deblock_strength_mmx2;
        if( cpu&X264_CPU_SSE2 )
        {
            pf->deblock_strength = x264_deblock_strength_sse2;
            pf->deblock_h_chroma_420 = x264_deblock_h_chroma_sse2;
#if !HIGH_BIT_DEPTH
            pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_sse2;
#endif
            if( !(cpu&X264_CPU_STACK_MOD4) )
            {
                pf->deblock_luma[1] = x264_deblock_v_luma_sse2;
                pf->deblock_luma[0] = x264_deblock_h_luma_sse2;
                pf->deblock_chroma[1] = x264_deblock_v_chroma_sse2;
                pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_sse2;
                pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_sse2;
                pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_sse2;
                pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_sse2;
            }
        }
        if( cpu&X264_CPU_SSSE3 )
            pf->deblock_strength = x264_deblock_strength_ssse3;
        if( cpu&X264_CPU_AVX )
        {
            pf->deblock_strength = x264_deblock_strength_avx;
            pf->deblock_h_chroma_420 = x264_deblock_h_chroma_avx;
#if !HIGH_BIT_DEPTH
            pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_avx;
#endif
            if( !(cpu&X264_CPU_STACK_MOD4) )
            {
                pf->deblock_luma[1] = x264_deblock_v_luma_avx;
                pf->deblock_luma[0] = x264_deblock_h_luma_avx;
                pf->deblock_chroma[1] = x264_deblock_v_chroma_avx;
                pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_avx;
                pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_avx;
                pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_avx;
                pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_avx;
            }
        }
    }
#endif

#if !HIGH_BIT_DEPTH
#if HAVE_ALTIVEC
    if( cpu&X264_CPU_ALTIVEC )
    {
        pf->deblock_luma[1] = x264_deblock_v_luma_altivec;
        pf->deblock_luma[0] = x264_deblock_h_luma_altivec;
    }
#endif // HAVE_ALTIVEC

#if HAVE_ARMV6
    if( cpu&X264_CPU_NEON )
    {
        pf->deblock_luma[1] = x264_deblock_v_luma_neon;
        pf->deblock_luma[0] = x264_deblock_h_luma_neon;
        pf->deblock_chroma[1] = x264_deblock_v_chroma_neon;
        pf->deblock_h_chroma_420 = x264_deblock_h_chroma_neon;
    }
#endif
#endif // !HIGH_BIT_DEPTH
}
x264-snapshot-20120103-2245-stable/common/common.h0000644000175000001440000007702211700673342020632 0ustar  videolanusers/*****************************************************************************
 * common.h: misc common functions
 *****************************************************************************
 * Copyright (C) 2003-2011 x264 project
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#ifndef X264_COMMON_H
#define X264_COMMON_H

/****************************************************************************
 * Macros
 ****************************************************************************/
#define X264_MIN(a,b) ( (a)<(b) ? (a) : (b) )
#define X264_MAX(a,b) ( (a)>(b) ? (a) : (b) )
#define X264_MIN3(a,b,c) X264_MIN((a),X264_MIN((b),(c)))
#define X264_MAX3(a,b,c) X264_MAX((a),X264_MAX((b),(c)))
#define X264_MIN4(a,b,c,d) X264_MIN((a),X264_MIN3((b),(c),(d)))
#define X264_MAX4(a,b,c,d) X264_MAX((a),X264_MAX3((b),(c),(d)))
#define XCHG(type,a,b) do{ type t = a; a = b; b = t; } while(0)
#define IS_DISPOSABLE(type) ( type == X264_TYPE_B )
#define FIX8(f) ((int)(f*(1<<8)+.5))
#define ALIGN(x,a) (((x)+((a)-1))&~((a)-1))

#define CHECKED_MALLOC( var, size )\
do {\
    var = x264_malloc( size );\
    if( !var )\
        goto fail;\
} while( 0 )
#define CHECKED_MALLOCZERO( var, size )\
do {\
    CHECKED_MALLOC( var, size );\
    memset( var, 0, size );\
} while( 0 )
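/* Usage sketch (illustrative, not a specific call site): the enclosing
 * function must provide the "fail:" label targeted by the goto, e.g.
 *
 *     CHECKED_MALLOCZERO( h->mb.qp, i_mb_count * sizeof(int8_t) );
 *     return 0;
 * fail:
 *     return -1;
 */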

#define X264_BFRAME_MAX 16
#define X264_REF_MAX 16
#define X264_THREAD_MAX 128
#define X264_PCM_COST (FRAME_SIZE(256*BIT_DEPTH)+16)
#define X264_LOOKAHEAD_MAX 250
#define QP_BD_OFFSET (6*(BIT_DEPTH-8))
#define QP_MAX_SPEC (51+QP_BD_OFFSET)
#define QP_MAX (QP_MAX_SPEC+18)
#define QP_MAX_MAX (51+2*6+18)
#define PIXEL_MAX ((1 << BIT_DEPTH)-1)
// arbitrary, but low because SATD scores are 1/4 normal
#define X264_LOOKAHEAD_QP (12+QP_BD_OFFSET)
#define SPEC_QP(x) X264_MIN((x), QP_MAX_SPEC)
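/* Worked example: at BIT_DEPTH=10, QP_BD_OFFSET = 6*(10-8) = 12, so
 * QP_MAX_SPEC = 63 and the encoder-internal QP_MAX = 81; SPEC_QP()
 * clips such internal values back to the bitstream-legal range. */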

// number of pixels (per thread) in progress at any given time.
// 16 for the macroblock in progress + 3 for deblocking + 3 for motion compensation filter + 2 for extra safety
#define X264_THREAD_HEIGHT 24

/* WEIGHTP_FAKE is set when mb_tree & psy are enabled, but normal weightp is disabled
 * (such as in baseline). It checks for fades in lookahead and adjusts qp accordingly
 * to increase quality. Defined as (-1) so that if(i_weighted_pred > 0) is true only when
 * real weights are being used. */

#define X264_WEIGHTP_FAKE (-1)

#define NALU_OVERHEAD 5 // startcode + NAL type costs 5 bytes per frame
#define FILLER_OVERHEAD (NALU_OVERHEAD+1)

/****************************************************************************
 * Includes
 ****************************************************************************/
#include "osdep.h"
#include <stdarg.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <limits.h>

#if HAVE_INTERLACED
#   define MB_INTERLACED h->mb.b_interlaced
#   define SLICE_MBAFF h->sh.b_mbaff
#   define PARAM_INTERLACED h->param.b_interlaced
#else
#   define MB_INTERLACED 0
#   define SLICE_MBAFF 0
#   define PARAM_INTERLACED 0
#endif

#ifdef CHROMA_FORMAT
#    define CHROMA_H_SHIFT (CHROMA_FORMAT == CHROMA_420 || CHROMA_FORMAT == CHROMA_422)
#    define CHROMA_V_SHIFT (CHROMA_FORMAT == CHROMA_420)
#else
#    define CHROMA_FORMAT h->sps->i_chroma_format_idc
#    define CHROMA_H_SHIFT h->mb.chroma_h_shift
#    define CHROMA_V_SHIFT h->mb.chroma_v_shift
#endif

#define CHROMA_SIZE(s) ((s)>>(CHROMA_H_SHIFT+CHROMA_V_SHIFT))
#define FRAME_SIZE(s) ((s)+2*CHROMA_SIZE(s))
#define CHROMA444 (CHROMA_FORMAT == CHROMA_444)
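/* Worked example: in 4:2:0 both shifts are 1, so CHROMA_SIZE(s) is s/4
 * per chroma plane and FRAME_SIZE(s) = s + 2*(s/4) = 1.5*s; in 4:4:4
 * both shifts are 0 and FRAME_SIZE(s) = 3*s. */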

/* Unions for type-punning.
 * Mn: load or store n bits, aligned, native-endian
 * CPn: copy n bits, aligned, native-endian
 * we don't use memcpy for CPn because memcpy's args aren't assumed to be aligned */
typedef union { uint16_t i; uint8_t  c[2]; } MAY_ALIAS x264_union16_t;
typedef union { uint32_t i; uint16_t b[2]; uint8_t  c[4]; } MAY_ALIAS x264_union32_t;
typedef union { uint64_t i; uint32_t a[2]; uint16_t b[4]; uint8_t c[8]; } MAY_ALIAS x264_union64_t;
typedef struct { uint64_t i[2]; } x264_uint128_t;
typedef union { x264_uint128_t i; uint64_t a[2]; uint32_t b[4]; uint16_t c[8]; uint8_t d[16]; } MAY_ALIAS x264_union128_t;
#define M16(src) (((x264_union16_t*)(src))->i)
#define M32(src) (((x264_union32_t*)(src))->i)
#define M64(src) (((x264_union64_t*)(src))->i)
#define M128(src) (((x264_union128_t*)(src))->i)
#define M128_ZERO ((x264_uint128_t){{0,0}})
#define CP16(dst,src) M16(dst) = M16(src)
#define CP32(dst,src) M32(dst) = M32(src)
#define CP64(dst,src) M64(dst) = M64(src)
#define CP128(dst,src) M128(dst) = M128(src)
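/* Usage sketch: M32(p) reinterprets p as one aligned 32-bit word, so
 * "M32( bs ) = 0x03030303;" stores four strength-3 bytes at once (as
 * the deblocker does for intra edges), and CP32(dst,src) copies four
 * bytes in a single load/store without a memcpy call. */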

#if HIGH_BIT_DEPTH
    typedef uint16_t pixel;
    typedef uint64_t pixel4;
    typedef int32_t  dctcoef;
    typedef uint32_t udctcoef;

#   define PIXEL_SPLAT_X4(x) ((x)*0x0001000100010001ULL)
#   define MPIXEL_X4(src) M64(src)
#else
    typedef uint8_t  pixel;
    typedef uint32_t pixel4;
    typedef int16_t  dctcoef;
    typedef uint16_t udctcoef;

#   define PIXEL_SPLAT_X4(x) ((x)*0x01010101U)
#   define MPIXEL_X4(src) M32(src)
#endif

#define BIT_DEPTH X264_BIT_DEPTH

#define CPPIXEL_X4(dst,src) MPIXEL_X4(dst) = MPIXEL_X4(src)

#define X264_SCAN8_LUMA_SIZE (5*8)
#define X264_SCAN8_SIZE (X264_SCAN8_LUMA_SIZE*3)
#define X264_SCAN8_0 (4+1*8)

/* Scan8 organization:
 *    0 1 2 3 4 5 6 7
 * 0  DY    y y y y y
 * 1        y Y Y Y Y
 * 2        y Y Y Y Y
 * 3        y Y Y Y Y
 * 4        y Y Y Y Y
 * 5  DU    u u u u u
 * 6        u U U U U
 * 7        u U U U U
 * 8        u U U U U
 * 9        u U U U U
 * 10 DV    v v v v v
 * 11       v V V V V
 * 12       v V V V V
 * 13       v V V V V
 * 14       v V V V V
 * DY/DU/DV are for luma/chroma DC.
 */

#define LUMA_DC   48
#define CHROMA_DC 49

static const uint8_t x264_scan8[16*3 + 3] =
{
    4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8,
    6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8,
    4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8,
    6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8,
    4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8,
    6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8,
    4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8,
    6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8,
    4+11*8, 5+11*8, 4+12*8, 5+12*8,
    6+11*8, 7+11*8, 6+12*8, 7+12*8,
    4+13*8, 5+13*8, 4+14*8, 5+14*8,
    6+13*8, 7+13*8, 6+14*8, 7+14*8,
    0+ 0*8, 0+ 5*8, 0+10*8
};
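/* Example: x264_scan8[0] == 4+1*8 == X264_SCAN8_0 is the top-left 4x4
 * luma block; within this layout a block's left neighbour is at index
 * -1 and its top neighbour at index -8, so neighbouring-MB context can
 * be cached in the border row/column of each plane. */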

#include "x264.h"
#include "bitstream.h"
#include "set.h"
#include "predict.h"
#include "pixel.h"
#include "mc.h"
#include "frame.h"
#include "dct.h"
#include "cabac.h"
#include "quant.h"
#include "cpu.h"
#include "threadpool.h"

/****************************************************************************
 * General functions
 ****************************************************************************/
/* x264_malloc: performs (or emulates) an aligned allocation, like memalign.
 * Buffers allocated with x264_malloc must be freed with x264_free. */
void *x264_malloc( int );
void  x264_free( void * );

/* x264_slurp_file: malloc space for the whole file and read it */
char *x264_slurp_file( const char *filename );

/* mdate: return the current date in microseconds */
int64_t x264_mdate( void );

/* x264_param2string: return a (malloced) string containing most of
 * the encoding options */
char *x264_param2string( x264_param_t *p, int b_res );

/* log */
void x264_log( x264_t *h, int i_level, const char *psz_fmt, ... );

void x264_reduce_fraction( uint32_t *n, uint32_t *d );
void x264_reduce_fraction64( uint64_t *n, uint64_t *d );
void x264_cavlc_init( void );
void x264_cabac_init( x264_t *h );

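/* Branchless clamp to [0,PIXEL_MAX]: x & ~PIXEL_MAX is nonzero iff x is
 * out of range; (-x)>>31 is then 0 for negative x (clamping to 0) and
 * all-ones for x > PIXEL_MAX (clamping to PIXEL_MAX). Relies on
 * arithmetic right shift of negative values. */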
static ALWAYS_INLINE pixel x264_clip_pixel( int x )
{
    return ( (x & ~PIXEL_MAX) ? (-x)>>31 & PIXEL_MAX : x );
}

static ALWAYS_INLINE int x264_clip3( int v, int i_min, int i_max )
{
    return ( (v < i_min) ? i_min : (v > i_max) ? i_max : v );
}

static ALWAYS_INLINE double x264_clip3f( double v, double f_min, double f_max )
{
    return ( (v < f_min) ? f_min : (v > f_max) ? f_max : v );
}

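/* Branchless median of three: the first exchange makes a = max(a,b) and
 * b = min(a,b), then b is raised to max(b,c) and finally lowered to
 * min(a,b), leaving the median in b. */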
static ALWAYS_INLINE int x264_median( int a, int b, int c )
{
    int t = (a-b)&((a-b)>>31);
    a -= t;
    b += t;
    b -= (b-c)&((b-c)>>31);
    b += (a-b)&((a-b)>>31);
    return b;
}

static ALWAYS_INLINE void x264_median_mv( int16_t *dst, int16_t *a, int16_t *b, int16_t *c )
{
    dst[0] = x264_median( a[0], b[0], c[0] );
    dst[1] = x264_median( a[1], b[1], c[1] );
}

static ALWAYS_INLINE int x264_predictor_difference( int16_t (*mvc)[2], intptr_t i_mvc )
{
    int sum = 0;
    for( int i = 0; i < i_mvc-1; i++ )
    {
        sum += abs( mvc[i][0] - mvc[i+1][0] )
             + abs( mvc[i][1] - mvc[i+1][1] );
    }
    return sum;
}

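/* Buckets the summed left+top MVD magnitudes into the CABAC context
 * increments 0 (<=2), 1 (3..32) or 2 (>32), packing the x bucket into
 * the low byte and the y bucket into bits 8-15. */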
static ALWAYS_INLINE uint16_t x264_cabac_mvd_sum( uint8_t *mvdleft, uint8_t *mvdtop )
{
    int amvd0 = abs(mvdleft[0]) + abs(mvdtop[0]);
    int amvd1 = abs(mvdleft[1]) + abs(mvdtop[1]);
    amvd0 = (amvd0 > 2) + (amvd0 > 32);
    amvd1 = (amvd1 > 2) + (amvd1 > 32);
    return amvd0 + (amvd1<<8);
}

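/* Round qpel MV predictors to fullpel ((mv+2)>>2) and clip them into
 * the legal fullpel motion search range. */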
static void ALWAYS_INLINE x264_predictor_roundclip( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int mv_x_min, int mv_x_max, int mv_y_min, int mv_y_max )
{
    for( int i = 0; i < i_mvc; i++ )
    {
        int mx = (mvc[i][0] + 2) >> 2;
        int my = (mvc[i][1] + 2) >> 2;
        dst[i][0] = x264_clip3( mx, mv_x_min, mv_x_max );
        dst[i][1] = x264_clip3( my, mv_y_min, mv_y_max );
    }
}

extern const uint8_t x264_exp2_lut[64];
extern const float x264_log2_lut[128];
extern const float x264_log2_lz_lut[32];

/* Not a general-purpose function; multiplies input by -1/6 to convert
 * qp to qscale. */
static ALWAYS_INLINE int x264_exp2fix8( float x )
{
    int i = x*(-64.f/6.f) + 512.5f;
    if( i < 0 ) return 0;
    if( i > 1023 ) return 0xffff;
    return (x264_exp2_lut[i&63]+256) << (i>>6) >> 8;
}
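/* e.g. x264_exp2fix8(0) == 256 (1.0 in 8.8 fixed point); each +6.0 of
 * input halves the result, and outputs saturate to [0,0xffff]. */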

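/* Fast log2 approximation: normalize x by its leading-zero count, then
 * add a fractional lookup on the top mantissa bits to the integer part
 * implied by the normalization. */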
static ALWAYS_INLINE float x264_log2( uint32_t x )
{
    int lz = x264_clz( x );
    return x264_log2_lut[(x<<lz>>24)&0x7f] + x264_log2_lz_lut[lz];
}

/****************************************************************************
 *
 ****************************************************************************/
enum slice_type_e
{
    SLICE_TYPE_P  = 0,
    SLICE_TYPE_B  = 1,
    SLICE_TYPE_I  = 2,
};

static const char slice_type_to_char[] = { 'P', 'B', 'I' };

enum sei_payload_type_e
{
    SEI_BUFFERING_PERIOD       = 0,
    SEI_PIC_TIMING             = 1,
    SEI_PAN_SCAN_RECT          = 2,
    SEI_FILLER                 = 3,
    SEI_USER_DATA_REGISTERED   = 4,
    SEI_USER_DATA_UNREGISTERED = 5,
    SEI_RECOVERY_POINT         = 6,
    SEI_DEC_REF_PIC_MARKING    = 7,
    SEI_FRAME_PACKING          = 45,
};

typedef struct
{
    x264_sps_t *sps;
    x264_pps_t *pps;

    int i_type;
    int i_first_mb;
    int i_last_mb;

    int i_pps_id;

    int i_frame_num;

    int b_mbaff;
    int b_field_pic;
    int b_bottom_field;

    int i_idr_pic_id;   /* -1 if nal_type != 5 */

    int i_poc;
    int i_delta_poc_bottom;

    int i_delta_poc[2];
    int i_redundant_pic_cnt;

    int b_direct_spatial_mv_pred;

    int b_num_ref_idx_override;
    int i_num_ref_idx_l0_active;
    int i_num_ref_idx_l1_active;

    int b_ref_pic_list_reordering[2];
    struct
    {
        int idc;
        int arg;
    } ref_pic_list_order[2][X264_REF_MAX];

    /* P-frame weighting */
    x264_weight_t weight[X264_REF_MAX*2][3];

    int i_mmco_remove_from_end;
    int i_mmco_command_count;
    struct /* struct for future expansion */
    {
        int i_difference_of_pic_nums;
        int i_poc;
    } mmco[X264_REF_MAX];

    int i_cabac_init_idc;

    int i_qp;
    int i_qp_delta;
    int b_sp_for_swidth;
    int i_qs_delta;

    /* deblocking filter */
    int i_disable_deblocking_filter_idc;
    int i_alpha_c0_offset;
    int i_beta_offset;

} x264_slice_header_t;

typedef struct x264_lookahead_t
{
    volatile uint8_t              b_exit_thread;
    uint8_t                       b_thread_active;
    uint8_t                       b_analyse_keyframe;
    int                           i_last_keyframe;
    int                           i_slicetype_length;
    x264_frame_t                  *last_nonb;
    x264_pthread_t                thread_handle;
    x264_sync_frame_list_t        ifbuf;
    x264_sync_frame_list_t        next;
    x264_sync_frame_list_t        ofbuf;
} x264_lookahead_t;

typedef struct x264_ratecontrol_t   x264_ratecontrol_t;

typedef struct x264_left_table_t
{
    uint8_t intra[4];
    uint8_t nnz[4];
    uint8_t nnz_chroma[4];
    uint8_t mv[4];
    uint8_t ref[4];
} x264_left_table_t;

struct x264_t
{
    /* encoder parameters */
    x264_param_t    param;

    x264_t          *thread[X264_THREAD_MAX+1];
    int             b_thread_active;
    int             i_thread_phase; /* which thread to use for the next frame */
    int             i_threadslice_start; /* first row in this thread slice */
    int             i_threadslice_end; /* row after the end of this thread slice */
    x264_threadpool_t *threadpool;

    /* bitstream output */
    struct
    {
        int         i_nal;
        int         i_nals_allocated;
        x264_nal_t  *nal;
        int         i_bitstream;    /* size of p_bitstream */
        uint8_t     *p_bitstream;   /* will hold data for all nal */
        bs_t        bs;
    } out;

    uint8_t *nal_buffer;
    int      nal_buffer_size;

    /**** thread synchronization starts here ****/

    /* frame number/poc */
    int             i_frame;
    int             i_frame_num;

    int             i_thread_frames; /* Number of different frames being encoded by threads;
                                      * 1 when sliced-threads is on. */
    int             i_nal_type;
    int             i_nal_ref_idc;

    int64_t         i_disp_fields;  /* Number of displayed fields (both coded and implied via pic_struct) */
    int             i_disp_fields_last_frame;
    int64_t         i_prev_duration; /* Duration of previous frame */
    int64_t         i_coded_fields; /* Number of coded fields (both coded and implied via pic_struct) */
    int64_t         i_cpb_delay;    /* Equal to number of fields preceding this field
                                     * since last buffering_period SEI */
    int64_t         i_coded_fields_lookahead; /* Use separate counters for lookahead */
    int64_t         i_cpb_delay_lookahead;

    int64_t         i_cpb_delay_pir_offset;

    int             b_queued_intra_refresh;
    int64_t         i_last_idr_pts;

    int             i_idr_pic_id;

    /* quantization matrix for decoding, [cqm][qp%6][coef] */
    int             (*dequant4_mf[4])[16];   /* [4][6][16] */
    int             (*dequant8_mf[4])[64];   /* [4][6][64] */
    /* quantization matrix for trellis, [cqm][qp][coef] */
    int             (*unquant4_mf[4])[16];   /* [4][52][16] */
    int             (*unquant8_mf[4])[64];   /* [4][52][64] */
    /* quantization matrix for deadzone */
    udctcoef        (*quant4_mf[4])[16];     /* [4][52][16] */
    udctcoef        (*quant8_mf[4])[64];     /* [4][52][64] */
    udctcoef        (*quant4_bias[4])[16];   /* [4][52][16] */
    udctcoef        (*quant8_bias[4])[64];   /* [4][52][64] */
    udctcoef        (*nr_offset_emergency)[4][64];

    /* mv/ref cost arrays. */
    uint16_t *cost_mv[QP_MAX+1];
    uint16_t *cost_mv_fpel[QP_MAX+1][4];

    const uint8_t   *chroma_qp_table; /* includes both the nonlinear luma->chroma mapping and chroma_qp_offset */

    /* Slice header */
    x264_slice_header_t sh;

    /* SPS / PPS */
    x264_sps_t      sps[1];
    x264_pps_t      pps[1];

    /* Slice header backup, for SEI_DEC_REF_PIC_MARKING */
    int b_sh_backup;
    x264_slice_header_t sh_backup;

    /* cabac context */
    x264_cabac_t    cabac;

    struct
    {
        /* Frames to be encoded (whose types have been decided) */
        x264_frame_t **current;
        /* Unused frames: 0 = fenc, 1 = fdec */
        x264_frame_t **unused[2];

        /* Unused blank frames (for duplicates) */
        x264_frame_t **blank_unused;

        /* frames used for reference + sentinels */
        x264_frame_t *reference[X264_REF_MAX+2];

        int i_last_keyframe;       /* Frame number of the last keyframe */
        int i_last_idr;            /* Frame number of the last IDR (not RP)*/
        int i_poc_last_open_gop;   /* Poc of the I frame of the last open-gop. The value
                                    * is only assigned during the period between that
                                    * I frame and the next P or I frame, else -1 */

        int i_input;    /* Number of input frames already accepted */

        int i_max_dpb;  /* Number of frames allocated in the decoded picture buffer */
        int i_max_ref0;
        int i_max_ref1;
        int i_delay;    /* Number of frames buffered for B reordering */
        int     i_bframe_delay;
        int64_t i_bframe_delay_time;
        int64_t i_first_pts;
        int64_t i_prev_reordered_pts[2];
        int64_t i_largest_pts;
        int64_t i_second_largest_pts;
        int b_have_lowres;  /* Whether 1/2 resolution luma planes are being used */
        int b_have_sub8x8_esa;
    } frames;

    /* current frame being encoded */
    x264_frame_t    *fenc;

    /* frame being reconstructed */
    x264_frame_t    *fdec;

    /* references lists */
    int             i_ref[2];
    x264_frame_t    *fref[2][X264_REF_MAX+3];
    x264_frame_t    *fref_nearest[2];
    int             b_ref_reorder[2];

    /* hrd */
    int initial_cpb_removal_delay;
    int initial_cpb_removal_delay_offset;
    int64_t i_reordered_pts_delay;

    /* Current MB DCT coeffs */
    struct
    {
        ALIGNED_16( dctcoef luma16x16_dc[3][16] );
        ALIGNED_16( dctcoef chroma_dc[2][8] );
        // FIXME share memory?
        ALIGNED_16( dctcoef luma8x8[12][64] );
        ALIGNED_16( dctcoef luma4x4[16*3][16] );
    } dct;

    /* MB table and cache for current frame/mb */
    struct
    {
        int     i_mb_width;
        int     i_mb_height;
        int     i_mb_count;                 /* number of mbs in a frame */

        /* Chroma subsampling */
        int     chroma_h_shift;
        int     chroma_v_shift;

        /* Strides */
        int     i_mb_stride;
        int     i_b8_stride;
        int     i_b4_stride;
        int     left_b8[2];
        int     left_b4[2];

        /* Current index */
        int     i_mb_x;
        int     i_mb_y;
        int     i_mb_xy;
        int     i_b8_xy;
        int     i_b4_xy;

        /* Search parameters */
        int     i_me_method;
        int     i_subpel_refine;
        int     b_chroma_me;
        int     b_trellis;
        int     b_noise_reduction;
        int     b_dct_decimate;
        int     i_psy_rd; /* Psy RD strength--fixed point value*/
        int     i_psy_trellis; /* Psy trellis strength--fixed point value*/

        int     b_interlaced;
        int     b_adaptive_mbaff; /* MBAFF+subme 0 requires non-adaptive MBAFF i.e. all field mbs */

        /* Allowed qpel MV range to stay within the picture + emulated edge pixels */
        int     mv_min[2];
        int     mv_max[2];
        int     mv_miny_row[3]; /* 0 == top progressive, 1 == bot progressive, 2 == interlaced */
        int     mv_maxy_row[3];
        /* Subpel MV range for motion search.
         * same mv_min/max but includes levels' i_mv_range. */
        int     mv_min_spel[2];
        int     mv_max_spel[2];
        int     mv_miny_spel_row[3];
        int     mv_maxy_spel_row[3];
        /* Fullpel MV range for motion search */
        int     mv_min_fpel[2];
        int     mv_max_fpel[2];
        int     mv_miny_fpel_row[3];
        int     mv_maxy_fpel_row[3];

        /* neighboring MBs */
        unsigned int i_neighbour;
        unsigned int i_neighbour8[4];       /* neighbours of each 8x8 or 4x4 block that are available */
        unsigned int i_neighbour4[16];      /* at the time the block is coded */
        unsigned int i_neighbour_intra;     /* for constrained intra pred */
        unsigned int i_neighbour_frame;     /* ignoring slice boundaries */
        int     i_mb_type_top;
        int     i_mb_type_left[2];
        int     i_mb_type_topleft;
        int     i_mb_type_topright;
        int     i_mb_prev_xy;
        int     i_mb_left_xy[2];
        int     i_mb_top_xy;
        int     i_mb_topleft_xy;
        int     i_mb_topright_xy;
        int     i_mb_top_y;
        int     i_mb_topleft_y;
        int     i_mb_topright_y;
        const x264_left_table_t *left_index_table;
        int     i_mb_top_mbpair_xy;
        int     topleft_partition;
        int     b_allow_skip;
        int     field_decoding_flag;

        /**** thread synchronization ends here ****/
        /* subsequent variables are either thread-local or constant,
         * and won't be copied from one thread to another */

        /* mb table */
        int8_t  *type;                      /* mb type */
        uint8_t *partition;                 /* mb partition */
        int8_t  *qp;                        /* mb qp */
        int16_t *cbp;                       /* mb cbp: 0x0?: luma, 0x?0: chroma, 0x100: luma dc, 0x0200 and 0x0400: chroma dc  (all set for PCM)*/
        int8_t  (*intra4x4_pred_mode)[8];   /* intra4x4 pred mode. for non I4x4 set to I_PRED_4x4_DC(2) */
                                            /* actually has only 7 entries; set to 8 for write-combining optimizations */
        uint8_t (*non_zero_count)[16*3];    /* nzc. for I_PCM set to 16 */
        int8_t  *chroma_pred_mode;          /* chroma_pred_mode. cabac only. for non intra I_PRED_CHROMA_DC(0) */
        int16_t (*mv[2])[2];                /* mb mv. set to 0 for intra mb */
        uint8_t (*mvd[2])[8][2];            /* absolute value of mb mv difference with predict, clipped to [0,33]. set to 0 if intra. cabac only */
        int8_t   *ref[2];                   /* mb ref. set to -1 if non used (intra or Lx only) */
        int16_t (*mvr[2][X264_REF_MAX*2])[2];/* 16x16 mv for each possible ref */
        int8_t  *skipbp;                    /* block pattern for SKIP or DIRECT (sub)mbs. B-frames + cabac only */
        int8_t  *mb_transform_size;         /* transform_size_8x8_flag of each mb */
        uint16_t *slice_table;              /* sh->first_mb of the slice that the indexed mb is part of
                                             * NOTE: this will fail on resolutions above 2^16 MBs... */
        uint8_t *field;

         /* buffer for weighted versions of the reference frames */
        pixel *p_weight_buf[X264_REF_MAX];

        /* current value */
        int     i_type;
        int     i_partition;
        ALIGNED_4( uint8_t i_sub_partition[4] );
        int     b_transform_8x8;

        int     i_cbp_luma;
        int     i_cbp_chroma;

        int     i_intra16x16_pred_mode;
        int     i_chroma_pred_mode;

        /* skip flags for i4x4 and i8x8
         * 0 = encode as normal.
         * 1 (non-RD only) = the DCT is still in h->dct, restore fdec and skip reconstruction.
         * 2 (RD only) = the DCT has since been overwritten by RD; restore that too. */
        int i_skip_intra;
        /* skip flag for motion compensation */
        /* if we've already done MC, we don't need to do it again */
        int b_skip_mc;
        /* set to true if we are re-encoding a macroblock. */
        int b_reencode_mb;
        int ip_offset; /* Used by PIR to offset the quantizer of intra-refresh blocks. */
        int b_deblock_rdo;
        int b_overflow; /* If CAVLC had a level code overflow during bitstream writing. */

        struct
        {
            /* space for p_fenc and p_fdec */
#define FENC_STRIDE 16
#define FDEC_STRIDE 32
            ALIGNED_16( pixel fenc_buf[48*FENC_STRIDE] );
            ALIGNED_16( pixel fdec_buf[52*FDEC_STRIDE] );

            /* i4x4 and i8x8 backup data, for skipping the encode stage when possible */
            ALIGNED_16( pixel i4x4_fdec_buf[16*16] );
            ALIGNED_16( pixel i8x8_fdec_buf[16*16] );
            ALIGNED_16( dctcoef i8x8_dct_buf[3][64] );
            ALIGNED_16( dctcoef i4x4_dct_buf[15][16] );
            uint32_t i4x4_nnz_buf[4];
            uint32_t i8x8_nnz_buf[4];
            int i4x4_cbp;
            int i8x8_cbp;

            /* Psy trellis DCT data */
            ALIGNED_16( dctcoef fenc_dct8[4][64] );
            ALIGNED_16( dctcoef fenc_dct4[16][16] );

            /* Psy RD SATD/SA8D scores cache */
            ALIGNED_16( uint64_t fenc_hadamard_cache[9] );
            ALIGNED_16( uint32_t fenc_satd_cache[32] );

            /* pointer over mb of the frame to be compressed */
            pixel *p_fenc[3]; /* y,u,v */
            /* pointer to the actual source frame, not a block copy */
            pixel *p_fenc_plane[3];

            /* pointer over mb of the frame to be reconstructed  */
            pixel *p_fdec[3];

            /* pointer over mb of the references */
            int i_fref[2];
            /* [12]: yN, yH, yV, yHV, (NV12 ? uv : I444 ? (uN, uH, uV, uHV, vN, ...)) */
            pixel *p_fref[2][X264_REF_MAX*2][12];
            pixel *p_fref_w[X264_REF_MAX*2];  /* weighted fullpel luma */
            uint16_t *p_integral[2][X264_REF_MAX];

            /* fref stride */
            int     i_stride[3];
        } pic;

        /* cache */
        struct
        {
            /* real intra4x4_pred_mode if I_4X4 or I_8X8, I_PRED_4x4_DC if mb available, -1 if not */
            ALIGNED_8( int8_t intra4x4_pred_mode[X264_SCAN8_LUMA_SIZE] );

            /* i_non_zero_count if available else 0x80 */
            ALIGNED_16( uint8_t non_zero_count[X264_SCAN8_SIZE] );

            /* -1 if unused, -2 if unavailable */
            ALIGNED_4( int8_t ref[2][X264_SCAN8_LUMA_SIZE] );

            /* 0 if not available */
            ALIGNED_16( int16_t mv[2][X264_SCAN8_LUMA_SIZE][2] );
            ALIGNED_8( uint8_t mvd[2][X264_SCAN8_LUMA_SIZE][2] );

            /* 1 if SKIP or DIRECT. set only for B-frames + CABAC */
            ALIGNED_4( int8_t skip[X264_SCAN8_LUMA_SIZE] );

            ALIGNED_4( int16_t direct_mv[2][4][2] );
            ALIGNED_4( int8_t  direct_ref[2][4] );
            int     direct_partition;
            ALIGNED_4( int16_t pskip_mv[2] );

            /* number of neighbors (top and left) that used 8x8 dct */
            int     i_neighbour_transform_size;
            int     i_neighbour_skip;

            /* neighbor CBPs */
            int     i_cbp_top;
            int     i_cbp_left;

            /* extra data required for mbaff in mv prediction */
            int16_t topright_mv[2][3][2];
            int8_t  topright_ref[2][3];
        } cache;

        /* */
        int     i_qp;       /* current qp */
        int     i_chroma_qp;
        int     i_last_qp;  /* last qp */
        int     i_last_dqp; /* last delta qp */
        int     b_variable_qp; /* whether qp is allowed to vary per macroblock */
        int     b_lossless;
        int     b_direct_auto_read; /* take stats for --direct auto from the 2pass log */
        int     b_direct_auto_write; /* analyse direct modes, to use and/or save */

        /* lambda values */
        int     i_trellis_lambda2[2][2]; /* [luma,chroma][inter,intra] */
        int     i_psy_rd_lambda;
        int     i_chroma_lambda2_offset;

        /* B_direct and weighted prediction */
        int16_t dist_scale_factor_buf[2][2][X264_REF_MAX*2][4];
        int16_t (*dist_scale_factor)[4];
        int8_t bipred_weight_buf[2][2][X264_REF_MAX*2][4];
        int8_t (*bipred_weight)[4];
        /* maps fref1[0]'s ref indices into the current list0 */
#define map_col_to_list0(col) h->mb.map_col_to_list0[(col)+2]
        int8_t  map_col_to_list0[X264_REF_MAX+2];
        int ref_blind_dupe; /* The index of the blind reference frame duplicate. */
        int8_t deblock_ref_table[X264_REF_MAX*2+2];
#define deblock_ref_table(x) h->mb.deblock_ref_table[(x)+2]
    } mb;

    /* rate control encoding only */
    x264_ratecontrol_t *rc;

    /* stats */
    struct
    {
        /* Current frame stats */
        struct
        {
            /* MV bits (MV+Ref+Block Type) */
            int i_mv_bits;
            /* Texture bits (DCT coefs) */
            int i_tex_bits;
            /* Remaining bits (headers, padding, etc.) */
            int i_misc_bits;
            /* MB type counts */
            int i_mb_count[19];
            int i_mb_count_i;
            int i_mb_count_p;
            int i_mb_count_skip;
            int i_mb_count_8x8dct[2];
            int i_mb_count_ref[2][X264_REF_MAX*2];
            int i_mb_partition[17];
            int i_mb_cbp[6];
            int i_mb_pred_mode[4][13];
            int i_mb_field[3];
            /* Adaptive direct mv pred */
            int i_direct_score[2];
            /* Metrics */
            int64_t i_ssd[3];
            double f_ssim;
            int i_ssim_cnt;
        } frame;

        /* Cumulated stats */

        /* per slice info */
        int     i_frame_count[3];
        int64_t i_frame_size[3];
        double  f_frame_qp[3];
        int     i_consecutive_bframes[X264_BFRAME_MAX+1];
        /* */
        double  f_ssd_global[3];
        double  f_psnr_average[3];
        double  f_psnr_mean_y[3];
        double  f_psnr_mean_u[3];
        double  f_psnr_mean_v[3];
        double  f_ssim_mean_y[3];
        double  f_frame_duration[3];
        /* */
        int64_t i_mb_count[3][19];
        int64_t i_mb_partition[2][17];
        int64_t i_mb_count_8x8dct[2];
        int64_t i_mb_count_ref[2][2][X264_REF_MAX*2];
        int64_t i_mb_cbp[6];
        int64_t i_mb_pred_mode[4][13];
        int64_t i_mb_field[3];
        /* */
        int     i_direct_score[2];
        int     i_direct_frames[2];
        /* num p-frames weighted */
        int     i_wpred[2];

    } stat;

    /* 0 = luma 4x4, 1 = luma 8x8, 2 = chroma 4x4, 3 = chroma 8x8 */
    udctcoef (*nr_offset)[64];
    uint32_t (*nr_residual_sum)[64];
    uint32_t *nr_count;

    ALIGNED_16( udctcoef nr_offset_denoise[4][64] );
    ALIGNED_16( uint32_t nr_residual_sum_buf[2][4][64] );
    uint32_t nr_count_buf[2][4];

    uint8_t luma2chroma_pixel[7]; /* Subsampled pixel size */

    /* Buffers that are allocated per-thread even in sliced threads. */
    void *scratch_buffer; /* for any temporary storage that doesn't want repeated malloc */
    pixel *intra_border_backup[5][3]; /* bottom pixels of the previous mb row, used for intra prediction after the framebuffer has been deblocked */
    /* Deblock strength values are stored for each 4x4 partition. In MBAFF
     * there are four extra values that need to be stored, located in [4][i]. */
    uint8_t (*deblock_strength[2])[2][8][4];

    /* CPU-dependent function pointers */
    x264_predict_t      predict_16x16[4+3];
    x264_predict8x8_t   predict_8x8[9+3];
    x264_predict_t      predict_4x4[9+3];
    x264_predict_t      predict_chroma[4+3];
    x264_predict_t      predict_8x8c[4+3];
    x264_predict_t      predict_8x16c[4+3];
    x264_predict_8x8_filter_t predict_8x8_filter;

    x264_pixel_function_t pixf;
    x264_mc_functions_t   mc;
    x264_dct_function_t   dctf;
    x264_zigzag_function_t zigzagf;
    x264_zigzag_function_t zigzagf_interlaced;
    x264_zigzag_function_t zigzagf_progressive;
    x264_quant_function_t quantf;
    x264_deblock_function_t loopf;
    x264_bitstream_function_t bsf;

#if HAVE_VISUALIZE
    struct visualize_t *visualize;
#endif
    x264_lookahead_t *lookahead;
};

// included at the end because it needs x264_t
#include "macroblock.h"

#if ARCH_X86 || ARCH_X86_64
#include "x86/util.h"
#endif

#include "rectangle.h"

#endif

x264-snapshot-20120103-2245-stable/common/x86/0000755000175000001440000000000011700673342017606 5ustar  videolanusersx264-snapshot-20120103-2245-stable/common/x86/x86inc.asm0000644000175000001440000006346711700673342021447 0ustar  videolanusers;*****************************************************************************
;* x86inc.asm: x264asm abstraction layer
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Anton Mitrofanov <BugMaster@narod.ru>
;*          Jason Garrett-Glaser <darkshikari@gmail.com>
;*
;* Permission to use, copy, modify, and/or distribute this software for any
;* purpose with or without fee is hereby granted, provided that the above
;* copyright notice and this permission notice appear in all copies.
;*
;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
;*****************************************************************************

; This is a header file for the x264asm assembly language, which uses
; NASM/YASM syntax combined with a large number of macros to provide easy
; abstraction between different calling conventions (x86_32, win64, linux64).
; It also has various other useful features to simplify writing the kind of
; DSP functions that are most often used in x264.

; Unlike the rest of x264, this file is available under an ISC license, as it
; has significant usefulness outside of x264 and we want it to be available
; to the largest audience possible.  Of course, if you modify it for your own
; purposes to add a new feature, we strongly encourage contributing a patch
; as this feature might be useful for others as well.  Send patches or ideas
; to x264-devel@videolan.org .

%define program_name x264

%ifdef ARCH_X86_64
    %ifidn __OUTPUT_FORMAT__,win32
        %define WIN64
    %else
        %define UNIX64
    %endif
%endif

%ifdef PREFIX
    %define mangle(x) _ %+ x
%else
    %define mangle(x) x
%endif

; FIXME: All of the 64bit asm functions that take a stride as an argument
; via register, assume that the high dword of that register is filled with 0.
; This is true in practice (since we never do any 64bit arithmetic on strides,
; and x264's strides are all positive), but is not guaranteed by the ABI.

; Name of the .rodata section.
; Kludge: Something on OS X fails to align .rodata even given an align attribute,
; so use a different read-only section.
%macro SECTION_RODATA 0-1 16
    %ifidn __OUTPUT_FORMAT__,macho64
        SECTION .text align=%1
    %elifidn __OUTPUT_FORMAT__,macho
        SECTION .text align=%1
        fakegot:
    %elifidn __OUTPUT_FORMAT__,aout
        section .text
    %else
        SECTION .rodata align=%1
    %endif
%endmacro

; aout does not support align=
%macro SECTION_TEXT 0-1 16
    %ifidn __OUTPUT_FORMAT__,aout
        SECTION .text
    %else
        SECTION .text align=%1
    %endif
%endmacro

%ifdef WIN64
    %define PIC
%elifndef ARCH_X86_64
; x86_32 doesn't require PIC.
; Some distros prefer shared objects to be PIC, but nothing breaks if
; the code contains a few textrels, so we'll skip that complexity.
    %undef PIC
%endif
%ifdef PIC
    default rel
%endif

; Macros to eliminate most code duplication between x86_32 and x86_64:
; Currently this works only for leaf functions which load all their arguments
; into registers at the start, and make no other use of the stack. Luckily that
; covers most of x264's asm.

; PROLOGUE:
; %1 = number of arguments. loads them from stack if needed.
; %2 = number of registers used. pushes callee-saved regs if needed.
; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
; %4 = list of names to define to registers
; PROLOGUE can also be invoked by adding the same options to cglobal

; e.g.
; cglobal foo, 2,3,0, dst, src, tmp
; declares a function (foo), taking two args (dst and src) and one local variable (tmp)

; TODO Some functions can use some args directly from the stack. If they're the
; last args then you can just not declare them, but if they're in the middle
; we need a more flexible macro.

; RET:
; Pops anything that was pushed by PROLOGUE, and returns.

; REP_RET:
; Same, but if it doesn't pop anything it becomes a 2-byte ret, for Athlons,
; which are slow when a normal ret follows a branch.

; registers:
; rN and rNq are the native-size register holding function argument N
; rNd, rNw, rNb are dword, word, and byte size
; rNm is the original location of arg N (a register or on the stack), dword
; rNmp is native size
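;
; e.g. on WIN64 r0/r0d/r0w/r0b name rcx/ecx/cx/cl, while on x86_32 the
; same names map to eax/eax/ax/al and r0m is [esp + stack_offset + 4],
; so one function body serves all three ABIs.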

%macro DECLARE_REG 6
    %define r%1q %2
    %define r%1d %3
    %define r%1w %4
    %define r%1b %5
    %define r%1m %6
    %ifid %6 ; i.e. it's a register
        %define r%1mp %2
    %elifdef ARCH_X86_64 ; memory
        %define r%1mp qword %6
    %else
        %define r%1mp dword %6
    %endif
    %define r%1  %2
%endmacro

%macro DECLARE_REG_SIZE 2
    %define r%1q r%1
    %define e%1q r%1
    %define r%1d e%1
    %define e%1d e%1
    %define r%1w %1
    %define e%1w %1
    %define r%1b %2
    %define e%1b %2
%ifndef ARCH_X86_64
    %define r%1  e%1
%endif
%endmacro

DECLARE_REG_SIZE ax, al
DECLARE_REG_SIZE bx, bl
DECLARE_REG_SIZE cx, cl
DECLARE_REG_SIZE dx, dl
DECLARE_REG_SIZE si, sil
DECLARE_REG_SIZE di, dil
DECLARE_REG_SIZE bp, bpl

; t# defines for when per-arch register allocation is more complex than just function arguments

%macro DECLARE_REG_TMP 1-*
    %assign %%i 0
    %rep %0
        CAT_XDEFINE t, %%i, r%1
        %assign %%i %%i+1
        %rotate 1
    %endrep
%endmacro

%macro DECLARE_REG_TMP_SIZE 0-*
    %rep %0
        %define t%1q t%1 %+ q
        %define t%1d t%1 %+ d
        %define t%1w t%1 %+ w
        %define t%1b t%1 %+ b
        %rotate 1
    %endrep
%endmacro

DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9

%ifdef ARCH_X86_64
    %define gprsize 8
%else
    %define gprsize 4
%endif

%macro PUSH 1
    push %1
    %assign stack_offset stack_offset+gprsize
%endmacro

%macro POP 1
    pop %1
    %assign stack_offset stack_offset-gprsize
%endmacro

%macro SUB 2
    sub %1, %2
    %ifidn %1, rsp
        %assign stack_offset stack_offset+(%2)
    %endif
%endmacro

%macro ADD 2
    add %1, %2
    %ifidn %1, rsp
        %assign stack_offset stack_offset-(%2)
    %endif
%endmacro
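; PUSH/POP/SUB/ADD mirror every rsp adjustment into stack_offset so that
; the rNm stack-argument addresses defined above remain valid after the
; prologue pushes or spills.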

%macro movifnidn 2
    %ifnidn %1, %2
        mov %1, %2
    %endif
%endmacro

%macro movsxdifnidn 2
    %ifnidn %1, %2
        movsxd %1, %2
    %endif
%endmacro

%macro ASSERT 1
    %if (%1) == 0
        %error assert failed
    %endif
%endmacro

%macro DEFINE_ARGS 0-*
    %ifdef n_arg_names
        %assign %%i 0
        %rep n_arg_names
            CAT_UNDEF arg_name %+ %%i, q
            CAT_UNDEF arg_name %+ %%i, d
            CAT_UNDEF arg_name %+ %%i, w
            CAT_UNDEF arg_name %+ %%i, b
            CAT_UNDEF arg_name %+ %%i, m
            CAT_UNDEF arg_name, %%i
            %assign %%i %%i+1
        %endrep
    %endif

    %assign %%i 0
    %rep %0
        %xdefine %1q r %+ %%i %+ q
        %xdefine %1d r %+ %%i %+ d
        %xdefine %1w r %+ %%i %+ w
        %xdefine %1b r %+ %%i %+ b
        %xdefine %1m r %+ %%i %+ m
        CAT_XDEFINE arg_name, %%i, %1
        %assign %%i %%i+1
        %rotate 1
    %endrep
    %assign n_arg_names %%i
%endmacro
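; e.g. "DEFINE_ARGS dst, src, len" makes dstq/dstd/dstw/dstb/dstm the
; size variants of r0, srcq etc. of r1, and lenq etc. of r2.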

%ifdef WIN64 ; Windows x64 ;=================================================

DECLARE_REG 0, rcx, ecx, cx,  cl,  ecx
DECLARE_REG 1, rdx, edx, dx,  dl,  edx
DECLARE_REG 2, r8,  r8d, r8w, r8b, r8d
DECLARE_REG 3, r9,  r9d, r9w, r9b, r9d
DECLARE_REG 4, rdi, edi, di,  dil, [rsp + stack_offset + 40]
DECLARE_REG 5, rsi, esi, si,  sil, [rsp + stack_offset + 48]
DECLARE_REG 6, rax, eax, ax,  al,  [rsp + stack_offset + 56]
%define r7m [rsp + stack_offset + 64]
%define r8m [rsp + stack_offset + 72]

%macro LOAD_IF_USED 2 ; reg_id, number_of_args
    %if %1 < %2
        mov r%1, [rsp + stack_offset + 8 + %1*8]
    %endif
%endmacro

%macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names...
    ASSERT %2 >= %1
    %assign regs_used %2
    ASSERT regs_used <= 7
    %if regs_used > 4
        push r4
        push r5
        %assign stack_offset stack_offset+16
    %endif
    WIN64_SPILL_XMM %3
    LOAD_IF_USED 4, %1
    LOAD_IF_USED 5, %1
    LOAD_IF_USED 6, %1
    DEFINE_ARGS %4
%endmacro

%macro WIN64_SPILL_XMM 1
    %assign xmm_regs_used %1
    %if mmsize == 8
        %assign xmm_regs_used 0
    %endif
    ASSERT xmm_regs_used <= 16
    %if xmm_regs_used > 6
        sub rsp, (xmm_regs_used-6)*16+16
        %assign stack_offset stack_offset+(xmm_regs_used-6)*16+16
        %assign %%i xmm_regs_used
        %rep (xmm_regs_used-6)
            %assign %%i %%i-1
            movdqa [rsp + (%%i-6)*16+8], xmm %+ %%i
        %endrep
    %endif
%endmacro

%macro WIN64_RESTORE_XMM_INTERNAL 1
    %if xmm_regs_used > 6
        %assign %%i xmm_regs_used
        %rep (xmm_regs_used-6)
            %assign %%i %%i-1
            movdqa xmm %+ %%i, [%1 + (%%i-6)*16+8]
        %endrep
        add %1, (xmm_regs_used-6)*16+16
    %endif
%endmacro

%macro WIN64_RESTORE_XMM 1
    WIN64_RESTORE_XMM_INTERNAL %1
    ; undo WIN64_SPILL_XMM's stack_offset adjustment (the full spill size)
    %if xmm_regs_used > 6
        %assign stack_offset stack_offset-((xmm_regs_used-6)*16+16)
    %endif
    %assign xmm_regs_used 0
%endmacro

%macro RET 0
    WIN64_RESTORE_XMM_INTERNAL rsp
    %if regs_used > 4
        pop r5
        pop r4
    %endif
    ret
%endmacro

%macro REP_RET 0
    %if regs_used > 4 || xmm_regs_used > 6
        RET
    %else
        rep ret
    %endif
%endmacro

%elifdef ARCH_X86_64 ; *nix x64 ;=============================================

DECLARE_REG 0, rdi, edi, di,  dil, edi
DECLARE_REG 1, rsi, esi, si,  sil, esi
DECLARE_REG 2, rdx, edx, dx,  dl,  edx
DECLARE_REG 3, rcx, ecx, cx,  cl,  ecx
DECLARE_REG 4, r8,  r8d, r8w, r8b, r8d
DECLARE_REG 5, r9,  r9d, r9w, r9b, r9d
DECLARE_REG 6, rax, eax, ax,  al,  [rsp + stack_offset + 8]
%define r7m [rsp + stack_offset + 16]
%define r8m [rsp + stack_offset + 24]

%macro LOAD_IF_USED 2 ; reg_id, number_of_args
    %if %1 < %2
        mov r%1, [rsp - 40 + %1*8]
    %endif
%endmacro

%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
    ASSERT %2 >= %1
    ASSERT %2 <= 7
    LOAD_IF_USED 6, %1
    DEFINE_ARGS %4
%endmacro

%macro RET 0
    ret
%endmacro

%macro REP_RET 0
    rep ret
%endmacro

%else ; X86_32 ;==============================================================

DECLARE_REG 0, eax, eax, ax, al,   [esp + stack_offset + 4]
DECLARE_REG 1, ecx, ecx, cx, cl,   [esp + stack_offset + 8]
DECLARE_REG 2, edx, edx, dx, dl,   [esp + stack_offset + 12]
DECLARE_REG 3, ebx, ebx, bx, bl,   [esp + stack_offset + 16]
DECLARE_REG 4, esi, esi, si, null, [esp + stack_offset + 20]
DECLARE_REG 5, edi, edi, di, null, [esp + stack_offset + 24]
DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
%define r7m [esp + stack_offset + 32]
%define r8m [esp + stack_offset + 36]
%define rsp esp

%macro PUSH_IF_USED 1 ; reg_id
    %if %1 < regs_used
        push r%1
        %assign stack_offset stack_offset+4
    %endif
%endmacro

%macro POP_IF_USED 1 ; reg_id
    %if %1 < regs_used
        pop r%1
    %endif
%endmacro

%macro LOAD_IF_USED 2 ; reg_id, number_of_args
    %if %1 < %2
        mov r%1, [esp + stack_offset + 4 + %1*4]
    %endif
%endmacro

%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
    ASSERT %2 >= %1
    %assign regs_used %2
    ASSERT regs_used <= 7
    PUSH_IF_USED 3
    PUSH_IF_USED 4
    PUSH_IF_USED 5
    PUSH_IF_USED 6
    LOAD_IF_USED 0, %1
    LOAD_IF_USED 1, %1
    LOAD_IF_USED 2, %1
    LOAD_IF_USED 3, %1
    LOAD_IF_USED 4, %1
    LOAD_IF_USED 5, %1
    LOAD_IF_USED 6, %1
    DEFINE_ARGS %4
%endmacro

%macro RET 0
    POP_IF_USED 6
    POP_IF_USED 5
    POP_IF_USED 4
    POP_IF_USED 3
    ret
%endmacro

%macro REP_RET 0
    %if regs_used > 3
        RET
    %else
        rep ret
    %endif
%endmacro

%endif ;======================================================================

%ifndef WIN64
%macro WIN64_SPILL_XMM 1
%endmacro
%macro WIN64_RESTORE_XMM 1
%endmacro
%endif



;=============================================================================
; arch-independent part
;=============================================================================

%assign function_align 16

; Begin a function.
; Applies any symbol mangling needed for C linkage, and sets up a define such that
; subsequent uses of the function name automatically refer to the mangled version.
; Appends cpuflags to the function name if cpuflags has been specified.
%macro cglobal 1-2+ ; name, [PROLOGUE args]
%if %0 == 1
    cglobal_internal %1 %+ SUFFIX
%else
    cglobal_internal %1 %+ SUFFIX, %2
%endif
%endmacro
%macro cglobal_internal 1-2+
    %ifndef cglobaled_%1
        %xdefine %1 mangle(program_name %+ _ %+ %1)
        %xdefine %1.skip_prologue %1 %+ .skip_prologue
        CAT_XDEFINE cglobaled_, %1, 1
    %endif
    %xdefine current_function %1
    %ifidn __OUTPUT_FORMAT__,elf
        global %1:function hidden
    %else
        global %1
    %endif
    align function_align
    %1:
    RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
    %assign stack_offset 0
    %if %0 > 1
        PROLOGUE %2
    %endif
%endmacro
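
; Minimal usage sketch (hypothetical function name):
;     INIT_XMM sse2
;     cglobal my_func, 3,3,4 ; 3 args, 3 gpr regs, 4 xmm regs
;         ...
;         RET
; emits an aligned, mangled global (e.g. x264_my_func_sse2 when program_name
; is x264) with the platform PROLOGUE applied.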

%macro cextern 1
    %xdefine %1 mangle(program_name %+ _ %+ %1)
    CAT_XDEFINE cglobaled_, %1, 1
    extern %1
%endmacro

; like cextern, but without the prefix
%macro cextern_naked 1
    %xdefine %1 mangle(%1)
    CAT_XDEFINE cglobaled_, %1, 1
    extern %1
%endmacro

%macro const 2+
    %xdefine %1 mangle(program_name %+ _ %+ %1)
    global %1
    %1: %2
%endmacro

; This is needed for ELF, otherwise the GNU linker assumes the stack is
; executable by default.
%ifidn __OUTPUT_FORMAT__,elf
SECTION .note.GNU-stack noalloc noexec nowrite progbits
%endif

; cpuflags

%assign cpuflags_mmx      (1<<0)
%assign cpuflags_mmx2     (1<<1) | cpuflags_mmx
%assign cpuflags_sse      (1<<2) | cpuflags_mmx2
%assign cpuflags_sse2     (1<<3) | cpuflags_sse
%assign cpuflags_sse2slow (1<<4) | cpuflags_sse2
%assign cpuflags_sse3     (1<<5) | cpuflags_sse2
%assign cpuflags_ssse3    (1<<6) | cpuflags_sse3
%assign cpuflags_sse4     (1<<7) | cpuflags_ssse3
%assign cpuflags_sse42    (1<<8) | cpuflags_sse4
%assign cpuflags_avx      (1<<9) | cpuflags_sse42
%assign cpuflags_xop      (1<<10)| cpuflags_avx
%assign cpuflags_fma4     (1<<11)| cpuflags_avx

%assign cpuflags_cache32  (1<<16)
%assign cpuflags_cache64  (1<<17)
%assign cpuflags_slowctz  (1<<18)
%assign cpuflags_lzcnt    (1<<19)
%assign cpuflags_misalign (1<<20)
%assign cpuflags_aligned  (1<<21) ; not a cpu feature, but a function variant

%define    cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x))
%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x))

; Takes up to 2 cpuflags from the above list.
; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the specified cpu.
; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co.
%macro INIT_CPUFLAGS 0-2
    %if %0 >= 1
        %xdefine cpuname %1
        %assign cpuflags cpuflags_%1
        %if %0 >= 2
            %xdefine cpuname %1_%2
            %assign cpuflags cpuflags | cpuflags_%2
        %endif
        %xdefine SUFFIX _ %+ cpuname
        %if cpuflag(avx)
            %assign avx_enabled 1
        %endif
        %if cpuflag(aligned)
            %define movu mova
        %elifidn %1, sse3
            %define movu lddqu
        %endif
    %else
        %xdefine SUFFIX
        %undef cpuname
        %undef cpuflags
    %endif
%endmacro

; merge mmx and sse*

%macro CAT_XDEFINE 3
    %xdefine %1%2 %3
%endmacro

%macro CAT_UNDEF 2
    %undef %1%2
%endmacro

%macro INIT_MMX 0-1+
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_MMX %1
    %define mmsize 8
    %define num_mmregs 8
    %define mova movq
    %define movu movq
    %define movh movd
    %define movnta movntq
    %assign %%i 0
    %rep 8
    CAT_XDEFINE m, %%i, mm %+ %%i
    CAT_XDEFINE nmm, %%i, %%i
    %assign %%i %%i+1
    %endrep
    %rep 8
    CAT_UNDEF m, %%i
    CAT_UNDEF nmm, %%i
    %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

%macro INIT_XMM 0-1+
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_XMM %1
    %define mmsize 16
    %define num_mmregs 8
    %ifdef ARCH_X86_64
    %define num_mmregs 16
    %endif
    %define mova movdqa
    %define movu movdqu
    %define movh movq
    %define movnta movntdq
    %assign %%i 0
    %rep num_mmregs
    CAT_XDEFINE m, %%i, xmm %+ %%i
    CAT_XDEFINE nxmm, %%i, %%i
    %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

%macro INIT_YMM 0-1+
    %assign avx_enabled 1
    %define RESET_MM_PERMUTATION INIT_YMM %1
    %define mmsize 32
    %define num_mmregs 8
    %ifdef ARCH_X86_64
    %define num_mmregs 16
    %endif
    %define mova vmovaps
    %define movu vmovups
    %undef movh
    %undef movnta
    %assign %%i 0
    %rep num_mmregs
    CAT_XDEFINE m, %%i, ymm %+ %%i
    CAT_XDEFINE nymm, %%i, %%i
    %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

INIT_XMM

; I often want to use macros that permute their arguments. e.g. there's no
; efficient way to implement butterfly or transpose or dct without swapping some
; arguments.
;
; I would like to not have to manually keep track of the permutations:
; If I insert a permutation in the middle of a function, it should automatically
; change everything that follows. For more complex macros I may also have multiple
; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
;
; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
; permutes its arguments. It's equivalent to exchanging the contents of the
; registers, except that this way you exchange the register names instead, so it
; doesn't cost any cycles.
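;
; Illustrative example: after "SWAP 0, 1", a later "mova m2, m0" assembles
; against the register previously named m1; the SWAP itself emits no
; instructions, it only updates the m# name table.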

%macro PERMUTE 2-* ; takes a list of pairs to swap
%rep %0/2
    %xdefine tmp%2 m%2
    %xdefine ntmp%2 nm%2
    %rotate 2
%endrep
%rep %0/2
    %xdefine m%1 tmp%2
    %xdefine nm%1 ntmp%2
    %undef tmp%2
    %undef ntmp%2
    %rotate 2
%endrep
%endmacro

%macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs)
%rep %0-1
%ifdef m%1
    %xdefine tmp m%1
    %xdefine m%1 m%2
    %xdefine m%2 tmp
    CAT_XDEFINE n, m%1, %1
    CAT_XDEFINE n, m%2, %2
%else
    ; If we were called as "SWAP m0,m1" rather than "SWAP 0,1", infer the original numbers here.
    ; Be careful using this mode in nested macros though, as in some cases there may be
    ; other copies of m# that have already been dereferenced and don't get updated correctly.
    %xdefine %%n1 n %+ %1
    %xdefine %%n2 n %+ %2
    %xdefine tmp m %+ %%n1
    CAT_XDEFINE m, %%n1, m %+ %%n2
    CAT_XDEFINE m, %%n2, tmp
    CAT_XDEFINE n, m %+ %%n1, %%n1
    CAT_XDEFINE n, m %+ %%n2, %%n2
%endif
    %undef tmp
    %rotate 1
%endrep
%endmacro

; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later
; calls to that function will automatically load the permutation, so values can
; be returned in mmregs.
%macro SAVE_MM_PERMUTATION 0-1
    %if %0
        %xdefine %%f %1_m
    %else
        %xdefine %%f current_function %+ _m
    %endif
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE %%f, %%i, m %+ %%i
    %assign %%i %%i+1
    %endrep
%endmacro

%macro LOAD_MM_PERMUTATION 1 ; name to load from
    %ifdef %1_m0
        %assign %%i 0
        %rep num_mmregs
            CAT_XDEFINE m, %%i, %1_m %+ %%i
            CAT_XDEFINE n, m %+ %%i, %%i
        %assign %%i %%i+1
        %endrep
    %endif
%endmacro

; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
%macro call 1
    call_internal %1, %1 %+ SUFFIX
%endmacro
%macro call_internal 2
    %xdefine %%i %1
    %ifndef cglobaled_%1
        %ifdef cglobaled_%2
            %xdefine %%i %2
        %endif
    %endif
    call %%i
    LOAD_MM_PERMUTATION %%i
%endmacro
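
; Illustrative flow (hypothetical callee "foo"): if foo ends with
; SAVE_MM_PERMUTATION, then "call foo" both resolves to the suffixed symbol
; (e.g. foo_sse2) when only that variant exists, and re-applies foo's saved
; m# name table at the call site, so values the callee returns in renamed
; mmregs are referenced correctly by the caller.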

; Substitutions that reduce instruction size but are functionally equivalent
%macro add 2
    %ifnum %2
        %if %2==128
            sub %1, -128
        %else
            add %1, %2
        %endif
    %else
        add %1, %2
    %endif
%endmacro

%macro sub 2
    %ifnum %2
        %if %2==128
            add %1, -128
        %else
            sub %1, %2
        %endif
    %else
        sub %1, %2
    %endif
%endmacro
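
; Rationale, illustrated: "add r0, 128" requires a 4-byte immediate because
; imm8 is sign-extended (range -128..127), whereas the equivalent
; "sub r0, -128" fits in a 1-byte imm8, saving 3 bytes per occurrence.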

;=============================================================================
; AVX abstraction layer
;=============================================================================

%assign i 0
%rep 16
    %if i < 8
        CAT_XDEFINE sizeofmm, i, 8
    %endif
    CAT_XDEFINE sizeofxmm, i, 16
    CAT_XDEFINE sizeofymm, i, 32
%assign i i+1
%endrep
%undef i

;%1 == instruction
;%2 == 1 if float, 0 if int
;%3 == 0 if 3-operand (xmm, xmm, xmm), 1 if 4-operand (xmm, xmm, xmm, imm)
;%4 == number of operands given
;%5+: operands
%macro RUN_AVX_INSTR 6-7+
    %if sizeof%5==32
        v%1 %5, %6, %7
    %else
        %if sizeof%5==8
            %define %%regmov movq
        %elif %2
            %define %%regmov movaps
        %else
            %define %%regmov movdqa
        %endif

        %if %4>=3+%3
            %ifnidn %5, %6
                %if avx_enabled && sizeof%5==16
                    v%1 %5, %6, %7
                %else
                    %%regmov %5, %6
                    %1 %5, %7
                %endif
            %else
                %1 %5, %7
            %endif
        %elif %3
            %1 %5, %6, %7
        %else
            %1 %5, %6
        %endif
    %endif
%endmacro

;%1 == instruction
;%2 == 1 if float, 0 if int
;%3 == 0 if 3-operand (xmm, xmm, xmm), 1 if 4-operand (xmm, xmm, xmm, imm)
%macro AVX_INSTR 3
    %macro %1 2-8 fnord, fnord, fnord, %1, %2, %3
        %ifidn %3, fnord
            RUN_AVX_INSTR %6, %7, %8, 2, %1, %2
        %elifidn %4, fnord
            RUN_AVX_INSTR %6, %7, %8, 3, %1, %2, %3
        %elifidn %5, fnord
            RUN_AVX_INSTR %6, %7, %8, 4, %1, %2, %3, %4
        %else
            RUN_AVX_INSTR %6, %7, %8, 5, %1, %2, %3, %4, %5
        %endif
    %endmacro
%endmacro
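
; Illustrative expansion: with "INIT_XMM avx" in effect,
;     addps m0, m1, m2  ->  vaddps xmm0, xmm1, xmm2
; while under plain "INIT_XMM sse2" the same line becomes
;     movaps xmm0, xmm1
;     addps  xmm0, xmm2
; so code can be written in 3-operand form yet still assemble for SSE.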

AVX_INSTR addpd, 1, 0
AVX_INSTR addps, 1, 0
AVX_INSTR addsd, 1, 0
AVX_INSTR addss, 1, 0
AVX_INSTR addsubpd, 1, 0
AVX_INSTR addsubps, 1, 0
AVX_INSTR andpd, 1, 0
AVX_INSTR andps, 1, 0
AVX_INSTR andnpd, 1, 0
AVX_INSTR andnps, 1, 0
AVX_INSTR blendpd, 1, 0
AVX_INSTR blendps, 1, 0
AVX_INSTR blendvpd, 1, 0
AVX_INSTR blendvps, 1, 0
AVX_INSTR cmppd, 1, 0
AVX_INSTR cmpps, 1, 0
AVX_INSTR cmpsd, 1, 0
AVX_INSTR cmpss, 1, 0
AVX_INSTR divpd, 1, 0
AVX_INSTR divps, 1, 0
AVX_INSTR divsd, 1, 0
AVX_INSTR divss, 1, 0
AVX_INSTR dppd, 1, 0
AVX_INSTR dpps, 1, 0
AVX_INSTR haddpd, 1, 0
AVX_INSTR haddps, 1, 0
AVX_INSTR hsubpd, 1, 0
AVX_INSTR hsubps, 1, 0
AVX_INSTR maxpd, 1, 0
AVX_INSTR maxps, 1, 0
AVX_INSTR maxsd, 1, 0
AVX_INSTR maxss, 1, 0
AVX_INSTR minpd, 1, 0
AVX_INSTR minps, 1, 0
AVX_INSTR minsd, 1, 0
AVX_INSTR minss, 1, 0
AVX_INSTR movsd, 1, 0
AVX_INSTR movss, 1, 0
AVX_INSTR mpsadbw, 0, 1
AVX_INSTR mulpd, 1, 0
AVX_INSTR mulps, 1, 0
AVX_INSTR mulsd, 1, 0
AVX_INSTR mulss, 1, 0
AVX_INSTR orpd, 1, 0
AVX_INSTR orps, 1, 0
AVX_INSTR packsswb, 0, 0
AVX_INSTR packssdw, 0, 0
AVX_INSTR packuswb, 0, 0
AVX_INSTR packusdw, 0, 0
AVX_INSTR paddb, 0, 0
AVX_INSTR paddw, 0, 0
AVX_INSTR paddd, 0, 0
AVX_INSTR paddq, 0, 0
AVX_INSTR paddsb, 0, 0
AVX_INSTR paddsw, 0, 0
AVX_INSTR paddusb, 0, 0
AVX_INSTR paddusw, 0, 0
AVX_INSTR palignr, 0, 1
AVX_INSTR pand, 0, 0
AVX_INSTR pandn, 0, 0
AVX_INSTR pavgb, 0, 0
AVX_INSTR pavgw, 0, 0
AVX_INSTR pblendvb, 0, 0
AVX_INSTR pblendw, 0, 1
AVX_INSTR pcmpestri, 0, 0
AVX_INSTR pcmpestrm, 0, 0
AVX_INSTR pcmpistri, 0, 0
AVX_INSTR pcmpistrm, 0, 0
AVX_INSTR pcmpeqb, 0, 0
AVX_INSTR pcmpeqw, 0, 0
AVX_INSTR pcmpeqd, 0, 0
AVX_INSTR pcmpeqq, 0, 0
AVX_INSTR pcmpgtb, 0, 0
AVX_INSTR pcmpgtw, 0, 0
AVX_INSTR pcmpgtd, 0, 0
AVX_INSTR pcmpgtq, 0, 0
AVX_INSTR phaddw, 0, 0
AVX_INSTR phaddd, 0, 0
AVX_INSTR phaddsw, 0, 0
AVX_INSTR phsubw, 0, 0
AVX_INSTR phsubd, 0, 0
AVX_INSTR phsubsw, 0, 0
AVX_INSTR pmaddwd, 0, 0
AVX_INSTR pmaddubsw, 0, 0
AVX_INSTR pmaxsb, 0, 0
AVX_INSTR pmaxsw, 0, 0
AVX_INSTR pmaxsd, 0, 0
AVX_INSTR pmaxub, 0, 0
AVX_INSTR pmaxuw, 0, 0
AVX_INSTR pmaxud, 0, 0
AVX_INSTR pminsb, 0, 0
AVX_INSTR pminsw, 0, 0
AVX_INSTR pminsd, 0, 0
AVX_INSTR pminub, 0, 0
AVX_INSTR pminuw, 0, 0
AVX_INSTR pminud, 0, 0
AVX_INSTR pmulhuw, 0, 0
AVX_INSTR pmulhrsw, 0, 0
AVX_INSTR pmulhw, 0, 0
AVX_INSTR pmullw, 0, 0
AVX_INSTR pmulld, 0, 0
AVX_INSTR pmuludq, 0, 0
AVX_INSTR pmuldq, 0, 0
AVX_INSTR por, 0, 0
AVX_INSTR psadbw, 0, 0
AVX_INSTR pshufb, 0, 0
AVX_INSTR psignb, 0, 0
AVX_INSTR psignw, 0, 0
AVX_INSTR psignd, 0, 0
AVX_INSTR psllw, 0, 0
AVX_INSTR pslld, 0, 0
AVX_INSTR psllq, 0, 0
AVX_INSTR pslldq, 0, 0
AVX_INSTR psraw, 0, 0
AVX_INSTR psrad, 0, 0
AVX_INSTR psrlw, 0, 0
AVX_INSTR psrld, 0, 0
AVX_INSTR psrlq, 0, 0
AVX_INSTR psrldq, 0, 0
AVX_INSTR psubb, 0, 0
AVX_INSTR psubw, 0, 0
AVX_INSTR psubd, 0, 0
AVX_INSTR psubq, 0, 0
AVX_INSTR psubsb, 0, 0
AVX_INSTR psubsw, 0, 0
AVX_INSTR psubusb, 0, 0
AVX_INSTR psubusw, 0, 0
AVX_INSTR punpckhbw, 0, 0
AVX_INSTR punpckhwd, 0, 0
AVX_INSTR punpckhdq, 0, 0
AVX_INSTR punpckhqdq, 0, 0
AVX_INSTR punpcklbw, 0, 0
AVX_INSTR punpcklwd, 0, 0
AVX_INSTR punpckldq, 0, 0
AVX_INSTR punpcklqdq, 0, 0
AVX_INSTR pxor, 0, 0
AVX_INSTR shufps, 0, 1
AVX_INSTR subpd, 1, 0
AVX_INSTR subps, 1, 0
AVX_INSTR subsd, 1, 0
AVX_INSTR subss, 1, 0
AVX_INSTR unpckhpd, 1, 0
AVX_INSTR unpckhps, 1, 0
AVX_INSTR unpcklpd, 1, 0
AVX_INSTR unpcklps, 1, 0
AVX_INSTR xorpd, 1, 0
AVX_INSTR xorps, 1, 0

; 3DNow instructions, for sharing code between AVX, SSE and 3DNow
AVX_INSTR pfadd, 1, 0
AVX_INSTR pfsub, 1, 0
AVX_INSTR pfmul, 1, 0

; base-4 constants for shuffles
%assign i 0
%rep 256
    %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3)
    %if j < 10
        CAT_XDEFINE q000, j, i
    %elif j < 100
        CAT_XDEFINE q00, j, i
    %elif j < 1000
        CAT_XDEFINE q0, j, i
    %else
        CAT_XDEFINE q, j, i
    %endif
%assign i i+1
%endrep
%undef i
%undef j
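
; Illustrative values: each digit selects a source element (0-3), e.g.
;     q1032 = (1<<6)|(0<<4)|(3<<2)|2 = 0x4e  (swap the two halves)
;     q3210 = 0xe4                           (identity)
; directly usable as pshufd/pshufw/pshuflw immediates.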

%macro FMA_INSTR 3
    %macro %1 4-7 %1, %2, %3
        %if cpuflag(xop)
            v%5 %1, %2, %3, %4
        %else
            %6 %1, %2, %3
            %7 %1, %4
        %endif
    %endmacro
%endmacro

FMA_INSTR  pmacsdd,  pmulld, paddd
FMA_INSTR  pmacsww,  pmullw, paddw
FMA_INSTR pmadcswd, pmaddwd, paddd
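
; Illustrative use: "pmacsdd m0, m1, m2, m3" computes m0 = m1*m2 + m3 per
; dword lane: a single vpmacsdd on XOP cpus, pmulld followed by paddd
; elsewhere (with the 3-operand mul handled by the AVX layer above).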
x264-snapshot-20120103-2245-stable/common/x86/x86util.asm
;*****************************************************************************
;* x86util.asm: x86 utility macros
;*****************************************************************************
;* Copyright (C) 2008-2011 x264 project
;*
;* Authors: Holger Lubitz <holger@lubitz.org>
;*          Loren Merritt <lorenm@u.washington.edu>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************

%assign FENC_STRIDE 16
%assign FDEC_STRIDE 32

%assign SIZEOF_PIXEL 1
%assign SIZEOF_DCTCOEF 2
%define pixel byte
%ifdef HIGH_BIT_DEPTH
    %assign SIZEOF_PIXEL 2
    %assign SIZEOF_DCTCOEF 4
    %define pixel word
%endif

%assign FENC_STRIDEB SIZEOF_PIXEL*FENC_STRIDE
%assign FDEC_STRIDEB SIZEOF_PIXEL*FDEC_STRIDE

%assign PIXEL_MAX ((1 << BIT_DEPTH)-1)

%macro FIX_STRIDES 1-*
%ifdef HIGH_BIT_DEPTH
%rep %0
    add %1, %1
    %rotate 1
%endrep
%endif
%endmacro


%macro SBUTTERFLY 4
%if avx_enabled && mmsize == 16
    punpckh%1 m%4, m%2, m%3
    punpckl%1 m%2, m%3
%else
    mova      m%4, m%2
    punpckl%1 m%2, m%3
    punpckh%1 m%4, m%3
%endif
    SWAP %3, %4
%endmacro
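
; Illustrative effect of "SBUTTERFLY wd, 0, 1, 2" on word lanes, assuming
; hypothetical contents a0..a7 in m0 and b0..b7 in m1:
;     m0 <- a0 b0 a1 b1 a2 b2 a3 b3  (punpcklwd)
;     m1 <- a4 b4 a5 b5 a6 b6 a7 b7  (punpckhwd result, renamed via SWAP)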

%macro SBUTTERFLY2 4
    punpckl%1 m%4, m%2, m%3
    punpckh%1 m%2, m%2, m%3
    SWAP %2, %4, %3
%endmacro

%macro TRANSPOSE4x4W 5
    SBUTTERFLY wd, %1, %2, %5
    SBUTTERFLY wd, %3, %4, %5
    SBUTTERFLY dq, %1, %3, %5
    SBUTTERFLY dq, %2, %4, %5
    SWAP %2, %3
%endmacro

%macro TRANSPOSE2x4x4W 5
    SBUTTERFLY wd,  %1, %2, %5
    SBUTTERFLY wd,  %3, %4, %5
    SBUTTERFLY dq,  %1, %3, %5
    SBUTTERFLY dq,  %2, %4, %5
    SBUTTERFLY qdq, %1, %2, %5
    SBUTTERFLY qdq, %3, %4, %5
%endmacro

%macro TRANSPOSE4x4D 5
    SBUTTERFLY dq,  %1, %2, %5
    SBUTTERFLY dq,  %3, %4, %5
    SBUTTERFLY qdq, %1, %3, %5
    SBUTTERFLY qdq, %2, %4, %5
    SWAP %2, %3
%endmacro

%macro TRANSPOSE8x8W 9-11
%ifdef ARCH_X86_64
    SBUTTERFLY wd,  %1, %2, %9
    SBUTTERFLY wd,  %3, %4, %9
    SBUTTERFLY wd,  %5, %6, %9
    SBUTTERFLY wd,  %7, %8, %9
    SBUTTERFLY dq,  %1, %3, %9
    SBUTTERFLY dq,  %2, %4, %9
    SBUTTERFLY dq,  %5, %7, %9
    SBUTTERFLY dq,  %6, %8, %9
    SBUTTERFLY qdq, %1, %5, %9
    SBUTTERFLY qdq, %2, %6, %9
    SBUTTERFLY qdq, %3, %7, %9
    SBUTTERFLY qdq, %4, %8, %9
    SWAP %2, %5
    SWAP %4, %7
%else
; in:  m0..m7, unless %11 in which case m6 is in %9
; out: m0..m7, unless %11 in which case m4 is in %10
; spills into %9 and %10
%if %0<11
    movdqa %9, m%7
%endif
    SBUTTERFLY wd,  %1, %2, %7
    movdqa %10, m%2
    movdqa m%7, %9
    SBUTTERFLY wd,  %3, %4, %2
    SBUTTERFLY wd,  %5, %6, %2
    SBUTTERFLY wd,  %7, %8, %2
    SBUTTERFLY dq,  %1, %3, %2
    movdqa %9, m%3
    movdqa m%2, %10
    SBUTTERFLY dq,  %2, %4, %3
    SBUTTERFLY dq,  %5, %7, %3
    SBUTTERFLY dq,  %6, %8, %3
    SBUTTERFLY qdq, %1, %5, %3
    SBUTTERFLY qdq, %2, %6, %3
    movdqa %10, m%2
    movdqa m%3, %9
    SBUTTERFLY qdq, %3, %7, %2
    SBUTTERFLY qdq, %4, %8, %2
    SWAP %2, %5
    SWAP %4, %7
%if %0<11
    movdqa m%5, %10
%endif
%endif
%endmacro

%macro ABSW 2-3 ; dst, src, tmp (tmp used only if dst==src)
%if cpuflag(ssse3)
    pabsw   %1, %2
%elifidn %3, sign ; version for pairing with PSIGNW: modifies src
    pxor    %1, %1
    pcmpgtw %1, %2
    pxor    %2, %1
    psubw   %2, %1
    SWAP    %1, %2
%elifidn %1, %2
    pxor    %3, %3
    psubw   %3, %1
    pmaxsw  %1, %3
%elifid %2
    pxor    %1, %1
    psubw   %1, %2
    pmaxsw  %1, %2
%elif %0 == 2
    pxor    %1, %1
    psubw   %1, %2
    pmaxsw  %1, %2
%else
    mova    %1, %2
    pxor    %3, %3
    psubw   %3, %1
    pmaxsw  %1, %3
%endif
%endmacro

%macro ABSW2 6 ; dst1, dst2, src1, src2, tmp, tmp
%if cpuflag(ssse3)
    pabsw   %1, %3
    pabsw   %2, %4
%elifidn %1, %3
    pxor    %5, %5
    pxor    %6, %6
    psubw   %5, %1
    psubw   %6, %2
    pmaxsw  %1, %5
    pmaxsw  %2, %6
%else
    pxor    %1, %1
    pxor    %2, %2
    psubw   %1, %3
    psubw   %2, %4
    pmaxsw  %1, %3
    pmaxsw  %2, %4
%endif
%endmacro

%macro ABSB 2
%if cpuflag(ssse3)
    pabsb   %1, %1
%else
    pxor    %2, %2
    psubb   %2, %1
    pminub  %1, %2
%endif
%endmacro

%macro ABSD 2
%if cpuflag(ssse3)
    pabsd   %1, %2
%else
    pxor    %1, %1
    pcmpgtd %1, %2
    pxor    %2, %1
    psubd   %2, %1
    SWAP    %1, %2
%endif
%endmacro

%macro PSIGN 3-4
%if cpuflag(ssse3) && %0 == 4
    psign%1 %2, %3, %4
%elif cpuflag(ssse3)
    psign%1 %2, %3
%elif %0 == 4
    pxor    %2, %3, %4
    psub%1  %2, %4
%else
    pxor    %2, %3
    psub%1  %2, %3
%endif
%endmacro

%define PSIGNW PSIGN w,
%define PSIGND PSIGN d,

%macro SPLATB_LOAD 3
%if cpuflag(ssse3)
    movd      %1, [%2-3]
    pshufb    %1, %3
%else
    movd      %1, [%2-3] ;to avoid crossing a cacheline
    punpcklbw %1, %1
    SPLATW    %1, %1, 3
%endif
%endmacro

%imacro SPLATW 2-3 0
    PSHUFLW    %1, %2, (%3)*q1111
%if mmsize == 16
    punpcklqdq %1, %1
%endif
%endmacro

%imacro SPLATD 2-3 0
%if mmsize == 16
    pshufd %1, %2, (%3)*q1111
%else
    pshufw %1, %2, (%3)*q0101 + ((%3)+1)*q1010
%endif
%endmacro

%macro CLIPW 3 ;(dst, min, max)
    pmaxsw %1, %2
    pminsw %1, %3
%endmacro

%macro HADDD 2 ; sum junk
%if mmsize == 16
    movhlps %2, %1
    paddd   %1, %2
%endif
    PSHUFLW %2, %1, q0032
    paddd   %1, %2
%endmacro

%macro HADDW 2
%if cpuflag(xop) && mmsize == 16
    vphaddwq  %1, %1
    movhlps   %2, %1
    paddd     %1, %2
%else
    pmaddwd %1, [pw_1]
    HADDD   %1, %2
%endif
%endmacro

%macro HADDUW 2
%if cpuflag(xop) && mmsize == 16
    vphadduwq %1, %1
    movhlps   %2, %1
    paddd     %1, %2
%else
    psrld %2, %1, 16
    pslld %1, 16
    psrld %1, 16
    paddd %1, %2
    HADDD %1, %2
%endif
%endmacro
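
; Illustrative reduction (xmm case): HADDW collapses word lanes to dword
; sums via pmaddwd with pw_1, then HADDD folds the vector: movhlps+paddd
; halves four dwords to two, and PSHUFLW q0032 + paddd leaves the total in
; the low dword. HADDUW is the variant for word sums that may exceed
; 0x7fff, splitting even/odd words with shifts so they are treated as
; unsigned.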

%macro PALIGNR 4-5 ; [dst,] src1, src2, imm, tmp
%if cpuflag(ssse3)
    %if %0==5
        palignr %1, %2, %3, %4
    %else
        palignr %1, %2, %3
    %endif
%else
    %define %%dst %1
    %if %0==5
        %ifnidn %1, %2
            mova %%dst, %2
        %endif
        %rotate 1
    %endif
    %ifnidn %4, %2
        mova %4, %2
    %endif
    %if mmsize==8
        psllq  %%dst, (8-%3)*8
        psrlq  %4, %3*8
    %else
        pslldq %%dst, 16-%3
        psrldq %4, %3
    %endif
    por %%dst, %4
%endif
%endmacro

%macro PSHUFLW 1+
    %if mmsize == 8
        pshufw %1
    %else
        pshuflw %1
    %endif
%endmacro

; shift an mmxreg by n bytes, or an xmmreg by 2*n bytes
; values shifted in are undefined
; faster if dst==src
%define PSLLPIX PSXLPIX l, -1, ;dst, src, shift
%define PSRLPIX PSXLPIX r,  1, ;dst, src, shift
%macro PSXLPIX 5
    %if mmsize == 8
        %if %5&1
            ps%1lq %3, %4, %5*8
        %else
            pshufw %3, %4, (q3210<<8>>(8+%2*%5))&0xff
        %endif
    %else
        ps%1ldq %3, %4, %5*2
    %endif
%endmacro

%macro DEINTB 5 ; mask, reg1, mask, reg2, optional src to fill masks from
%ifnum %5
    pand   m%3, m%5, m%4 ; src .. y6 .. y4
    pand   m%1, m%5, m%2 ; dst .. y6 .. y4
%else
    mova   m%1, %5
    pand   m%3, m%1, m%4 ; src .. y6 .. y4
    pand   m%1, m%1, m%2 ; dst .. y6 .. y4
%endif
    psrlw  m%2, 8        ; dst .. y7 .. y5
    psrlw  m%4, 8        ; src .. y7 .. y5
%endmacro

%macro SUMSUB_BA 3-4
%if %0==3
    padd%1  m%2, m%3
    padd%1  m%3, m%3
    psub%1  m%3, m%2
%elif avx_enabled
    padd%1  m%4, m%2, m%3
    psub%1  m%3, m%2
    SWAP    %2, %4
%else
    mova    m%4, m%2
    padd%1  m%2, m%3
    psub%1  m%3, m%4
%endif
%endmacro

%macro SUMSUB_BADC 5-6
%if %0==6
    SUMSUB_BA %1, %2, %3, %6
    SUMSUB_BA %1, %4, %5, %6
%else
    padd%1  m%2, m%3
    padd%1  m%4, m%5
    padd%1  m%3, m%3
    padd%1  m%5, m%5
    psub%1  m%3, m%2
    psub%1  m%5, m%4
%endif
%endmacro

%macro HADAMARD4_V 4+
    SUMSUB_BADC w, %1, %2, %3, %4
    SUMSUB_BADC w, %1, %3, %2, %4
%endmacro

%macro HADAMARD8_V 8+
    SUMSUB_BADC w, %1, %2, %3, %4
    SUMSUB_BADC w, %5, %6, %7, %8
    SUMSUB_BADC w, %1, %3, %2, %4
    SUMSUB_BADC w, %5, %7, %6, %8
    SUMSUB_BADC w, %1, %5, %2, %6
    SUMSUB_BADC w, %3, %7, %4, %8
%endmacro

%macro TRANS_SSE2 5-6
; TRANSPOSE2x2
; %1: transpose width (d/q) - use SBUTTERFLY qdq for dq
; %2: ord/unord (for compat with sse4, unused)
; %3/%4: source regs
; %5/%6: tmp regs
%ifidn %1, d
%define mask [mask_10]
%define shift 16
%elifidn %1, q
%define mask [mask_1100]
%define shift 32
%endif
%if %0==6 ; less dependency if we have two tmp
    mova   m%5, mask   ; ff00
    mova   m%6, m%4    ; x5x4
    psll%1 m%4, shift  ; x4..
    pand   m%6, m%5    ; x5..
    pandn  m%5, m%3    ; ..x0
    psrl%1 m%3, shift  ; ..x1
    por    m%4, m%5    ; x4x0
    por    m%3, m%6    ; x5x1
%else ; more dependency, one insn less. sometimes faster, sometimes not
    mova   m%5, m%4    ; x5x4
    psll%1 m%4, shift  ; x4..
    pxor   m%4, m%3    ; (x4^x1)x0
    pand   m%4, mask   ; (x4^x1)..
    pxor   m%3, m%4    ; x4x0
    psrl%1 m%4, shift  ; ..(x1^x4)
    pxor   m%5, m%4    ; x5x1
    SWAP   %4, %3, %5
%endif
%endmacro

%macro TRANS_SSE4 5-6 ; see above
%ifidn %1, d
%ifidn %2, ord
    psrl%1  m%5, m%3, 16
    pblendw m%5, m%4, q2222
    psll%1  m%4, 16
    pblendw m%4, m%3, q1111
    SWAP     %3, %5
%else
%if avx_enabled
    pblendw m%5, m%3, m%4, q2222
    SWAP     %3, %5
%else
    mova    m%5, m%3
    pblendw m%3, m%4, q2222
%endif
    psll%1  m%4, 16
    psrl%1  m%5, 16
    por     m%4, m%5
%endif
%elifidn %1, q
    shufps m%5, m%3, m%4, q3131
    shufps m%3, m%4, q2020
    SWAP    %4, %5
%endif
%endmacro

%macro TRANS_XOP 5-6
%ifidn %1, d
    vpperm m%5, m%3, m%4, [transd_shuf1]
    vpperm m%3, m%3, m%4, [transd_shuf2]
%elifidn %1, q
    shufps m%5, m%3, m%4, q3131
    shufps m%3, m%4, q2020
%endif
    SWAP    %4, %5
%endmacro

%macro HADAMARD 5-6
; %1=distance in words (0 for vertical pass, 1/2/4 for horizontal passes)
; %2=sumsub/max/amax (sum and diff / maximum / maximum of absolutes)
; %3/%4: regs
; %5(%6): tmpregs
%if %1!=0 ; have to reorder stuff for horizontal op
    %ifidn %2, sumsub
         %define ORDER ord
         ; sumsub needs order because a-b != b-a unless a=b
    %else
         %define ORDER unord
         ; if we just max, order doesn't matter (allows pblendw+or in sse4)
    %endif
    %if %1==1
         TRANS d, ORDER, %3, %4, %5, %6
    %elif %1==2
         %if mmsize==8
             SBUTTERFLY dq, %3, %4, %5
         %else
             TRANS q, ORDER, %3, %4, %5, %6
         %endif
    %elif %1==4
         SBUTTERFLY qdq, %3, %4, %5
    %endif
%endif
%ifidn %2, sumsub
    SUMSUB_BA w, %3, %4, %5
%else
    %ifidn %2, amax
        %if %0==6
            ABSW2 m%3, m%4, m%3, m%4, m%5, m%6
        %else
            ABSW m%3, m%3, m%5
            ABSW m%4, m%4, m%5
        %endif
    %endif
    pmaxsw m%3, m%4
%endif
%endmacro


%macro HADAMARD2_2D 6-7 sumsub
    HADAMARD 0, sumsub, %1, %2, %5
    HADAMARD 0, sumsub, %3, %4, %5
    SBUTTERFLY %6, %1, %2, %5
%ifnum %7
    HADAMARD 0, amax, %1, %2, %5, %7
%else
    HADAMARD 0, %7, %1, %2, %5
%endif
    SBUTTERFLY %6, %3, %4, %5
%ifnum %7
    HADAMARD 0, amax, %3, %4, %5, %7
%else
    HADAMARD 0, %7, %3, %4, %5
%endif
%endmacro

%macro HADAMARD4_2D 5-6 sumsub
    HADAMARD2_2D %1, %2, %3, %4, %5, wd
    HADAMARD2_2D %1, %3, %2, %4, %5, dq, %6
    SWAP %2, %3
%endmacro

%macro HADAMARD4_2D_SSE 5-6 sumsub
    HADAMARD  0, sumsub, %1, %2, %5 ; 1st V row 0 + 1
    HADAMARD  0, sumsub, %3, %4, %5 ; 1st V row 2 + 3
    SBUTTERFLY   wd, %1, %2, %5     ; %1: m0 1+0 %2: m1 1+0
    SBUTTERFLY   wd, %3, %4, %5     ; %3: m0 3+2 %4: m1 3+2
    HADAMARD2_2D %1, %3, %2, %4, %5, dq
    SBUTTERFLY  qdq, %1, %2, %5
    HADAMARD  0, %6, %1, %2, %5     ; 2nd H m1/m0 row 0+1
    SBUTTERFLY  qdq, %3, %4, %5
    HADAMARD  0, %6, %3, %4, %5     ; 2nd H m1/m0 row 2+3
%endmacro

%macro HADAMARD8_2D 9-10 sumsub
    HADAMARD2_2D %1, %2, %3, %4, %9, wd
    HADAMARD2_2D %5, %6, %7, %8, %9, wd
    HADAMARD2_2D %1, %3, %2, %4, %9, dq
    HADAMARD2_2D %5, %7, %6, %8, %9, dq
    HADAMARD2_2D %1, %5, %3, %7, %9, qdq, %10
    HADAMARD2_2D %2, %6, %4, %8, %9, qdq, %10
%ifnidn %10, amax
    SWAP %2, %5
    SWAP %4, %7
%endif
%endmacro

; doesn't include the "pmaddubsw hmul_8p" pass
%macro HADAMARD8_2D_HMUL 10
    HADAMARD4_V %1, %2, %3, %4, %9
    HADAMARD4_V %5, %6, %7, %8, %9
    SUMSUB_BADC w, %1, %5, %2, %6, %9
    HADAMARD 2, sumsub, %1, %5, %9, %10
    HADAMARD 2, sumsub, %2, %6, %9, %10
    SUMSUB_BADC w, %3, %7, %4, %8, %9
    HADAMARD 2, sumsub, %3, %7, %9, %10
    HADAMARD 2, sumsub, %4, %8, %9, %10
    HADAMARD 1, amax, %1, %5, %9, %10
    HADAMARD 1, amax, %2, %6, %9, %5
    HADAMARD 1, amax, %3, %7, %9, %5
    HADAMARD 1, amax, %4, %8, %9, %5
%endmacro

%macro SUMSUB2_AB 4
%ifnum %3
    psub%1  m%4, m%2, m%3
    psub%1  m%4, m%3
    padd%1  m%2, m%2
    padd%1  m%2, m%3
%else
    mova    m%4, m%2
    padd%1  m%2, m%2
    padd%1  m%2, %3
    psub%1  m%4, %3
    psub%1  m%4, %3
%endif
%endmacro

%macro SUMSUB2_BA 4
%if avx_enabled
    padd%1  m%4, m%2, m%3
    padd%1  m%4, m%3
    psub%1  m%3, m%2
    psub%1  m%3, m%2
    SWAP     %2,  %4
%else
    mova    m%4, m%2
    padd%1  m%2, m%3
    padd%1  m%2, m%3
    psub%1  m%3, m%4
    psub%1  m%3, m%4
%endif
%endmacro

%macro SUMSUBD2_AB 5
%ifnum %4
    psra%1  m%5, m%2, 1  ; %3: %3>>1
    psra%1  m%4, m%3, 1  ; %2: %2>>1
    padd%1  m%4, m%2     ; %3: %3>>1+%2
    psub%1  m%5, m%3     ; %2: %2>>1-%3
    SWAP     %2, %5
    SWAP     %3, %4
%else
    mova    %5, m%2
    mova    %4, m%3
    psra%1  m%3, 1  ; %3: %3>>1
    psra%1  m%2, 1  ; %2: %2>>1
    padd%1  m%3, %5 ; %3: %3>>1+%2
    psub%1  m%2, %4 ; %2: %2>>1-%3
%endif
%endmacro

%macro DCT4_1D 5
%ifnum %5
    SUMSUB_BADC w, %4, %1, %3, %2, %5
    SUMSUB_BA   w, %3, %4, %5
    SUMSUB2_AB  w, %1, %2, %5
    SWAP %1, %3, %4, %5, %2
%else
    SUMSUB_BADC w, %4, %1, %3, %2
    SUMSUB_BA   w, %3, %4
    mova     [%5], m%2
    SUMSUB2_AB  w, %1, [%5], %2
    SWAP %1, %3, %4, %2
%endif
%endmacro

%macro IDCT4_1D 6-7
%ifnum %6
    SUMSUBD2_AB %1, %3, %5, %7, %6
    ; %3: %3>>1-%5 %5: %3+%5>>1
    SUMSUB_BA   %1, %4, %2, %7
    ; %4: %2+%4 %2: %2-%4
    SUMSUB_BADC %1, %5, %4, %3, %2, %7
    ; %5: %2+%4 + (%3+%5>>1)
    ; %4: %2+%4 - (%3+%5>>1)
    ; %3: %2-%4 + (%3>>1-%5)
    ; %2: %2-%4 - (%3>>1-%5)
%else
%ifidn %1, w
    SUMSUBD2_AB %1, %3, %5, [%6], [%6+16]
%else
    SUMSUBD2_AB %1, %3, %5, [%6], [%6+32]
%endif
    SUMSUB_BA   %1, %4, %2
    SUMSUB_BADC %1, %5, %4, %3, %2
%endif
    SWAP %2, %5, %4
    ; %2: %2+%4 + (%3+%5>>1) row0
    ; %3: %2-%4 + (%3>>1-%5) row1
    ; %4: %2-%4 - (%3>>1-%5) row2
    ; %5: %2+%4 - (%3+%5>>1) row3
%endmacro


%macro LOAD_DIFF 5
%ifdef HIGH_BIT_DEPTH
    mova       %1, %4
    psubw      %1, %5
%elifidn %3, none
    movh       %1, %4
    movh       %2, %5
    punpcklbw  %1, %2
    punpcklbw  %2, %2
    psubw      %1, %2
%else
    movh       %1, %4
    punpcklbw  %1, %3
    movh       %2, %5
    punpcklbw  %2, %3
    psubw      %1, %2
%endif
%endmacro
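
; Illustrative effect (8-bit case, %3 = a zero register): %1 receives the
; pixels of %4 minus the pixels of %5, widened to signed words; the "none"
; variant gets the same result by interleaving %4 with %5 and %5 with
; itself, so the word subtraction leaves enc-dec per lane.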

%macro LOAD_DIFF8x4 8 ; 4x dst, 1x tmp, 1x mul, 2x ptr
%if cpuflag(ssse3)
    movh       m%2, [%8+%1*FDEC_STRIDE]
    movh       m%1, [%7+%1*FENC_STRIDE]
    punpcklbw  m%1, m%2
    movh       m%3, [%8+%2*FDEC_STRIDE]
    movh       m%2, [%7+%2*FENC_STRIDE]
    punpcklbw  m%2, m%3
    movh       m%4, [%8+%3*FDEC_STRIDE]
    movh       m%3, [%7+%3*FENC_STRIDE]
    punpcklbw  m%3, m%4
    movh       m%5, [%8+%4*FDEC_STRIDE]
    movh       m%4, [%7+%4*FENC_STRIDE]
    punpcklbw  m%4, m%5
    pmaddubsw  m%1, m%6
    pmaddubsw  m%2, m%6
    pmaddubsw  m%3, m%6
    pmaddubsw  m%4, m%6
%else
    LOAD_DIFF  m%1, m%5, m%6, [%7+%1*FENC_STRIDE], [%8+%1*FDEC_STRIDE]
    LOAD_DIFF  m%2, m%5, m%6, [%7+%2*FENC_STRIDE], [%8+%2*FDEC_STRIDE]
    LOAD_DIFF  m%3, m%5, m%6, [%7+%3*FENC_STRIDE], [%8+%3*FDEC_STRIDE]
    LOAD_DIFF  m%4, m%5, m%6, [%7+%4*FENC_STRIDE], [%8+%4*FDEC_STRIDE]
%endif
%endmacro

%macro STORE_DCT 6
    movq   [%5+%6+ 0], m%1
    movq   [%5+%6+ 8], m%2
    movq   [%5+%6+16], m%3
    movq   [%5+%6+24], m%4
    movhps [%5+%6+32], m%1
    movhps [%5+%6+40], m%2
    movhps [%5+%6+48], m%3
    movhps [%5+%6+56], m%4
%endmacro

%macro STORE_IDCT 4
    movhps [r0-4*FDEC_STRIDE], %1
    movh   [r0-3*FDEC_STRIDE], %1
    movhps [r0-2*FDEC_STRIDE], %2
    movh   [r0-1*FDEC_STRIDE], %2
    movhps [r0+0*FDEC_STRIDE], %3
    movh   [r0+1*FDEC_STRIDE], %3
    movhps [r0+2*FDEC_STRIDE], %4
    movh   [r0+3*FDEC_STRIDE], %4
%endmacro

%macro LOAD_DIFF_8x4P 7-10 r0,r2,0 ; 4x dest, 2x temp, 2x pointer, increment?
    LOAD_DIFF m%1, m%5, m%7, [%8],      [%9]
    LOAD_DIFF m%2, m%6, m%7, [%8+r1],   [%9+r3]
    LOAD_DIFF m%3, m%5, m%7, [%8+2*r1], [%9+2*r3]
    LOAD_DIFF m%4, m%6, m%7, [%8+r4],   [%9+r5]
%if %10
    lea %8, [%8+4*r1]
    lea %9, [%9+4*r3]
%endif
%endmacro

%macro DIFFx2 6-7
    movh       %3, %5
    punpcklbw  %3, %4
    psraw      %1, 6
    paddsw     %1, %3
    movh       %3, %6
    punpcklbw  %3, %4
    psraw      %2, 6
    paddsw     %2, %3
    packuswb   %2, %1
%endmacro

%macro STORE_DIFF 4
    movh       %2, %4
    punpcklbw  %2, %3
    psraw      %1, 6
    paddsw     %1, %2
    packuswb   %1, %1
    movh       %4, %1
%endmacro

%macro SHUFFLE_MASK_W 8
    %rep 8
        db %1*2
        db %1*2+1
        %rotate 1
    %endrep
%endmacro
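
; Illustrative value: "SHUFFLE_MASK_W 0,1,2,3,4,5,6,7" emits bytes 0..15,
; the identity pshufb mask; permuting the arguments permutes words.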
x264-snapshot-20120103-2245-stable/common/x86/util.h
/*****************************************************************************
 * util.h: x86 inline asm
 *****************************************************************************
 * Copyright (C) 2008-2011 x264 project
 *
 * Authors: Jason Garrett-Glaser <darkshikari@gmail.com>
 *          Loren Merritt <lorenm@u.washington.edu>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#ifndef X264_X86_UTIL_H
#define X264_X86_UTIL_H

#ifdef __SSE__
#include <xmmintrin.h>

#undef M128_ZERO
#define M128_ZERO ((__m128){0,0,0,0})
#define x264_union128_t x264_union128_sse_t
typedef union { __m128 i; uint64_t a[2]; uint32_t b[4]; uint16_t c[8]; uint8_t d[16]; } MAY_ALIAS x264_union128_sse_t;
#if HAVE_VECTOREXT
typedef uint32_t v4si __attribute__((vector_size (16)));
#endif
#endif // __SSE__

#if HAVE_X86_INLINE_ASM && HAVE_MMX

#define x264_median_mv x264_median_mv_mmx2
static ALWAYS_INLINE void x264_median_mv_mmx2( int16_t *dst, int16_t *a, int16_t *b, int16_t *c )
{
    asm(
        "movd   %1,    %%mm0 \n"
        "movd   %2,    %%mm1 \n"
        "movq   %%mm0, %%mm3 \n"
        "movd   %3,    %%mm2 \n"
        "pmaxsw %%mm1, %%mm0 \n"
        "pminsw %%mm3, %%mm1 \n"
        "pminsw %%mm2, %%mm0 \n"
        "pmaxsw %%mm1, %%mm0 \n"
        "movd   %%mm0, %0    \n"
        :"=m"(*(x264_union32_t*)dst)
        :"m"(M32( a )), "m"(M32( b )), "m"(M32( c ))
    );
}
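
/* Illustrative identity (a sketch of what the min/max sequence above does):
 * per 16-bit lane, mm0 ends up holding
 *     median(a,b,c) = max( min( max(a,b), c ), min(a,b) )
 * so both MV components are medianed in one MMX operation sequence. */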

#define x264_predictor_difference x264_predictor_difference_mmx2
static ALWAYS_INLINE int x264_predictor_difference_mmx2( int16_t (*mvc)[2], intptr_t i_mvc )
{
    int sum;
    static const uint64_t pw_1 = 0x0001000100010001ULL;

    asm(
        "pxor    %%mm4, %%mm4 \n"
        "test    $1, %1       \n"
        "jnz 3f               \n"
        "movd    -8(%2,%1,4), %%mm0 \n"
        "movd    -4(%2,%1,4), %%mm3 \n"
        "psubw   %%mm3, %%mm0 \n"
        "jmp 2f               \n"
        "3:                   \n"
        "dec     %1           \n"
        "1:                   \n"
        "movq    -8(%2,%1,4), %%mm0 \n"
        "psubw   -4(%2,%1,4), %%mm0 \n"
        "2:                   \n"
        "sub     $2,    %1    \n"
        "pxor    %%mm2, %%mm2 \n"
        "psubw   %%mm0, %%mm2 \n"
        "pmaxsw  %%mm2, %%mm0 \n"
        "paddusw %%mm0, %%mm4 \n"
        "jg 1b                \n"
        "pmaddwd %4, %%mm4    \n"
        "pshufw $14, %%mm4, %%mm0 \n"
        "paddd   %%mm0, %%mm4 \n"
        "movd    %%mm4, %0    \n"
        :"=r"(sum), "+r"(i_mvc)
        :"r"(mvc), "m"(M64( mvc )), "m"(pw_1)
    );
    return sum;
}

#define x264_cabac_mvd_sum x264_cabac_mvd_sum_mmx2
static ALWAYS_INLINE uint16_t x264_cabac_mvd_sum_mmx2(uint8_t *mvdleft, uint8_t *mvdtop)
{
    static const uint64_t pb_2    = 0x0202020202020202ULL;
    static const uint64_t pb_32   = 0x2020202020202020ULL;
    static const uint64_t pb_33   = 0x2121212121212121ULL;
    int amvd;
    asm(
        "movd         %1, %%mm0 \n"
        "movd         %2, %%mm1 \n"
        "paddusb   %%mm1, %%mm0 \n"
        "pminub       %5, %%mm0 \n"
        "pxor      %%mm2, %%mm2 \n"
        "movq      %%mm0, %%mm1 \n"
        "pcmpgtb      %3, %%mm0 \n"
        "pcmpgtb      %4, %%mm1 \n"
        "psubb     %%mm0, %%mm2 \n"
        "psubb     %%mm1, %%mm2 \n"
        "movd      %%mm2, %0    \n"
        :"=r"(amvd)
        :"m"(M16( mvdleft )),"m"(M16( mvdtop )),
         "m"(pb_2),"m"(pb_32),"m"(pb_33)
    );
    return amvd;
}

#define x264_predictor_roundclip x264_predictor_roundclip_mmx2
static void ALWAYS_INLINE x264_predictor_roundclip_mmx2( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int mv_x_min, int mv_x_max, int mv_y_min, int mv_y_max )
{
    uint32_t mv_min = pack16to32_mask( mv_x_min, mv_y_min );
    uint32_t mv_max = pack16to32_mask( mv_x_max, mv_y_max );
    static const uint64_t pw_2 = 0x0002000200020002ULL;
    intptr_t i = i_mvc;
    asm(
        "movd    %2, %%mm5       \n"
        "movd    %3, %%mm6       \n"
        "movq    %4, %%mm7       \n"
        "punpckldq %%mm5, %%mm5  \n"
        "punpckldq %%mm6, %%mm6  \n"
        "test $1, %0             \n"
        "jz 1f                   \n"
        "movd -4(%6,%0,4), %%mm0 \n"
        "paddw %%mm7, %%mm0      \n"
        "psraw $2, %%mm0         \n"
        "pmaxsw %%mm5, %%mm0     \n"
        "pminsw %%mm6, %%mm0     \n"
        "movd %%mm0, -4(%5,%0,4) \n"
        "dec %0                  \n"
        "jz 2f                   \n"
        "1:                      \n"
        "movq -8(%6,%0,4), %%mm0 \n"
        "paddw %%mm7, %%mm0      \n"
        "psraw $2, %%mm0         \n"
        "pmaxsw %%mm5, %%mm0     \n"
        "pminsw %%mm6, %%mm0     \n"
        "movq %%mm0, -8(%5,%0,4) \n"
        "sub $2, %0              \n"
        "jnz 1b                  \n"
        "2:                      \n"
        :"+r"(i), "=m"(M64( dst ))
        :"g"(mv_min), "g"(mv_max), "m"(pw_2), "r"(dst), "r"(mvc), "m"(M64( mvc ))
    );
}

#endif

#endif
x264-snapshot-20120103-2245-stable/common/x86/sad16-a.asm
;*****************************************************************************
;* sad16-a.asm: x86 high depth sad functions
;*****************************************************************************
;* Copyright (C) 2010-2011 x264 project
;*
;* Authors: Oskar Arvidsson <oskar@irock.se>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION .text

cextern pw_1

;=============================================================================
; SAD MMX
;=============================================================================

%macro SAD_INC_1x16P_MMX 0
    movu    m1, [r0+ 0]
    movu    m2, [r0+ 8]
    movu    m3, [r0+16]
    movu    m4, [r0+24]
    psubw   m1, [r2+ 0]
    psubw   m2, [r2+ 8]
    psubw   m3, [r2+16]
    psubw   m4, [r2+24]
    ABSW2   m1, m2, m1, m2, m5, m6
    ABSW2   m3, m4, m3, m4, m7, m5
    lea     r0, [r0+2*r1]
    lea     r2, [r2+2*r3]
    paddw   m1, m2
    paddw   m3, m4
    paddw   m0, m1
    paddw   m0, m3
%endmacro

%macro SAD_INC_2x8P_MMX 0
    movu    m1, [r0+0]
    movu    m2, [r0+8]
    movu    m3, [r0+2*r1+0]
    movu    m4, [r0+2*r1+8]
    psubw   m1, [r2+0]
    psubw   m2, [r2+8]
    psubw   m3, [r2+2*r3+0]
    psubw   m4, [r2+2*r3+8]
    ABSW2   m1, m2, m1, m2, m5, m6
    ABSW2   m3, m4, m3, m4, m7, m5
    lea     r0, [r0+4*r1]
    lea     r2, [r2+4*r3]
    paddw   m1, m2
    paddw   m3, m4
    paddw   m0, m1
    paddw   m0, m3
%endmacro

%macro SAD_INC_2x4P_MMX 0
    movu    m1, [r0]
    movu    m2, [r0+2*r1]
    psubw   m1, [r2]
    psubw   m2, [r2+2*r3]
    ABSW2   m1, m2, m1, m2, m3, m4
    lea     r0, [r0+4*r1]
    lea     r2, [r2+4*r3]
    paddw   m0, m1
    paddw   m0, m2
%endmacro

;-----------------------------------------------------------------------------
; int pixel_sad_NxM( uint16_t *, int, uint16_t *, int )
;-----------------------------------------------------------------------------
%macro SAD_MMX 3
cglobal pixel_sad_%1x%2, 4,4
    pxor    m0, m0
%rep %2/%3
    SAD_INC_%3x%1P_MMX
%endrep
%if %1*%2 == 256
    HADDUW  m0, m1
%else
    HADDW   m0, m1
%endif
    movd   eax, m0
    RET
%endmacro
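
; Illustrative bound (assuming 10-bit HIGH_BIT_DEPTH input): in the 16x16
; case each word lane of m0 can accumulate 64 absolute differences of up to
; 1023, i.e. up to 65472, which exceeds the signed-word range assumed by
; HADDW's pmaddwd; hence the unsigned HADDUW reduction when %1*%2 == 256.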

INIT_MMX mmx2
SAD_MMX 16, 16, 1
SAD_MMX 16,  8, 1
SAD_MMX  8, 16, 2
SAD_MMX  8,  8, 2
SAD_MMX  8,  4, 2
SAD_MMX  4,  8, 2
SAD_MMX  4,  4, 2
INIT_MMX ssse3
SAD_MMX  4,  8, 2
SAD_MMX  4,  4, 2

;=============================================================================
; SAD XMM
;=============================================================================

%macro SAD_INC_2x16P_XMM 0
    movu    m1, [r2+ 0]
    movu    m2, [r2+16]
    movu    m3, [r2+2*r3+ 0]
    movu    m4, [r2+2*r3+16]
    psubw   m1, [r0+ 0]
    psubw   m2, [r0+16]
    psubw   m3, [r0+2*r1+ 0]
    psubw   m4, [r0+2*r1+16]
    ABSW2   m1, m2, m1, m2, m5, m6
    lea     r0, [r0+4*r1]
    lea     r2, [r2+4*r3]
    ABSW2   m3, m4, m3, m4, m7, m5
    paddw   m1, m2
    paddw   m3, m4
    paddw   m0, m1
    paddw   m0, m3
%endmacro

%macro SAD_INC_2x8P_XMM 0
    movu    m1, [r2]
    movu    m2, [r2+2*r3]
    psubw   m1, [r0]
    psubw   m2, [r0+2*r1]
    ABSW2   m1, m2, m1, m2, m3, m4
    lea     r0, [r0+4*r1]
    lea     r2, [r2+4*r3]
    paddw   m0, m1
    paddw   m0, m2
%endmacro

;-----------------------------------------------------------------------------
; int pixel_sad_NxM( uint16_t *, int, uint16_t *, int )
;-----------------------------------------------------------------------------
%macro SAD_XMM 2
cglobal pixel_sad_%1x%2, 4,4,8
    pxor    m0, m0
%rep %2/2
    SAD_INC_2x%1P_XMM
%endrep
    HADDW   m0, m1
    movd   eax, m0
    RET
%endmacro

INIT_XMM sse2
SAD_XMM 16, 16
SAD_XMM 16,  8
SAD_XMM  8, 16
SAD_XMM  8,  8
SAD_XMM  8,  4
INIT_XMM sse2, aligned
SAD_XMM 16, 16
SAD_XMM 16,  8
SAD_XMM  8, 16
SAD_XMM  8,  8
INIT_XMM ssse3
SAD_XMM 16, 16
SAD_XMM 16,  8
SAD_XMM  8, 16
SAD_XMM  8,  8
SAD_XMM  8,  4
INIT_XMM ssse3, aligned
SAD_XMM 16, 16
SAD_XMM 16,  8
SAD_XMM  8, 16
SAD_XMM  8,  8

;=============================================================================
; SAD x3/x4
;=============================================================================

%macro SAD_X3_INC_P 0
    add     r0, 4*FENC_STRIDE
    lea     r1, [r1+4*r4]
    lea     r2, [r2+4*r4]
    lea     r3, [r3+4*r4]
%endmacro

%macro SAD_X3_ONE_START 0
    mova    m3, [r0]
    movu    m0, [r1]
    movu    m1, [r2]
    movu    m2, [r3]
    psubw   m0, m3
    psubw   m1, m3
    psubw   m2, m3
    ABSW2   m0, m1, m0, m1, m4, m5
    ABSW    m2, m2, m6
%endmacro

%macro SAD_X3_ONE 2
    mova    m6, [r0+%1]
    movu    m3, [r1+%2]
    movu    m4, [r2+%2]
    movu    m5, [r3+%2]
    psubw   m3, m6
    psubw   m4, m6
    psubw   m5, m6
    ABSW2   m3, m4, m3, m4, m7, m6
    ABSW    m5, m5, m6
    paddw   m0, m3
    paddw   m1, m4
    paddw   m2, m5
%endmacro

%macro SAD_X3_END 2
%if mmsize == 8 && %1*%2 == 256
    HADDUW   m0, m3
    HADDUW   m1, m4
    HADDUW   m2, m5
%else
    HADDW    m0, m3
    HADDW    m1, m4
    HADDW    m2, m5
%endif
%ifdef UNIX64
    movd [r5+0], m0
    movd [r5+4], m1
    movd [r5+8], m2
%else
    mov      r0, r5mp
    movd [r0+0], m0
    movd [r0+4], m1
    movd [r0+8], m2
%endif
    RET
%endmacro

%macro SAD_X4_INC_P 0
    add     r0, 4*FENC_STRIDE
    lea     r1, [r1+4*r5]
    lea     r2, [r2+4*r5]
    lea     r3, [r3+4*r5]
    lea     r4, [r4+4*r5]
%endmacro

%macro SAD_X4_ONE_START 0
    mova    m4, [r0]
    movu    m0, [r1]
    movu    m1, [r2]
    movu    m2, [r3]
    movu    m3, [r4]
    psubw   m0, m4
    psubw   m1, m4
    psubw   m2, m4
    psubw   m3, m4
    ABSW2   m0, m1, m0, m1, m5, m6
    ABSW2   m2, m3, m2, m3, m4, m7
%endmacro

%macro SAD_X4_ONE 2
    mova    m4, [r0+%1]
    movu    m5, [r1+%2]
    movu    m6, [r2+%2]
%if num_mmregs > 8
    movu    m7, [r3+%2]
    movu    m8, [r4+%2]
    psubw   m5, m4
    psubw   m6, m4
    psubw   m7, m4
    psubw   m8, m4
    ABSW2   m5, m6, m5, m6, m9, m10
    ABSW2   m7, m8, m7, m8, m9, m10
    paddw   m0, m5
    paddw   m1, m6
    paddw   m2, m7
    paddw   m3, m8
%elif cpuflag(ssse3)
    movu    m7, [r3+%2]
    psubw   m5, m4
    psubw   m6, m4
    psubw   m7, m4
    movu    m4, [r4+%2]
    pabsw   m5, m5
    psubw   m4, [r0+%1]
    pabsw   m6, m6
    pabsw   m7, m7
    pabsw   m4, m4
    paddw   m0, m5
    paddw   m1, m6
    paddw   m2, m7
    paddw   m3, m4
%else ; num_mmregs == 8 && !ssse3
    psubw   m5, m4
    psubw   m6, m4
    ABSW    m5, m5, m7
    ABSW    m6, m6, m7
    paddw   m0, m5
    paddw   m1, m6
    movu    m5, [r3+%2]
    movu    m6, [r4+%2]
    psubw   m5, m4
    psubw   m6, m4
    ABSW2   m5, m6, m5, m6, m7, m4
    paddw   m2, m5
    paddw   m3, m6
%endif
%endmacro

%macro SAD_X4_END 2
%if mmsize == 8 && %1*%2 == 256
    HADDUW    m0, m4
    HADDUW    m1, m5
    HADDUW    m2, m6
    HADDUW    m3, m7
%else
    HADDW     m0, m4
    HADDW     m1, m5
    HADDW     m2, m6
    HADDW     m3, m7
%endif
    mov       r0, r6mp
    movd [r0+ 0], m0
    movd [r0+ 4], m1
    movd [r0+ 8], m2
    movd [r0+12], m3
    RET
%endmacro

%macro SAD_X_2xNP 4
    %assign x %3
%rep %4
    SAD_X%1_ONE x*mmsize, x*mmsize
    SAD_X%1_ONE 2*FENC_STRIDE+x*mmsize, 2*%2+x*mmsize
    %assign x x+1
%endrep
%endmacro

;-----------------------------------------------------------------------------
; void pixel_sad_xK_MxN( uint16_t *fenc, uint16_t *pix0, uint16_t *pix1,
;                        uint16_t *pix2, int i_stride, int scores[3] )
;-----------------------------------------------------------------------------
%macro SAD_X 3
cglobal pixel_sad_x%1_%2x%3, 6,7,XMM_REGS
    %assign regnum %1+1
    %xdefine STRIDE r %+ regnum
%ifdef WIN64
    movsxd STRIDE, STRIDE %+ d
%endif
    mov     r6, %3/2-1
    SAD_X%1_ONE_START
    SAD_X%1_ONE 2*FENC_STRIDE, 2*STRIDE
    SAD_X_2xNP %1, STRIDE, 1, %2/(mmsize/2)-1
.loop:
    SAD_X%1_INC_P
    SAD_X_2xNP %1, STRIDE, 0, %2/(mmsize/2)
    dec     r6
    jg .loop
%if %1 == 4
    mov     r6, r6m
%endif
    SAD_X%1_END %2, %3
%endmacro

INIT_MMX mmx2
%define XMM_REGS 0
SAD_X 3, 16, 16
SAD_X 3, 16,  8
SAD_X 3,  8, 16
SAD_X 3,  8,  8
SAD_X 3,  8,  4
SAD_X 3,  4,  8
SAD_X 3,  4,  4
SAD_X 4, 16, 16
SAD_X 4, 16,  8
SAD_X 4,  8, 16
SAD_X 4,  8,  8
SAD_X 4,  8,  4
SAD_X 4,  4,  8
SAD_X 4,  4,  4
INIT_MMX ssse3
SAD_X 3,  4,  8
SAD_X 3,  4,  4
SAD_X 4,  4,  8
SAD_X 4,  4,  4
INIT_XMM ssse3
%define XMM_REGS 9
SAD_X 3, 16, 16
SAD_X 3, 16,  8
SAD_X 3,  8, 16
SAD_X 3,  8,  8
SAD_X 3,  8,  4
SAD_X 4, 16, 16
SAD_X 4, 16,  8
SAD_X 4,  8, 16
SAD_X 4,  8,  8
SAD_X 4,  8,  4
INIT_XMM sse2
%define XMM_REGS 11
SAD_X 3, 16, 16
SAD_X 3, 16,  8
SAD_X 3,  8, 16
SAD_X 3,  8,  8
SAD_X 3,  8,  4
SAD_X 4, 16, 16
SAD_X 4, 16,  8
SAD_X 4,  8, 16
SAD_X 4,  8,  8
SAD_X 4,  8,  4
x264-snapshot-20120103-2245-stable/common/x86/sad-a.asm
;*****************************************************************************
;* sad-a.asm: x86 sad functions
;*****************************************************************************
;* Copyright (C) 2003-2011 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Jason Garrett-Glaser <darkshikari@gmail.com>
;*          Laurent Aimar <fenrir@via.ecp.fr>
;*          Alex Izvorski <aizvorksi@gmail.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION .text

cextern pb_3
cextern pb_shuf8x8c
cextern pw_8
cextern sw_64

;=============================================================================
; SAD MMX
;=============================================================================

%macro SAD_INC_2x16P 0
    movq    mm1,    [r0]
    movq    mm2,    [r0+8]
    movq    mm3,    [r0+r1]
    movq    mm4,    [r0+r1+8]
    psadbw  mm1,    [r2]
    psadbw  mm2,    [r2+8]
    psadbw  mm3,    [r2+r3]
    psadbw  mm4,    [r2+r3+8]
    lea     r0,     [r0+2*r1]
    paddw   mm1,    mm2
    paddw   mm3,    mm4
    lea     r2,     [r2+2*r3]
    paddw   mm0,    mm1
    paddw   mm0,    mm3
%endmacro

%macro SAD_INC_2x8P 0
    movq    mm1,    [r0]
    movq    mm2,    [r0+r1]
    psadbw  mm1,    [r2]
    psadbw  mm2,    [r2+r3]
    lea     r0,     [r0+2*r1]
    paddw   mm0,    mm1
    paddw   mm0,    mm2
    lea     r2,     [r2+2*r3]
%endmacro

%macro SAD_INC_2x4P 0
    movd    mm1,    [r0]
    movd    mm2,    [r2]
    punpckldq mm1,  [r0+r1]
    punpckldq mm2,  [r2+r3]
    psadbw  mm1,    mm2
    paddw   mm0,    mm1
    lea     r0,     [r0+2*r1]
    lea     r2,     [r2+2*r3]
%endmacro

;-----------------------------------------------------------------------------
; int pixel_sad_16x16( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
%macro SAD 2
cglobal pixel_sad_%1x%2_mmx2, 4,4
    pxor    mm0, mm0
%rep %2/2
    SAD_INC_2x%1P
%endrep
    movd    eax, mm0
    RET
%endmacro

SAD 16, 16
SAD 16,  8
SAD  8, 16
SAD  8,  8
SAD  8,  4
SAD  4, 16
SAD  4,  8
SAD  4,  4



;=============================================================================
; SAD XMM
;=============================================================================

%macro SAD_END_SSE2 0
    movhlps m1, m0
    paddw   m0, m1
    movd   eax, m0
    RET
%endmacro

%macro SAD_W16 0
;-----------------------------------------------------------------------------
; int pixel_sad_16x16( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal pixel_sad_16x16, 4,4,8
    movu    m0, [r2]
    movu    m1, [r2+r3]
    lea     r2, [r2+2*r3]
    movu    m2, [r2]
    movu    m3, [r2+r3]
    lea     r2, [r2+2*r3]
    psadbw  m0, [r0]
    psadbw  m1, [r0+r1]
    lea     r0, [r0+2*r1]
    movu    m4, [r2]
    paddw   m0, m1
    psadbw  m2, [r0]
    psadbw  m3, [r0+r1]
    lea     r0, [r0+2*r1]
    movu    m5, [r2+r3]
    lea     r2, [r2+2*r3]
    paddw   m2, m3
    movu    m6, [r2]
    movu    m7, [r2+r3]
    lea     r2, [r2+2*r3]
    paddw   m0, m2
    psadbw  m4, [r0]
    psadbw  m5, [r0+r1]
    lea     r0, [r0+2*r1]
    movu    m1, [r2]
    paddw   m4, m5
    psadbw  m6, [r0]
    psadbw  m7, [r0+r1]
    lea     r0, [r0+2*r1]
    movu    m2, [r2+r3]
    lea     r2, [r2+2*r3]
    paddw   m6, m7
    movu    m3, [r2]
    paddw   m0, m4
    movu    m4, [r2+r3]
    lea     r2, [r2+2*r3]
    paddw   m0, m6
    psadbw  m1, [r0]
    psadbw  m2, [r0+r1]
    lea     r0, [r0+2*r1]
    movu    m5, [r2]
    paddw   m1, m2
    psadbw  m3, [r0]
    psadbw  m4, [r0+r1]
    lea     r0, [r0+2*r1]
    movu    m6, [r2+r3]
    lea     r2, [r2+2*r3]
    paddw   m3, m4
    movu    m7, [r2]
    paddw   m0, m1
    movu    m1, [r2+r3]
    paddw   m0, m3
    psadbw  m5, [r0]
    psadbw  m6, [r0+r1]
    lea     r0, [r0+2*r1]
    paddw   m5, m6
    psadbw  m7, [r0]
    psadbw  m1, [r0+r1]
    paddw   m7, m1
    paddw   m0, m5
    paddw   m0, m7
    SAD_END_SSE2

;-----------------------------------------------------------------------------
; int pixel_sad_16x8( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal pixel_sad_16x8, 4,4
    movu    m0, [r2]
    movu    m2, [r2+r3]
    lea     r2, [r2+2*r3]
    movu    m3, [r2]
    movu    m4, [r2+r3]
    psadbw  m0, [r0]
    psadbw  m2, [r0+r1]
    lea     r0, [r0+2*r1]
    psadbw  m3, [r0]
    psadbw  m4, [r0+r1]
    lea     r0, [r0+2*r1]
    lea     r2, [r2+2*r3]
    paddw   m0, m2
    paddw   m3, m4
    paddw   m0, m3
    movu    m1, [r2]
    movu    m2, [r2+r3]
    lea     r2, [r2+2*r3]
    movu    m3, [r2]
    movu    m4, [r2+r3]
    psadbw  m1, [r0]
    psadbw  m2, [r0+r1]
    lea     r0, [r0+2*r1]
    psadbw  m3, [r0]
    psadbw  m4, [r0+r1]
    lea     r0, [r0+2*r1]
    lea     r2, [r2+2*r3]
    paddw   m1, m2
    paddw   m3, m4
    paddw   m0, m1
    paddw   m0, m3
    SAD_END_SSE2
%endmacro

INIT_XMM sse2
SAD_W16
INIT_XMM sse3
SAD_W16
INIT_XMM sse2, aligned
SAD_W16

%macro SAD_INC_4x8P_SSE 1
    movq    m1, [r0]
    movq    m2, [r0+r1]
    lea     r0, [r0+2*r1]
    movq    m3, [r2]
    movq    m4, [r2+r3]
    lea     r2, [r2+2*r3]
    movhps  m1, [r0]
    movhps  m2, [r0+r1]
    movhps  m3, [r2]
    movhps  m4, [r2+r3]
    lea     r0, [r0+2*r1]
    psadbw  m1, m3
    psadbw  m2, m4
    lea     r2, [r2+2*r3]
%if %1
    paddw   m0, m1
%else
    SWAP     0, 1
%endif
    paddw   m0, m2
%endmacro

INIT_XMM
;Even on Nehalem, no sizes other than 8x16 benefit from this method.
cglobal pixel_sad_8x16_sse2, 4,4
    SAD_INC_4x8P_SSE 0
    SAD_INC_4x8P_SSE 1
    SAD_INC_4x8P_SSE 1
    SAD_INC_4x8P_SSE 1
    SAD_END_SSE2
    RET

;-----------------------------------------------------------------------------
; void pixel_vsad( pixel *src, int stride );
;-----------------------------------------------------------------------------

%ifndef ARCH_X86_64
INIT_MMX
cglobal pixel_vsad_mmx2, 3,3
    mova      m0, [r0]
    mova      m1, [r0+8]
    mova      m2, [r0+r1]
    mova      m3, [r0+r1+8]
    lea       r0, [r0+r1*2]
    psadbw    m0, m2
    psadbw    m1, m3
    paddw     m0, m1
    sub      r2d, 2
    je .end
.loop:
    mova      m4, [r0]
    mova      m5, [r0+8]
    mova      m6, [r0+r1]
    mova      m7, [r0+r1+8]
    lea       r0, [r0+r1*2]
    psadbw    m2, m4
    psadbw    m3, m5
    psadbw    m4, m6
    psadbw    m5, m7
    paddw     m0, m2
    paddw     m0, m3
    paddw     m0, m4
    paddw     m0, m5
    mova      m2, m6
    mova      m3, m7
    sub      r2d, 2
    jg .loop
.end:
    movd     eax, m0
    RET
%endif

INIT_XMM
cglobal pixel_vsad_sse2, 3,3
    mova      m0, [r0]
    mova      m1, [r0+r1]
    lea       r0, [r0+r1*2]
    psadbw    m0, m1
    sub      r2d, 2
    je .end
.loop:
    mova      m2, [r0]
    mova      m3, [r0+r1]
    lea       r0, [r0+r1*2]
    psadbw    m1, m2
    psadbw    m2, m3
    paddw     m0, m1
    paddw     m0, m2
    mova      m1, m3
    sub      r2d, 2
    jg .loop
.end:
    movhlps   m1, m0
    paddw     m0, m1
    movd     eax, m0
    RET

;-----------------------------------------------------------------------------
; void intra_sad_x3_4x4( uint8_t *fenc, uint8_t *fdec, int res[3] );
;-----------------------------------------------------------------------------
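; A C reference for this routine (a sketch; the result order V, H, DC
; matches the stores below, FENC_STRIDE/FDEC_STRIDE are the fixed strides
; x264 uses for its encode/decode scratch buffers, and the top and left
; neighbors are assumed to be available):
;
;   #include <stdint.h>
;   #include <stdlib.h>
;   #define FENC_STRIDE 16
;   #define FDEC_STRIDE 32
;   static void intra_sad_x3_4x4_ref( uint8_t *fenc, uint8_t *fdec, int res[3] )
;   {
;       uint8_t *top = &fdec[-FDEC_STRIDE];
;       int dc = 4; /* rounding term for (sum+4)>>3 */
;       for( int i = 0; i < 4; i++ )
;           dc += top[i] + fdec[i*FDEC_STRIDE-1];
;       dc >>= 3;
;       res[0] = res[1] = res[2] = 0;
;       for( int y = 0; y < 4; y++ )
;           for( int x = 0; x < 4; x++ )
;           {
;               int p = fenc[y*FENC_STRIDE+x];
;               res[0] += abs( p - top[x] );                /* V  */
;               res[1] += abs( p - fdec[y*FDEC_STRIDE-1] ); /* H  */
;               res[2] += abs( p - dc );                    /* DC */
;           }
;   }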

cglobal intra_sad_x3_4x4_mmx2, 3,3
    pxor      mm7, mm7
    movd      mm0, [r1-FDEC_STRIDE]
    movd      mm1, [r0+FENC_STRIDE*0]
    movd      mm2, [r0+FENC_STRIDE*2]
    punpckldq mm0, mm0
    punpckldq mm1, [r0+FENC_STRIDE*1]
    punpckldq mm2, [r0+FENC_STRIDE*3]
    movq      mm6, mm0
    movq      mm3, mm1
    psadbw    mm3, mm0
    psadbw    mm0, mm2
    paddw     mm0, mm3
    movd     [r2], mm0 ;V prediction cost
    movd      mm3, [r1+FDEC_STRIDE*0-4]
    movd      mm0, [r1+FDEC_STRIDE*1-4]
    movd      mm4, [r1+FDEC_STRIDE*2-4]
    movd      mm5, [r1+FDEC_STRIDE*3-4]
    punpcklbw mm3, mm0
    punpcklbw mm4, mm5
    movq      mm5, mm3
    punpckhwd mm5, mm4
    punpckhdq mm5, mm6
    psadbw    mm5, mm7
    punpckhbw mm3, mm3
    punpckhbw mm4, mm4
    punpckhwd mm3, mm3
    punpckhwd mm4, mm4
    psraw     mm5, 2
    pavgw     mm5, mm7
    punpcklbw mm5, mm5
    pshufw    mm5, mm5, 0 ;DC prediction
    movq      mm6, mm5
    psadbw    mm5, mm1
    psadbw    mm6, mm2
    psadbw    mm1, mm3
    psadbw    mm2, mm4
    paddw     mm5, mm6
    paddw     mm1, mm2
    movd   [r2+8], mm5 ;DC prediction cost
    movd   [r2+4], mm1 ;H prediction cost
    RET

;-----------------------------------------------------------------------------
; void intra_sad_x3_8x8( uint8_t *fenc, uint8_t edge[36], int res[3]);
;-----------------------------------------------------------------------------

;m0 = DC
;m6 = V
;m7 = H
;m1 = DC score
;m2 = V score
;m3 = H score
;m5 = pixel row
;m4 = temp

%macro INTRA_SAD_HVDC_ITER 2
    movq      m5, [r0+FENC_STRIDE*%1]
    movq      m4, m5
    psadbw    m4, m0
%if %1
    paddw     m1, m4
%else
    SWAP       1, 4
%endif
    movq      m4, m5
    psadbw    m4, m6
%if %1
    paddw     m2, m4
%else
    SWAP       2, 4
%endif
    pshufw    m4, m7, %2
    psadbw    m5, m4
%if %1
    paddw     m3, m5
%else
    SWAP       3, 5
%endif
%endmacro

INIT_MMX
cglobal intra_sad_x3_8x8_mmx2, 3,3
    movq      m7, [r1+7]
    pxor      m0, m0
    movq      m6, [r1+16]  ;V prediction
    pxor      m1, m1
    psadbw    m0, m7
    psadbw    m1, m6
    paddw     m0, m1
    paddw     m0, [pw_8]
    psrlw     m0, 4
    punpcklbw m0, m0
    pshufw    m0, m0, q0000 ;DC prediction
    punpckhbw m7, m7
    INTRA_SAD_HVDC_ITER 0, q3333
    INTRA_SAD_HVDC_ITER 1, q2222
    INTRA_SAD_HVDC_ITER 2, q1111
    INTRA_SAD_HVDC_ITER 3, q0000
    movq      m7, [r1+7]
    punpcklbw m7, m7
    INTRA_SAD_HVDC_ITER 4, q3333
    INTRA_SAD_HVDC_ITER 5, q2222
    INTRA_SAD_HVDC_ITER 6, q1111
    INTRA_SAD_HVDC_ITER 7, q0000
    movd  [r2+0], m2
    movd  [r2+4], m3
    movd  [r2+8], m1
    RET

;-----------------------------------------------------------------------------
; void intra_sad_x3_8x8c( uint8_t *fenc, uint8_t *fdec, int res[3] );
;-----------------------------------------------------------------------------

%macro INTRA_SAD_HV_ITER 1
%if cpuflag(ssse3)
    movd        m1, [r1 + FDEC_STRIDE*(%1-4) - 4]
    movd        m3, [r1 + FDEC_STRIDE*(%1-3) - 4]
    pshufb      m1, m7
    pshufb      m3, m7
%else
    movq        m1, [r1 + FDEC_STRIDE*(%1-4) - 8]
    movq        m3, [r1 + FDEC_STRIDE*(%1-3) - 8]
    punpckhbw   m1, m1
    punpckhbw   m3, m3
    pshufw      m1, m1, q3333
    pshufw      m3, m3, q3333
%endif
    movq        m4, [r0 + FENC_STRIDE*(%1+0)]
    movq        m5, [r0 + FENC_STRIDE*(%1+1)]
    psadbw      m1, m4
    psadbw      m3, m5
    psadbw      m4, m6
    psadbw      m5, m6
    paddw       m1, m3
    paddw       m4, m5
%if %1
    paddw       m0, m1
    paddw       m2, m4
%else
    SWAP 0,1
    SWAP 2,4
%endif
%endmacro

%macro INTRA_SAD_8x8C 0
cglobal intra_sad_x3_8x8c, 3,3
    movq        m6, [r1 - FDEC_STRIDE]
    add         r1, FDEC_STRIDE*4
%if cpuflag(ssse3)
    movq        m7, [pb_3]
%endif
    INTRA_SAD_HV_ITER 0
    INTRA_SAD_HV_ITER 2
    INTRA_SAD_HV_ITER 4
    INTRA_SAD_HV_ITER 6
    movd    [r2+4], m0
    movd    [r2+8], m2
    pxor        m7, m7
    movq        m2, [r1 + FDEC_STRIDE*-4 - 8]
    movq        m4, [r1 + FDEC_STRIDE*-2 - 8]
    movq        m3, [r1 + FDEC_STRIDE* 0 - 8]
    movq        m5, [r1 + FDEC_STRIDE* 2 - 8]
    punpckhbw   m2, [r1 + FDEC_STRIDE*-3 - 8]
    punpckhbw   m4, [r1 + FDEC_STRIDE*-1 - 8]
    punpckhbw   m3, [r1 + FDEC_STRIDE* 1 - 8]
    punpckhbw   m5, [r1 + FDEC_STRIDE* 3 - 8]
    punpckhbw   m2, m4
    punpckhbw   m3, m5
    psrlq       m2, 32
    psrlq       m3, 32
    psadbw      m2, m7 ; s2
    psadbw      m3, m7 ; s3
    movq        m1, m6
    SWAP        0, 6
    punpckldq   m0, m7
    punpckhdq   m1, m7
    psadbw      m0, m7 ; s0
    psadbw      m1, m7 ; s1
    punpcklwd   m0, m1
    punpcklwd   m2, m3
    punpckldq   m0, m2 ;s0 s1 s2 s3
    pshufw      m3, m0, q3312 ;s2,s1,s3,s3
    pshufw      m0, m0, q1310 ;s0,s1,s3,s1
    paddw       m0, m3
    psrlw       m0, 2
    pavgw       m0, m7 ; s0+s2, s1, s3, s1+s3
%if cpuflag(ssse3)
    movq2dq   xmm0, m0
    pshufb    xmm0, [pb_shuf8x8c]
    movq      xmm1, [r0+FENC_STRIDE*0]
    movq      xmm2, [r0+FENC_STRIDE*1]
    movq      xmm3, [r0+FENC_STRIDE*2]
    movq      xmm4, [r0+FENC_STRIDE*3]
    movhps    xmm1, [r0+FENC_STRIDE*4]
    movhps    xmm2, [r0+FENC_STRIDE*5]
    movhps    xmm3, [r0+FENC_STRIDE*6]
    movhps    xmm4, [r0+FENC_STRIDE*7]
    psadbw    xmm1, xmm0
    psadbw    xmm2, xmm0
    psadbw    xmm3, xmm0
    psadbw    xmm4, xmm0
    paddw     xmm1, xmm2
    paddw     xmm1, xmm3
    paddw     xmm1, xmm4
    movhlps   xmm0, xmm1
    paddw     xmm1, xmm0
    movd      [r2], xmm1
%else
    packuswb    m0, m0
    punpcklbw   m0, m0
    movq        m1, m0
    punpcklbw   m0, m0 ; 4x dc0 4x dc1
    punpckhbw   m1, m1 ; 4x dc2 4x dc3
    movq        m2, [r0+FENC_STRIDE*0]
    movq        m3, [r0+FENC_STRIDE*1]
    movq        m4, [r0+FENC_STRIDE*2]
    movq        m5, [r0+FENC_STRIDE*3]
    movq        m6, [r0+FENC_STRIDE*4]
    movq        m7, [r0+FENC_STRIDE*5]
    psadbw      m2, m0
    psadbw      m3, m0
    psadbw      m4, m0
    psadbw      m5, m0
    movq        m0, [r0+FENC_STRIDE*6]
    psadbw      m6, m1
    psadbw      m7, m1
    psadbw      m0, m1
    psadbw      m1, [r0+FENC_STRIDE*7]
    paddw       m2, m3
    paddw       m4, m5
    paddw       m6, m7
    paddw       m0, m1
    paddw       m2, m4
    paddw       m6, m0
    paddw       m2, m6
    movd      [r2], m2
%endif
    RET
%endmacro

INIT_MMX mmx2
INTRA_SAD_8x8C
INIT_MMX ssse3
INTRA_SAD_8x8C


;-----------------------------------------------------------------------------
; void intra_sad_x3_16x16( uint8_t *fenc, uint8_t *fdec, int res[3] );
;-----------------------------------------------------------------------------

;xmm7: DC prediction    xmm6: H prediction  xmm5: V prediction
;xmm4: DC pred score    xmm3: H pred score  xmm2: V pred score
%macro INTRA_SAD16 0
cglobal intra_sad_x3_16x16, 3,5,8
    pxor    mm0, mm0
    pxor    mm1, mm1
    psadbw  mm0, [r1-FDEC_STRIDE+0]
    psadbw  mm1, [r1-FDEC_STRIDE+8]
    paddw   mm0, mm1
    movd    r3d, mm0
%if cpuflag(ssse3)
    mova  m1, [pb_3]
%endif
%assign x 0
%rep 16
    movzx   r4d, byte [r1-1+FDEC_STRIDE*(x&3)]
%if (x&3)==3 && x!=15
    add      r1, FDEC_STRIDE*4
%endif
    add     r3d, r4d
%assign x x+1
%endrep
    sub      r1, FDEC_STRIDE*12
    add     r3d, 16
    shr     r3d, 5
    imul    r3d, 0x01010101
    movd    m7, r3d
    mova    m5, [r1-FDEC_STRIDE]
%if mmsize==16
    pshufd  m7, m7, 0
%else
    mova    m1, [r1-FDEC_STRIDE+8]
    punpckldq m7, m7
%endif
    pxor    m4, m4
    pxor    m3, m3
    pxor    m2, m2
    mov     r3d, 15*FENC_STRIDE
.vloop:
    SPLATB_LOAD m6, r1+r3*2-1, m1
    mova    m0, [r0+r3]
    psadbw  m0, m7
    paddw   m4, m0
    mova    m0, [r0+r3]
    psadbw  m0, m5
    paddw   m2, m0
%if mmsize==8
    mova    m0, [r0+r3]
    psadbw  m0, m6
    paddw   m3, m0
    mova    m0, [r0+r3+8]
    psadbw  m0, m7
    paddw   m4, m0
    mova    m0, [r0+r3+8]
    psadbw  m0, m1
    paddw   m2, m0
    psadbw  m6, [r0+r3+8]
    paddw   m3, m6
%else
    psadbw  m6, [r0+r3]
    paddw   m3, m6
%endif
    add     r3d, -FENC_STRIDE
    jge .vloop
%if mmsize==16
    pslldq  m3, 4
    por     m3, m2
    movhlps m1, m3
    paddw   m3, m1
    movq  [r2+0], m3
    movhlps m1, m4
    paddw   m4, m1
%else
    movd  [r2+0], m2
    movd  [r2+4], m3
%endif
    movd  [r2+8], m4
    RET
%endmacro

INIT_MMX mmx2
INTRA_SAD16
INIT_XMM sse2
INTRA_SAD16
INIT_XMM ssse3
INTRA_SAD16



;=============================================================================
; SAD x3/x4 MMX
;=============================================================================

%macro SAD_X3_START_1x8P 0
    movq    mm3,    [r0]
    movq    mm0,    [r1]
    movq    mm1,    [r2]
    movq    mm2,    [r3]
    psadbw  mm0,    mm3
    psadbw  mm1,    mm3
    psadbw  mm2,    mm3
%endmacro

%macro SAD_X3_1x8P 2
    movq    mm3,    [r0+%1]
    movq    mm4,    [r1+%2]
    movq    mm5,    [r2+%2]
    movq    mm6,    [r3+%2]
    psadbw  mm4,    mm3
    psadbw  mm5,    mm3
    psadbw  mm6,    mm3
    paddw   mm0,    mm4
    paddw   mm1,    mm5
    paddw   mm2,    mm6
%endmacro

%macro SAD_X3_START_2x4P 3
    movd      mm3,  [r0]
    movd      %1,   [r1]
    movd      %2,   [r2]
    movd      %3,   [r3]
    punpckldq mm3,  [r0+FENC_STRIDE]
    punpckldq %1,   [r1+r4]
    punpckldq %2,   [r2+r4]
    punpckldq %3,   [r3+r4]
    psadbw    %1,   mm3
    psadbw    %2,   mm3
    psadbw    %3,   mm3
%endmacro

%macro SAD_X3_2x16P 1
%if %1
    SAD_X3_START_1x8P
%else
    SAD_X3_1x8P 0, 0
%endif
    SAD_X3_1x8P 8, 8
    SAD_X3_1x8P FENC_STRIDE, r4
    SAD_X3_1x8P FENC_STRIDE+8, r4+8
    add     r0, 2*FENC_STRIDE
    lea     r1, [r1+2*r4]
    lea     r2, [r2+2*r4]
    lea     r3, [r3+2*r4]
%endmacro

%macro SAD_X3_2x8P 1
%if %1
    SAD_X3_START_1x8P
%else
    SAD_X3_1x8P 0, 0
%endif
    SAD_X3_1x8P FENC_STRIDE, r4
    add     r0, 2*FENC_STRIDE
    lea     r1, [r1+2*r4]
    lea     r2, [r2+2*r4]
    lea     r3, [r3+2*r4]
%endmacro

%macro SAD_X3_2x4P 1
%if %1
    SAD_X3_START_2x4P mm0, mm1, mm2
%else
    SAD_X3_START_2x4P mm4, mm5, mm6
    paddw     mm0,  mm4
    paddw     mm1,  mm5
    paddw     mm2,  mm6
%endif
    add     r0, 2*FENC_STRIDE
    lea     r1, [r1+2*r4]
    lea     r2, [r2+2*r4]
    lea     r3, [r3+2*r4]
%endmacro

%macro SAD_X4_START_1x8P 0
    movq    mm7,    [r0]
    movq    mm0,    [r1]
    movq    mm1,    [r2]
    movq    mm2,    [r3]
    movq    mm3,    [r4]
    psadbw  mm0,    mm7
    psadbw  mm1,    mm7
    psadbw  mm2,    mm7
    psadbw  mm3,    mm7
%endmacro

%macro SAD_X4_1x8P 2
    movq    mm7,    [r0+%1]
    movq    mm4,    [r1+%2]
    movq    mm5,    [r2+%2]
    movq    mm6,    [r3+%2]
    psadbw  mm4,    mm7
    psadbw  mm5,    mm7
    psadbw  mm6,    mm7
    psadbw  mm7,    [r4+%2]
    paddw   mm0,    mm4
    paddw   mm1,    mm5
    paddw   mm2,    mm6
    paddw   mm3,    mm7
%endmacro

%macro SAD_X4_START_2x4P 0
    movd      mm7,  [r0]
    movd      mm0,  [r1]
    movd      mm1,  [r2]
    movd      mm2,  [r3]
    movd      mm3,  [r4]
    punpckldq mm7,  [r0+FENC_STRIDE]
    punpckldq mm0,  [r1+r5]
    punpckldq mm1,  [r2+r5]
    punpckldq mm2,  [r3+r5]
    punpckldq mm3,  [r4+r5]
    psadbw    mm0,  mm7
    psadbw    mm1,  mm7
    psadbw    mm2,  mm7
    psadbw    mm3,  mm7
%endmacro

%macro SAD_X4_INC_2x4P 0
    movd      mm7,  [r0]
    movd      mm4,  [r1]
    movd      mm5,  [r2]
    punpckldq mm7,  [r0+FENC_STRIDE]
    punpckldq mm4,  [r1+r5]
    punpckldq mm5,  [r2+r5]
    psadbw    mm4,  mm7
    psadbw    mm5,  mm7
    paddw     mm0,  mm4
    paddw     mm1,  mm5
    movd      mm4,  [r3]
    movd      mm5,  [r4]
    punpckldq mm4,  [r3+r5]
    punpckldq mm5,  [r4+r5]
    psadbw    mm4,  mm7
    psadbw    mm5,  mm7
    paddw     mm2,  mm4
    paddw     mm3,  mm5
%endmacro

%macro SAD_X4_2x16P 1
%if %1
    SAD_X4_START_1x8P
%else
    SAD_X4_1x8P 0, 0
%endif
    SAD_X4_1x8P 8, 8
    SAD_X4_1x8P FENC_STRIDE, r5
    SAD_X4_1x8P FENC_STRIDE+8, r5+8
    add     r0, 2*FENC_STRIDE
    lea     r1, [r1+2*r5]
    lea     r2, [r2+2*r5]
    lea     r3, [r3+2*r5]
    lea     r4, [r4+2*r5]
%endmacro

%macro SAD_X4_2x8P 1
%if %1
    SAD_X4_START_1x8P
%else
    SAD_X4_1x8P 0, 0
%endif
    SAD_X4_1x8P FENC_STRIDE, r5
    add     r0, 2*FENC_STRIDE
    lea     r1, [r1+2*r5]
    lea     r2, [r2+2*r5]
    lea     r3, [r3+2*r5]
    lea     r4, [r4+2*r5]
%endmacro

%macro SAD_X4_2x4P 1
%if %1
    SAD_X4_START_2x4P
%else
    SAD_X4_INC_2x4P
%endif
    add     r0, 2*FENC_STRIDE
    lea     r1, [r1+2*r5]
    lea     r2, [r2+2*r5]
    lea     r3, [r3+2*r5]
    lea     r4, [r4+2*r5]
%endmacro

%macro SAD_X3_END 0
%ifdef UNIX64
    movd    [r5+0], mm0
    movd    [r5+4], mm1
    movd    [r5+8], mm2
%else
    mov     r0, r5mp
    movd    [r0+0], mm0
    movd    [r0+4], mm1
    movd    [r0+8], mm2
%endif
    RET
%endmacro

%macro SAD_X4_END 0
    mov     r0, r6mp
    movd    [r0+0], mm0
    movd    [r0+4], mm1
    movd    [r0+8], mm2
    movd    [r0+12], mm3
    RET
%endmacro

;-----------------------------------------------------------------------------
; void pixel_sad_x3_16x16( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
;                          uint8_t *pix2, int i_stride, int scores[3] )
;-----------------------------------------------------------------------------
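; In C terms the x3/x4 kernels compute (a sketch; the x4 variants simply
; take a fourth candidate pointer and a scores[4] array, and all candidates
; share one stride while fenc uses the constant FENC_STRIDE):
;
;   #include <stdint.h>
;   #include <stdlib.h>
;   #define FENC_STRIDE 16
;   static void sad_x3_ref( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
;                           uint8_t *pix2, intptr_t i_stride, int scores[3],
;                           int w, int h )
;   {
;       uint8_t *pix[3] = { pix0, pix1, pix2 };
;       for( int i = 0; i < 3; i++ )
;       {
;           scores[i] = 0;
;           for( int y = 0; y < h; y++ )
;               for( int x = 0; x < w; x++ )
;                   scores[i] += abs( fenc[y*FENC_STRIDE+x]
;                                   - pix[i][y*i_stride+x] );
;       }
;   }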
%macro SAD_X 3
cglobal pixel_sad_x%1_%2x%3_mmx2, %1+2, %1+2
%ifdef WIN64
    %assign i %1+1
    movsxd r %+ i, r %+ i %+ d
%endif
    SAD_X%1_2x%2P 1
%rep %3/2-1
    SAD_X%1_2x%2P 0
%endrep
    SAD_X%1_END
%endmacro

INIT_MMX
SAD_X 3, 16, 16
SAD_X 3, 16,  8
SAD_X 3,  8, 16
SAD_X 3,  8,  8
SAD_X 3,  8,  4
SAD_X 3,  4,  8
SAD_X 3,  4,  4
SAD_X 4, 16, 16
SAD_X 4, 16,  8
SAD_X 4,  8, 16
SAD_X 4,  8,  8
SAD_X 4,  8,  4
SAD_X 4,  4,  8
SAD_X 4,  4,  4



;=============================================================================
; SAD x3/x4 XMM
;=============================================================================

%macro SAD_X3_START_1x16P_SSE2 0
%if cpuflag(misalign)
    mova   xmm2, [r0]
    movu   xmm0, [r1]
    movu   xmm1, [r2]
    psadbw xmm0, xmm2
    psadbw xmm1, xmm2
    psadbw xmm2, [r3]
%else
    mova   xmm3, [r0]
    movu   xmm0, [r1]
    movu   xmm1, [r2]
    movu   xmm2, [r3]
    psadbw xmm0, xmm3
    psadbw xmm1, xmm3
    psadbw xmm2, xmm3
%endif
%endmacro

%macro SAD_X3_1x16P_SSE2 2
%if cpuflag(misalign)
    mova   xmm3, [r0+%1]
    movu   xmm4, [r1+%2]
    movu   xmm5, [r2+%2]
    psadbw xmm4, xmm3
    psadbw xmm5, xmm3
    psadbw xmm3, [r3+%2]
    paddw  xmm0, xmm4
    paddw  xmm1, xmm5
    paddw  xmm2, xmm3
%else
    mova   xmm3, [r0+%1]
    movu   xmm4, [r1+%2]
    movu   xmm5, [r2+%2]
    movu   xmm6, [r3+%2]
    psadbw xmm4, xmm3
    psadbw xmm5, xmm3
    psadbw xmm6, xmm3
    paddw  xmm0, xmm4
    paddw  xmm1, xmm5
    paddw  xmm2, xmm6
%endif
%endmacro

%macro SAD_X3_2x16P_SSE2 1
%if %1
    SAD_X3_START_1x16P_SSE2
%else
    SAD_X3_1x16P_SSE2 0, 0
%endif
    SAD_X3_1x16P_SSE2 FENC_STRIDE, r4
    add  r0, 2*FENC_STRIDE
    lea  r1, [r1+2*r4]
    lea  r2, [r2+2*r4]
    lea  r3, [r3+2*r4]
%endmacro

%macro SAD_X3_START_2x8P_SSE2 0
    movq    xmm7, [r0]
    movq    xmm0, [r1]
    movq    xmm1, [r2]
    movq    xmm2, [r3]
    movhps  xmm7, [r0+FENC_STRIDE]
    movhps  xmm0, [r1+r4]
    movhps  xmm1, [r2+r4]
    movhps  xmm2, [r3+r4]
    psadbw  xmm0, xmm7
    psadbw  xmm1, xmm7
    psadbw  xmm2, xmm7
%endmacro

%macro SAD_X3_2x8P_SSE2 0
    movq    xmm7, [r0]
    movq    xmm3, [r1]
    movq    xmm4, [r2]
    movq    xmm5, [r3]
    movhps  xmm7, [r0+FENC_STRIDE]
    movhps  xmm3, [r1+r4]
    movhps  xmm4, [r2+r4]
    movhps  xmm5, [r3+r4]
    psadbw  xmm3, xmm7
    psadbw  xmm4, xmm7
    psadbw  xmm5, xmm7
    paddw   xmm0, xmm3
    paddw   xmm1, xmm4
    paddw   xmm2, xmm5
%endmacro

%macro SAD_X4_START_2x8P_SSE2 0
    movq    xmm7, [r0]
    movq    xmm0, [r1]
    movq    xmm1, [r2]
    movq    xmm2, [r3]
    movq    xmm3, [r4]
    movhps  xmm7, [r0+FENC_STRIDE]
    movhps  xmm0, [r1+r5]
    movhps  xmm1, [r2+r5]
    movhps  xmm2, [r3+r5]
    movhps  xmm3, [r4+r5]
    psadbw  xmm0, xmm7
    psadbw  xmm1, xmm7
    psadbw  xmm2, xmm7
    psadbw  xmm3, xmm7
%endmacro

%macro SAD_X4_2x8P_SSE2 0
    movq    xmm7, [r0]
    movq    xmm4, [r1]
    movq    xmm5, [r2]
%ifdef ARCH_X86_64
    movq    xmm6, [r3]
    movq    xmm8, [r4]
    movhps  xmm7, [r0+FENC_STRIDE]
    movhps  xmm4, [r1+r5]
    movhps  xmm5, [r2+r5]
    movhps  xmm6, [r3+r5]
    movhps  xmm8, [r4+r5]
    psadbw  xmm4, xmm7
    psadbw  xmm5, xmm7
    psadbw  xmm6, xmm7
    psadbw  xmm8, xmm7
    paddw   xmm0, xmm4
    paddw   xmm1, xmm5
    paddw   xmm2, xmm6
    paddw   xmm3, xmm8
%else
    movhps  xmm7, [r0+FENC_STRIDE]
    movhps  xmm4, [r1+r5]
    movhps  xmm5, [r2+r5]
    psadbw  xmm4, xmm7
    psadbw  xmm5, xmm7
    paddw   xmm0, xmm4
    paddw   xmm1, xmm5
    movq    xmm6, [r3]
    movq    xmm4, [r4]
    movhps  xmm6, [r3+r5]
    movhps  xmm4, [r4+r5]
    psadbw  xmm6, xmm7
    psadbw  xmm4, xmm7
    paddw   xmm2, xmm6
    paddw   xmm3, xmm4
%endif
%endmacro

%macro SAD_X4_START_1x16P_SSE2 0
%if cpuflag(misalign)
    mova   xmm3, [r0]
    movu   xmm0, [r1]
    movu   xmm1, [r2]
    movu   xmm2, [r3]
    psadbw xmm0, xmm3
    psadbw xmm1, xmm3
    psadbw xmm2, xmm3
    psadbw xmm3, [r4]
%else
    mova   xmm7, [r0]
    movu   xmm0, [r1]
    movu   xmm1, [r2]
    movu   xmm2, [r3]
    movu   xmm3, [r4]
    psadbw xmm0, xmm7
    psadbw xmm1, xmm7
    psadbw xmm2, xmm7
    psadbw xmm3, xmm7
%endif
%endmacro

%macro SAD_X4_1x16P_SSE2 2
%if cpuflag(misalign)
    mova   xmm7, [r0+%1]
    movu   xmm4, [r1+%2]
    movu   xmm5, [r2+%2]
    movu   xmm6, [r3+%2]
    psadbw xmm4, xmm7
    psadbw xmm5, xmm7
    psadbw xmm6, xmm7
    psadbw xmm7, [r4+%2]
    paddw  xmm0, xmm4
    paddw  xmm1, xmm5
    paddw  xmm2, xmm6
    paddw  xmm3, xmm7
%else
    mova   xmm7, [r0+%1]
    movu   xmm4, [r1+%2]
    movu   xmm5, [r2+%2]
    movu   xmm6, [r3+%2]
%ifdef ARCH_X86_64
    movu   xmm8, [r4+%2]
    psadbw xmm4, xmm7
    psadbw xmm5, xmm7
    psadbw xmm6, xmm7
    psadbw xmm8, xmm7
    paddw  xmm0, xmm4
    paddw  xmm1, xmm5
    paddw  xmm2, xmm6
    paddw  xmm3, xmm8
%else
    psadbw xmm4, xmm7
    psadbw xmm5, xmm7
    paddw  xmm0, xmm4
    psadbw xmm6, xmm7
    movu   xmm4, [r4+%2]
    paddw  xmm1, xmm5
    psadbw xmm4, xmm7
    paddw  xmm2, xmm6
    paddw  xmm3, xmm4
%endif
%endif
%endmacro

%macro SAD_X4_2x16P_SSE2 1
%if %1
    SAD_X4_START_1x16P_SSE2
%else
    SAD_X4_1x16P_SSE2 0, 0
%endif
    SAD_X4_1x16P_SSE2 FENC_STRIDE, r5
    add  r0, 2*FENC_STRIDE
    lea  r1, [r1+2*r5]
    lea  r2, [r2+2*r5]
    lea  r3, [r3+2*r5]
    lea  r4, [r4+2*r5]
%endmacro

%macro SAD_X3_2x8P_SSE2 1
%if %1
    SAD_X3_START_2x8P_SSE2
%else
    SAD_X3_2x8P_SSE2
%endif
    add  r0, 2*FENC_STRIDE
    lea  r1, [r1+2*r4]
    lea  r2, [r2+2*r4]
    lea  r3, [r3+2*r4]
%endmacro

%macro SAD_X4_2x8P_SSE2 1
%if %1
    SAD_X4_START_2x8P_SSE2
%else
    SAD_X4_2x8P_SSE2
%endif
    add  r0, 2*FENC_STRIDE
    lea  r1, [r1+2*r5]
    lea  r2, [r2+2*r5]
    lea  r3, [r3+2*r5]
    lea  r4, [r4+2*r5]
%endmacro

%macro SAD_X3_END_SSE2 0
    movhlps xmm4, xmm0
    movhlps xmm5, xmm1
    movhlps xmm6, xmm2
    paddw   xmm0, xmm4
    paddw   xmm1, xmm5
    paddw   xmm2, xmm6
%ifdef UNIX64
    movd [r5+0], xmm0
    movd [r5+4], xmm1
    movd [r5+8], xmm2
%else
    mov      r0, r5mp
    movd [r0+0], xmm0
    movd [r0+4], xmm1
    movd [r0+8], xmm2
%endif
    RET
%endmacro

%macro SAD_X4_END_SSE2 0
    mov       r0, r6mp
    psllq   xmm1, 32
    psllq   xmm3, 32
    paddw   xmm0, xmm1
    paddw   xmm2, xmm3
    movhlps xmm1, xmm0
    movhlps xmm3, xmm2
    paddw   xmm0, xmm1
    paddw   xmm2, xmm3
    movq  [r0+0], xmm0
    movq  [r0+8], xmm2
    RET
%endmacro

;-----------------------------------------------------------------------------
; void pixel_sad_x3_16x16( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
;                          uint8_t *pix2, int i_stride, int scores[3] )
;-----------------------------------------------------------------------------
%macro SAD_X_SSE2 3
cglobal pixel_sad_x%1_%2x%3, 2+%1,2+%1,9
%ifdef WIN64
    %assign i %1+1
    movsxd r %+ i, r %+ i %+ d
%endif
    SAD_X%1_2x%2P_SSE2 1
%rep %3/2-1
    SAD_X%1_2x%2P_SSE2 0
%endrep
    SAD_X%1_END_SSE2
%endmacro

INIT_XMM sse2
SAD_X_SSE2 3, 16, 16
SAD_X_SSE2 3, 16,  8
SAD_X_SSE2 3,  8, 16
SAD_X_SSE2 3,  8,  8
SAD_X_SSE2 3,  8,  4
SAD_X_SSE2 4, 16, 16
SAD_X_SSE2 4, 16,  8
SAD_X_SSE2 4,  8, 16
SAD_X_SSE2 4,  8,  8
SAD_X_SSE2 4,  8,  4

INIT_XMM sse2, misalign
SAD_X_SSE2 3, 16, 16
SAD_X_SSE2 3, 16,  8
SAD_X_SSE2 4, 16, 16
SAD_X_SSE2 4, 16,  8

INIT_XMM sse3
SAD_X_SSE2 3, 16, 16
SAD_X_SSE2 3, 16,  8
SAD_X_SSE2 4, 16, 16
SAD_X_SSE2 4, 16,  8



;=============================================================================
; SAD cacheline split
;=============================================================================

; Core2 (Conroe) can load unaligned data just as quickly as aligned data...
; unless the unaligned data spans the border between 2 cachelines, in which
; case it's really slow. The exact numbers may differ, but all Intel cpus prior
; to Nehalem have a large penalty for cacheline splits.
; (8-byte alignment exactly halfway between two cachelines is ok though.)
; LDDQU was supposed to fix this, but it only works on Pentium 4.
; So in the split case we load aligned data and explicitly perform the
; alignment between registers, as on archs that have only aligned loads,
; complicated here by the fact that PALIGNR takes only an immediate, not
; a variable alignment.
; It is also possible to hoist the realignment to the macroblock level (keep
; 2 copies of the reference frame, offset by 32 bytes), but the extra memory
; needed for that method often makes it slower.

; sad 16x16 costs on Core2:
; good offsets: 49 cycles (50/64 of all mvs)
; cacheline split: 234 cycles (14/64 of all mvs. amortized: +40 cycles)
; page split: 3600 cycles (14/4096 of all mvs. amortized: +11.5 cycles)
; cache or page split with palignr: 57 cycles (amortized: +2 cycles)
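;
; The realignment itself reduces to a shift/or between two aligned loads.
; A scalar model of the mmx2 path (a sketch; s = 8*(offset&7) bits, and the
; split path is only entered when offset&7 != 0, so 64-s stays in range):
;
;   #include <stdint.h>
;   static uint64_t load_across_split( const uint64_t *aligned_ref, unsigned s )
;   {
;       /* little-endian: low bytes from the first aligned qword,
;        * high bytes from the second */
;       return (aligned_ref[0] >> s) | (aligned_ref[1] << (64 - s));
;   }
;
; MMX can take the shift count from a register, but SSE2's byte shifts
; (PSLLDQ/PSRLDQ) are immediate-only, hence the loops below are stamped out
; once per alignment and entered via a computed jump of offset*80 (sse2) or
; offset*64 (ssse3) bytes.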

; computed jump assumes this loop is exactly 80 bytes
%macro SAD16_CACHELINE_LOOP_SSE2 1 ; alignment
ALIGN 16
sad_w16_align%1_sse2:
    movdqa  xmm1, [r2+16]
    movdqa  xmm2, [r2+r3+16]
    movdqa  xmm3, [r2]
    movdqa  xmm4, [r2+r3]
    pslldq  xmm1, 16-%1
    pslldq  xmm2, 16-%1
    psrldq  xmm3, %1
    psrldq  xmm4, %1
    por     xmm1, xmm3
    por     xmm2, xmm4
    psadbw  xmm1, [r0]
    psadbw  xmm2, [r0+r1]
    paddw   xmm0, xmm1
    paddw   xmm0, xmm2
    lea     r0,   [r0+2*r1]
    lea     r2,   [r2+2*r3]
    dec     r4
    jg sad_w16_align%1_sse2
    ret
%endmacro

; computed jump assumes this loop is exactly 64 bytes
%macro SAD16_CACHELINE_LOOP_SSSE3 1 ; alignment
ALIGN 16
sad_w16_align%1_ssse3:
    movdqa  xmm1, [r2+16]
    movdqa  xmm2, [r2+r3+16]
    palignr xmm1, [r2], %1
    palignr xmm2, [r2+r3], %1
    psadbw  xmm1, [r0]
    psadbw  xmm2, [r0+r1]
    paddw   xmm0, xmm1
    paddw   xmm0, xmm2
    lea     r0,   [r0+2*r1]
    lea     r2,   [r2+2*r3]
    dec     r4
    jg sad_w16_align%1_ssse3
    ret
%endmacro

%macro SAD16_CACHELINE_FUNC 2 ; cpu, height
cglobal pixel_sad_16x%2_cache64_%1
    mov     eax, r2m
    and     eax, 0x37
    cmp     eax, 0x30
    jle pixel_sad_16x%2_sse2
    PROLOGUE 4,6
    mov     r4d, r2d
    and     r4d, 15
%ifidn %1, ssse3
    shl     r4d, 6  ; code size = 64
%else
    lea     r4, [r4*5]
    shl     r4d, 4  ; code size = 80
%endif
%define sad_w16_addr (sad_w16_align1_%1 + (sad_w16_align1_%1 - sad_w16_align2_%1))
%ifdef PIC
    lea     r5, [sad_w16_addr]
    add     r5, r4
%else
    lea     r5, [sad_w16_addr + r4]
%endif
    and     r2, ~15
    mov     r4d, %2/2
    pxor    xmm0, xmm0
    call    r5
    movhlps xmm1, xmm0
    paddw   xmm0, xmm1
    movd    eax,  xmm0
    RET
%endmacro

%macro SAD_CACHELINE_START_MMX2 4 ; width, height, iterations, cacheline
    mov    eax, r2m
    and    eax, 0x17|%1|(%4>>1)
    cmp    eax, 0x10|%1|(%4>>1)
    jle pixel_sad_%1x%2_mmx2
    and    eax, 7
    shl    eax, 3
    movd   mm6, [sw_64]
    movd   mm7, eax
    psubw  mm6, mm7
    PROLOGUE 4,5
    and    r2, ~7
    mov    r4d, %3
    pxor   mm0, mm0
%endmacro

%macro SAD16_CACHELINE_FUNC_MMX2 2 ; height, cacheline
cglobal pixel_sad_16x%1_cache%2_mmx2
    SAD_CACHELINE_START_MMX2 16, %1, %1, %2
.loop:
    movq   mm1, [r2]
    movq   mm2, [r2+8]
    movq   mm3, [r2+16]
    movq   mm4, mm2
    psrlq  mm1, mm7
    psllq  mm2, mm6
    psllq  mm3, mm6
    psrlq  mm4, mm7
    por    mm1, mm2
    por    mm3, mm4
    psadbw mm1, [r0]
    psadbw mm3, [r0+8]
    paddw  mm0, mm1
    paddw  mm0, mm3
    add    r2, r3
    add    r0, r1
    dec    r4
    jg .loop
    movd   eax, mm0
    RET
%endmacro

%macro SAD8_CACHELINE_FUNC_MMX2 2 ; height, cacheline
cglobal pixel_sad_8x%1_cache%2_mmx2
    SAD_CACHELINE_START_MMX2 8, %1, %1/2, %2
.loop:
    movq   mm1, [r2+8]
    movq   mm2, [r2+r3+8]
    movq   mm3, [r2]
    movq   mm4, [r2+r3]
    psllq  mm1, mm6
    psllq  mm2, mm6
    psrlq  mm3, mm7
    psrlq  mm4, mm7
    por    mm1, mm3
    por    mm2, mm4
    psadbw mm1, [r0]
    psadbw mm2, [r0+r1]
    paddw  mm0, mm1
    paddw  mm0, mm2
    lea    r2, [r2+2*r3]
    lea    r0, [r0+2*r1]
    dec    r4
    jg .loop
    movd   eax, mm0
    RET
%endmacro

; sad_x3/x4_cache64: check each mv.
; if they're all within a cacheline, use normal sad_x3/x4.
; otherwise, send them individually to sad_cache64.
%macro CHECK_SPLIT 3 ; pix, width, cacheline
    mov  eax, %1
    and  eax, 0x17|%2|(%3>>1)
    cmp  eax, 0x10|%2|(%3>>1)
    jg .split
%endmacro
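
; Written out in C, the predicate is (a sketch; width and cacheline are
; compile-time constants here, so mask and threshold fold into the
; immediates seen above; the dropped low mask bits encode the cheap
; 8-byte-aligned splits mentioned earlier):
;
;   #include <stdint.h>
;   static int needs_split_path( uintptr_t pix, int width, int cacheline )
;   {
;       unsigned mask      = 0x17 | width | (cacheline >> 1);
;       unsigned threshold = 0x10 | width | (cacheline >> 1);
;       return (pix & mask) > threshold;
;   }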

%macro SADX3_CACHELINE_FUNC 6 ; width, height, cacheline, normal_ver, split_ver, name
cglobal pixel_sad_x3_%1x%2_cache%3_%6
    CHECK_SPLIT r1m, %1, %3
    CHECK_SPLIT r2m, %1, %3
    CHECK_SPLIT r3m, %1, %3
    jmp pixel_sad_x3_%1x%2_%4
.split:
%ifdef ARCH_X86_64
    PROLOGUE 6,7
%ifdef WIN64
    movsxd r4, r4d
    sub  rsp, 8
%endif
    push r3
    push r2
    mov  r2, r1
    mov  r1, FENC_STRIDE
    mov  r3, r4
    mov  r10, r0
    mov  r11, r5
    call pixel_sad_%1x%2_cache%3_%5
    mov  [r11], eax
%ifdef WIN64
    mov  r2, [rsp]
%else
    pop  r2
%endif
    mov  r0, r10
    call pixel_sad_%1x%2_cache%3_%5
    mov  [r11+4], eax
%ifdef WIN64
    mov  r2, [rsp+8]
%else
    pop  r2
%endif
    mov  r0, r10
    call pixel_sad_%1x%2_cache%3_%5
    mov  [r11+8], eax
%ifdef WIN64
    add  rsp, 24
%endif
    RET
%else
    push edi
    mov  edi, [esp+28]
    push dword [esp+24]
    push dword [esp+16]
    push dword 16
    push dword [esp+20]
    call pixel_sad_%1x%2_cache%3_%5
    mov  ecx, [esp+32]
    mov  [edi], eax
    mov  [esp+8], ecx
    call pixel_sad_%1x%2_cache%3_%5
    mov  ecx, [esp+36]
    mov  [edi+4], eax
    mov  [esp+8], ecx
    call pixel_sad_%1x%2_cache%3_%5
    mov  [edi+8], eax
    add  esp, 16
    pop  edi
    ret
%endif
%endmacro

%macro SADX4_CACHELINE_FUNC 6 ; width, height, cacheline, normal_ver, split_ver, name
cglobal pixel_sad_x4_%1x%2_cache%3_%6
    CHECK_SPLIT r1m, %1, %3
    CHECK_SPLIT r2m, %1, %3
    CHECK_SPLIT r3m, %1, %3
    CHECK_SPLIT r4m, %1, %3
    jmp pixel_sad_x4_%1x%2_%4
.split:
%ifdef ARCH_X86_64
    PROLOGUE 6,7
    mov  r11,  r6mp
%ifdef WIN64
    movsxd r5, r5d
%endif
    push r4
    push r3
    push r2
    mov  r2, r1
    mov  r1, FENC_STRIDE
    mov  r3, r5
    mov  r10, r0
    call pixel_sad_%1x%2_cache%3_%5
    mov  [r11], eax
%ifdef WIN64
    mov  r2, [rsp]
%else
    pop  r2
%endif
    mov  r0, r10
    call pixel_sad_%1x%2_cache%3_%5
    mov  [r11+4], eax
%ifdef WIN64
    mov  r2, [rsp+8]
%else
    pop  r2
%endif
    mov  r0, r10
    call pixel_sad_%1x%2_cache%3_%5
    mov  [r11+8], eax
%ifdef WIN64
    mov  r2, [rsp+16]
%else
    pop  r2
%endif
    mov  r0, r10
    call pixel_sad_%1x%2_cache%3_%5
    mov  [r11+12], eax
%ifdef WIN64
    add  rsp, 24
%endif
    RET
%else
    push edi
    mov  edi, [esp+32]
    push dword [esp+28]
    push dword [esp+16]
    push dword 16
    push dword [esp+20]
    call pixel_sad_%1x%2_cache%3_%5
    mov  ecx, [esp+32]
    mov  [edi], eax
    mov  [esp+8], ecx
    call pixel_sad_%1x%2_cache%3_%5
    mov  ecx, [esp+36]
    mov  [edi+4], eax
    mov  [esp+8], ecx
    call pixel_sad_%1x%2_cache%3_%5
    mov  ecx, [esp+40]
    mov  [edi+8], eax
    mov  [esp+8], ecx
    call pixel_sad_%1x%2_cache%3_%5
    mov  [edi+12], eax
    add  esp, 16
    pop  edi
    ret
%endif
%endmacro

%macro SADX34_CACHELINE_FUNC 1+
    SADX3_CACHELINE_FUNC %1
    SADX4_CACHELINE_FUNC %1
%endmacro


; instantiate the aligned sads

INIT_MMX
%ifndef ARCH_X86_64
SAD16_CACHELINE_FUNC_MMX2  8, 32
SAD16_CACHELINE_FUNC_MMX2 16, 32
SAD8_CACHELINE_FUNC_MMX2   4, 32
SAD8_CACHELINE_FUNC_MMX2   8, 32
SAD8_CACHELINE_FUNC_MMX2  16, 32
SAD16_CACHELINE_FUNC_MMX2  8, 64
SAD16_CACHELINE_FUNC_MMX2 16, 64
%endif ; !ARCH_X86_64
SAD8_CACHELINE_FUNC_MMX2   4, 64
SAD8_CACHELINE_FUNC_MMX2   8, 64
SAD8_CACHELINE_FUNC_MMX2  16, 64

%ifndef ARCH_X86_64
SADX34_CACHELINE_FUNC 16, 16, 32, mmx2, mmx2, mmx2
SADX34_CACHELINE_FUNC 16,  8, 32, mmx2, mmx2, mmx2
SADX34_CACHELINE_FUNC  8, 16, 32, mmx2, mmx2, mmx2
SADX34_CACHELINE_FUNC  8,  8, 32, mmx2, mmx2, mmx2
SADX34_CACHELINE_FUNC 16, 16, 64, mmx2, mmx2, mmx2
SADX34_CACHELINE_FUNC 16,  8, 64, mmx2, mmx2, mmx2
%endif ; !ARCH_X86_64
SADX34_CACHELINE_FUNC  8, 16, 64, mmx2, mmx2, mmx2
SADX34_CACHELINE_FUNC  8,  8, 64, mmx2, mmx2, mmx2

%ifndef ARCH_X86_64
SAD16_CACHELINE_FUNC sse2, 8
SAD16_CACHELINE_FUNC sse2, 16
%assign i 1
%rep 15
SAD16_CACHELINE_LOOP_SSE2 i
%assign i i+1
%endrep
SADX34_CACHELINE_FUNC 16, 16, 64, sse2, sse2, sse2
SADX34_CACHELINE_FUNC 16,  8, 64, sse2, sse2, sse2
%endif ; !ARCH_X86_64
SADX34_CACHELINE_FUNC  8, 16, 64, sse2, mmx2, sse2

SAD16_CACHELINE_FUNC ssse3, 8
SAD16_CACHELINE_FUNC ssse3, 16
%assign i 1
%rep 15
SAD16_CACHELINE_LOOP_SSSE3 i
%assign i i+1
%endrep
SADX34_CACHELINE_FUNC 16, 16, 64, sse2, ssse3, ssse3
SADX34_CACHELINE_FUNC 16,  8, 64, sse2, ssse3, ssse3

x264-snapshot-20120103-2245-stable/common/x86/quant.h
/*****************************************************************************
 * quant.h: x86 quantization and level-run
 *****************************************************************************
 * Copyright (C) 2005-2011 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *          Jason Garrett-Glaser <darkshikari@gmail.com>
 *          Christian Heine <sennindemokrit@gmx.net>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#ifndef X264_I386_QUANT_H
#define X264_I386_QUANT_H

int x264_quant_2x2_dc_mmx2( dctcoef dct[4], int mf, int bias );
int x264_quant_4x4_dc_mmx2( dctcoef dct[16], int mf, int bias );
int x264_quant_4x4_mmx( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] );
int x264_quant_8x8_mmx( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] );
int x264_quant_2x2_dc_sse2( dctcoef dct[4], int mf, int bias );
int x264_quant_4x4_dc_sse2( dctcoef dct[16], int mf, int bias );
int x264_quant_4x4_sse2( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] );
int x264_quant_8x8_sse2( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] );
int x264_quant_2x2_dc_ssse3( dctcoef dct[4], int mf, int bias );
int x264_quant_4x4_dc_ssse3( dctcoef dct[16], int mf, int bias );
int x264_quant_4x4_ssse3( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] );
int x264_quant_8x8_ssse3( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] );
int x264_quant_2x2_dc_sse4( dctcoef dct[4], int mf, int bias );
int x264_quant_4x4_dc_sse4( dctcoef dct[16], int mf, int bias );
int x264_quant_4x4_sse4( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] );
int x264_quant_8x8_sse4( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] );
void x264_dequant_4x4_mmx( int16_t dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_4x4dc_mmx2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_8x8_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp );
void x264_dequant_4x4_sse2( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_4x4dc_sse2( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_8x8_sse2( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
void x264_dequant_4x4_avx( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_4x4dc_avx( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_8x8_avx( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
void x264_dequant_4x4_xop( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_4x4dc_xop( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_8x8_xop( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
void x264_dequant_4x4_flat16_mmx( int16_t dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_8x8_flat16_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp );
void x264_dequant_4x4_flat16_sse2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_8x8_flat16_sse2( int16_t dct[64], int dequant_mf[6][64], int i_qp );
int x264_optimize_chroma_2x2_dc_sse2( dctcoef dct[4], int dequant_mf );
int x264_optimize_chroma_2x2_dc_ssse3( dctcoef dct[4], int dequant_mf );
int x264_optimize_chroma_2x2_dc_sse4( dctcoef dct[4], int dequant_mf );
int x264_optimize_chroma_2x2_dc_avx( dctcoef dct[4], int dequant_mf );
void x264_denoise_dct_mmx  ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
void x264_denoise_dct_sse2 ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
void x264_denoise_dct_ssse3( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
void x264_denoise_dct_avx  ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
int x264_decimate_score15_mmx2( dctcoef *dct );
int x264_decimate_score15_sse2( dctcoef *dct );
int x264_decimate_score15_ssse3( dctcoef *dct );
int x264_decimate_score16_mmx2( dctcoef *dct );
int x264_decimate_score16_sse2( dctcoef *dct );
int x264_decimate_score16_ssse3( dctcoef *dct );
int x264_decimate_score15_mmx2_slowctz( dctcoef *dct );
int x264_decimate_score15_sse2_slowctz( dctcoef *dct );
int x264_decimate_score15_ssse3_slowctz( dctcoef *dct );
int x264_decimate_score16_mmx2_slowctz( dctcoef *dct );
int x264_decimate_score16_sse2_slowctz( dctcoef *dct );
int x264_decimate_score16_ssse3_slowctz( dctcoef *dct );
int x264_decimate_score64_mmx2( dctcoef *dct );
int x264_decimate_score64_sse2( dctcoef *dct );
int x264_decimate_score64_ssse3( dctcoef *dct );
int x264_coeff_last4_mmx2( dctcoef *dct );
int x264_coeff_last8_mmx2( dctcoef *dct );
int x264_coeff_last15_mmx2( dctcoef *dct );
int x264_coeff_last16_mmx2( dctcoef *dct );
int x264_coeff_last64_mmx2( dctcoef *dct );
int x264_coeff_last8_sse2( dctcoef *dct );
int x264_coeff_last15_sse2( dctcoef *dct );
int x264_coeff_last16_sse2( dctcoef *dct );
int x264_coeff_last64_sse2( dctcoef *dct );
int x264_coeff_last4_mmx2_lzcnt( dctcoef *dct );
int x264_coeff_last8_mmx2_lzcnt( dctcoef *dct );
int x264_coeff_last8_sse2_lzcnt( dctcoef *dct );
int x264_coeff_last15_sse2_lzcnt( dctcoef *dct );
int x264_coeff_last16_sse2_lzcnt( dctcoef *dct );
int x264_coeff_last64_sse2_lzcnt( dctcoef *dct );
int x264_coeff_level_run16_mmx2( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run16_sse2( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run16_sse2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run15_mmx2( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run15_sse2( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run15_sse2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run4_mmx2( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run4_mmx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run8_mmx2( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run8_mmx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run8_sse2( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run8_sse2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );

#endif
x264-snapshot-20120103-2245-stable/common/x86/quant-a.asm
;*****************************************************************************
;* quant-a.asm: x86 quantization and level-run
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Jason Garrett-Glaser <darkshikari@gmail.com>
;*          Christian Heine <sennindemokrit@gmx.net>
;*          Oskar Arvidsson <oskar@irock.se>
;*          Henrik Gramner <hengar-6@student.ltu.se>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA

%macro DQM4 3
    dw %1, %2, %1, %2, %2, %3, %2, %3
%endmacro
%macro DQM8 6
    dw %1, %4, %5, %4, %1, %4, %5, %4
    dw %4, %2, %6, %2, %4, %2, %6, %2
    dw %5, %6, %3, %6, %5, %6, %3, %6
    ; last line not used, just padding for power-of-2 stride
    times 8 dw 0
%endmacro

dequant4_scale:
    DQM4 10, 13, 16
    DQM4 11, 14, 18
    DQM4 13, 16, 20
    DQM4 14, 18, 23
    DQM4 16, 20, 25
    DQM4 18, 23, 29

dequant8_scale:
    DQM8 20, 18, 32, 19, 25, 24
    DQM8 22, 19, 35, 21, 28, 26
    DQM8 26, 23, 42, 24, 33, 31
    DQM8 28, 25, 45, 26, 35, 33
    DQM8 32, 28, 51, 30, 40, 38
    DQM8 36, 32, 58, 34, 46, 43

decimate_mask_table4:
    db  0,3,2,6,2,5,5,9,1,5,4,8,5,8,8,12,1,4,4,8,4,7,7,11,4,8,7,11,8,11,11,15,1,4
    db  3,7,4,7,7,11,3,7,6,10,7,10,10,14,4,7,7,11,7,10,10,14,7,11,10,14,11,14,14
    db 18,0,4,3,7,3,6,6,10,3,7,6,10,7,10,10,14,3,6,6,10,6,9,9,13,6,10,9,13,10,13
    db 13,17,4,7,6,10,7,10,10,14,6,10,9,13,10,13,13,17,7,10,10,14,10,13,13,17,10
    db 14,13,17,14,17,17,21,0,3,3,7,3,6,6,10,2,6,5,9,6,9,9,13,3,6,6,10,6,9,9,13
    db  6,10,9,13,10,13,13,17,3,6,5,9,6,9,9,13,5,9,8,12,9,12,12,16,6,9,9,13,9,12
    db 12,16,9,13,12,16,13,16,16,20,3,7,6,10,6,9,9,13,6,10,9,13,10,13,13,17,6,9
    db  9,13,9,12,12,16,9,13,12,16,13,16,16,20,7,10,9,13,10,13,13,17,9,13,12,16
    db 13,16,16,20,10,13,13,17,13,16,16,20,13,17,16,20,17,20,20,24

chroma_dc_dct_mask_mmx: dw 0, 0,-1,-1, 0, 0,-1,-1
chroma_dc_dmf_mask_mmx: dw 0, 0,-1,-1, 0,-1,-1, 0
chroma_dc_dct_mask:     dw 1, 1,-1,-1, 1, 1,-1,-1
chroma_dc_dmf_mask:     dw 1, 1,-1,-1, 1,-1,-1, 1

SECTION .text

cextern pb_1
cextern pw_1
cextern pd_1
cextern pb_01
cextern pd_1024

%macro QUANT_DC_START 0
    movd       m6, r1m     ; mf
    movd       m7, r2m     ; bias
%ifdef HIGH_BIT_DEPTH
    SPLATD     m6, m6
    SPLATD     m7, m7
%elif cpuflag(sse4) ; ssse3, but not faster on conroe
    movdqa     m5, [pb_01]
    pshufb     m6, m5
    pshufb     m7, m5
%else
    SPLATW     m6, m6
    SPLATW     m7, m7
%endif
%endmacro

%macro QUANT_END 0
%if cpuflag(sse4)
    xor      eax, eax
    ptest     m5, m5
    setne     al
%else ; !sse4
    xor      eax, eax
%ifdef ARCH_X86_64
%if mmsize == 16
    packsswb  m5, m5
%endif
    movq     rcx, m5
    test     rcx, rcx
%else
%if mmsize == 16
    pxor      m4, m4
    pcmpeqb   m5, m4
    pmovmskb ecx, m5
    cmp      ecx, (1<<mmsize)-1
%else
    packsswb  m5, m5
    movd     ecx, m5
    test     ecx, ecx
%endif
%endif
    setne     al
%endif ; cpuflag
%endmacro

%ifdef HIGH_BIT_DEPTH
%macro QUANT_ONE_DC 4
%if cpuflag(sse4)
    mova        m0, [%1]
    ABSD        m1, m0
    paddd       m1, %3
    pmulld      m1, %2
    psrad       m1, 16
    PSIGND      m1, m0
    mova      [%1], m1
%if %4
    por         m5, m1
%else
    SWAP         5, 1
%endif
%else ; !sse4
    mova        m0, [%1]
    ABSD        m1, m0
    paddd       m1, %3
    mova        m2, m1
    psrlq       m2, 32
    pmuludq     m1, %2
    pmuludq     m2, %2
    psllq       m2, 32
    paddd       m1, m2
    psrld       m1, 16
    PSIGND      m1, m0
    mova      [%1], m1
%if %4
    por         m5, m1
%else
    SWAP         5, 1
%endif
%endif ; cpuflag
%endmacro

%macro QUANT_TWO_DC 4
%if cpuflag(sse4)
    mova        m0, [%1]
    mova        m1, [%1+mmsize]
    ABSD        m2, m0
    ABSD        m3, m1
    paddd       m2, %3
    paddd       m3, %3
    pmulld      m2, %2
    pmulld      m3, %2
    psrad       m2, 16
    psrad       m3, 16
    PSIGND      m2, m0
    PSIGND      m3, m1
    mova      [%1], m2
    mova      [%1+mmsize], m3
%if %4
    por         m5, m2
%else
    SWAP         5, 2
%endif
    por         m5, m3
%else ; !sse4
    QUANT_ONE_DC %1, %2, %3, %4
    QUANT_ONE_DC %1+mmsize, %2, %3, %4+mmsize
%endif ; cpuflag
%endmacro

%macro QUANT_ONE_AC_MMX 4
    mova        m0, [%1]
    mova        m2, [%2]
    ABSD        m1, m0
    mova        m4, m2
    paddd       m1, [%3]
    mova        m3, m1
    psrlq       m4, 32
    psrlq       m3, 32
    pmuludq     m1, m2
    pmuludq     m3, m4
    psllq       m3, 32
    paddd       m1, m3
    psrad       m1, 16
    PSIGND      m1, m0
    mova      [%1], m1
%if %4
    por         m5, m1
%else
    SWAP         5, 1
%endif
%endmacro

%macro QUANT_TWO_AC 4
%if cpuflag(sse4)
    mova        m0, [%1]
    mova        m1, [%1+mmsize]
    ABSD        m2, m0
    ABSD        m3, m1
    paddd       m2, [%3]
    paddd       m3, [%3+mmsize]
    pmulld      m2, [%2]
    pmulld      m3, [%2+mmsize]
    psrad       m2, 16
    psrad       m3, 16
    PSIGND      m2, m0
    PSIGND      m3, m1
    mova      [%1], m2
    mova      [%1+mmsize], m3
%if %4
    por         m5, m2
%else
    SWAP         5,  2
%endif
    por         m5, m3
%else ; !sse4
    QUANT_ONE_AC_MMX %1, %2, %3, %4
    QUANT_ONE_AC_MMX %1+mmsize, %2+mmsize, %3+mmsize, %4+mmsize
%endif ; cpuflag
%endmacro

;-----------------------------------------------------------------------------
; int quant_MxN_dc( int32_t dct[M*N], int mf, int bias )
;-----------------------------------------------------------------------------
%macro QUANT_DC 2
cglobal quant_%1x%2_dc, 3,3,8
    QUANT_DC_START
%if %1*%2 <= mmsize/4
    QUANT_ONE_DC r0, m6, m7, 0
%else
%assign x 0
%rep %1*%2/(mmsize/2)
    QUANT_TWO_DC r0+x, m6, m7, x
%assign x x+mmsize*2
%endrep
%endif
    QUANT_END
    RET
%endmacro

;-----------------------------------------------------------------------------
; int quant_MxN( int32_t dct[M*N], uint32_t mf[M*N], uint32_t bias[M*N] )
;-----------------------------------------------------------------------------
%macro QUANT_AC 2
cglobal quant_%1x%2, 3,3,8
%assign x 0
%rep %1*%2/(mmsize/2)
    QUANT_TWO_AC r0+x, r1+x, r2+x, x
%assign x x+mmsize*2
%endrep
    QUANT_END
    RET
%endmacro

INIT_XMM sse2
QUANT_DC 2, 2
QUANT_DC 4, 4
QUANT_AC 4, 4
QUANT_AC 8, 8

INIT_XMM ssse3
QUANT_DC 2, 2
QUANT_DC 4, 4
QUANT_AC 4, 4
QUANT_AC 8, 8

INIT_XMM sse4
QUANT_DC 2, 2
QUANT_DC 4, 4
QUANT_AC 4, 4
QUANT_AC 8, 8

%endif ; HIGH_BIT_DEPTH

%ifndef HIGH_BIT_DEPTH
%macro QUANT_ONE 4
;;; %1      (m64)       dct[y][x]
;;; %2      (m64/mmx)   mf[y][x] or mf[0][0] (as uint16_t)
;;; %3      (m64/mmx)   bias[y][x] or bias[0][0] (as uint16_t)
    mova       m1, %1   ; load dct coeffs
    ABSW       m0, m1, sign
    paddusw    m0, %3   ; round
    pmulhuw    m0, %2   ; divide
    PSIGNW     m0, m1   ; restore sign
    mova       %1, m0   ; store
%if %4
    por        m5, m0
%else
    SWAP        5, 0
%endif
%endmacro
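
; A scalar model of QUANT_ONE (a sketch of the 8-bit-depth math: paddusw
; saturates the rounding add, and pmulhuw keeps the high 16 bits of the
; product, i.e. division by multiplying with a 16-bit reciprocal):
;
;   #include <stdint.h>
;   static int quant_one_coef( int16_t *coef, uint16_t mf, uint16_t bias )
;   {
;       int x = *coef;
;       unsigned t = (x < 0 ? -x : x) + bias; /* round */
;       if( t > 0xffff )
;           t = 0xffff;                       /* paddusw saturation */
;       int z = t * mf >> 16;                 /* divide */
;       *coef = x < 0 ? -z : z;               /* restore sign */
;       return z; /* or-ed across the block by QUANT_END into the
;                  * nonzero return flag */
;   }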

%macro QUANT_TWO 7
    mova       m1, %1
    mova       m3, %2
    ABSW       m0, m1, sign
    ABSW       m2, m3, sign
    paddusw    m0, %5
    paddusw    m2, %6
    pmulhuw    m0, %3
    pmulhuw    m2, %4
    PSIGNW     m0, m1
    PSIGNW     m2, m3
    mova       %1, m0
    mova       %2, m2
%if %7
    por        m5, m0
    por        m5, m2
%else
    SWAP        5,  0
    por        m5, m2
%endif
%endmacro

;-----------------------------------------------------------------------------
; int quant_4x4_dc( int16_t dct[16], int mf, int bias )
;-----------------------------------------------------------------------------
%macro QUANT_DC 2-3 0
cglobal %1, 1,1,%3
    QUANT_DC_START
%if %2==1
    QUANT_ONE [r0], m6, m7, 0
%else
%assign x 0
%rep %2/2
    QUANT_TWO [r0+x], [r0+x+mmsize], m6, m6, m7, m7, x
%assign x x+mmsize*2
%endrep
%endif
    QUANT_END
    RET
%endmacro

;-----------------------------------------------------------------------------
; int quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
;-----------------------------------------------------------------------------
%macro QUANT_AC 2
cglobal %1, 3,3
%assign x 0
%rep %2/2
    QUANT_TWO [r0+x], [r0+x+mmsize], [r1+x], [r1+x+mmsize], [r2+x], [r2+x+mmsize], x
%assign x x+mmsize*2
%endrep
    QUANT_END
    RET
%endmacro

INIT_MMX mmx2
QUANT_DC quant_2x2_dc, 1
%ifndef ARCH_X86_64 ; not needed because sse2 is faster
QUANT_DC quant_4x4_dc, 4
INIT_MMX mmx
QUANT_AC quant_4x4, 4
QUANT_AC quant_8x8, 16
%endif

INIT_XMM sse2
QUANT_DC quant_4x4_dc, 2, 8
QUANT_AC quant_4x4, 2
QUANT_AC quant_8x8, 8

INIT_XMM ssse3
QUANT_DC quant_4x4_dc, 2, 8
QUANT_AC quant_4x4, 2
QUANT_AC quant_8x8, 8

INIT_MMX ssse3
QUANT_DC quant_2x2_dc, 1

INIT_XMM sse4
;Not faster on Conroe, so only used in SSE4 versions
QUANT_DC quant_4x4_dc, 2, 8
QUANT_AC quant_4x4, 2
QUANT_AC quant_8x8, 8
%endif ; !HIGH_BIT_DEPTH



;=============================================================================
; dequant
;=============================================================================

%macro DEQUANT16_L 3
;;; %1      dct[y][x]
;;; %2,%3   dequant_mf[i_mf][y][x]
;;; m2      i_qbits
    mova     m0, %2
%ifdef HIGH_BIT_DEPTH
    pmaddwd  m0, %1
    pslld    m0, m2
%else
    packssdw m0, %3
    pmullw   m0, %1
    psllw    m0, m2
%endif
    mova     %1, m0
%endmacro

%macro DEQUANT32_R 3
;;; %1      dct[y][x]
;;; %2,%3   dequant_mf[i_mf][y][x]
;;; m2      -i_qbits
;;; m3      f
;;; m4      0
    mova      m0, %1
%ifdef HIGH_BIT_DEPTH
    pmadcswd   m0, m0, %2, m3
    psrad     m0, m2
%else
    punpckhwd m1, m0, m4
    punpcklwd m0, m4
    pmadcswd  m0, m0, %2, m3
    pmadcswd  m1, m1, %3, m3
    psrad     m0, m2
    psrad     m1, m2
    packssdw  m0, m1
%endif
    mova      %1, m0
%endmacro
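
; Both macros implement the same scalar rule (a sketch; qbits is i_qp/6
; minus the per-transform shift that DEQUANT_START subtracts, so it can go
; negative, in which case the result is rounded):
;
;   static int32_t dequant_coef( int32_t x, int mf, int qbits )
;   {
;       if( qbits >= 0 )
;           return (x * mf) << qbits;    /* DEQUANT16_L */
;       int f = 1 << (-qbits-1);         /* m3 */
;       return (x * mf + f) >> -qbits;   /* DEQUANT32_R */
;   }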

%macro DEQUANT_LOOP 3
%if 8*(%2-2*%3)
    mov t0d, 8*(%2-2*%3)
%%loop:
    %1 [r0+(t0     )*SIZEOF_PIXEL], [r1+t0*2      ], [r1+t0*2+ 8*%3]
    %1 [r0+(t0+8*%3)*SIZEOF_PIXEL], [r1+t0*2+16*%3], [r1+t0*2+24*%3]
    sub t0d, 16*%3
    jge %%loop
    REP_RET
%else
    %1 [r0+(8*%3)*SIZEOF_PIXEL], [r1+16*%3], [r1+24*%3]
    %1 [r0+(0   )*SIZEOF_PIXEL], [r1+0    ], [r1+ 8*%3]
    RET
%endif
%endmacro

%macro DEQUANT16_FLAT 2-5
    mova   m0, %1
    psllw  m0, m4
%assign i %0-2
%rep %0-1
%if i
    mova   m %+ i, [r0+%2]
    pmullw m %+ i, m0
%else
    pmullw m0, [r0+%2]
%endif
    mova   [r0+%2], m %+ i
    %assign i i-1
    %rotate 1
%endrep
%endmacro

%ifdef WIN64
    DECLARE_REG_TMP 6,3,2
%elifdef ARCH_X86_64
    DECLARE_REG_TMP 4,3,2
%else
    DECLARE_REG_TMP 2,0,1
%endif

%macro DEQUANT_START 2
    movifnidn t2d, r2m
    imul t0d, t2d, 0x2b
    shr  t0d, 8     ; i_qbits = i_qp / 6
    lea  t1, [t0*3]
    sub  t2d, t1d
    sub  t2d, t1d   ; i_mf = i_qp % 6
    shl  t2d, %1
%ifdef ARCH_X86_64
    add  r1, t2     ; dequant_mf[i_mf]
%else
    add  r1, r1mp   ; dequant_mf[i_mf]
    mov  r0, r0mp   ; dct
%endif
    sub  t0d, %2
    jl   .rshift32  ; negative qbits => rightshift
%endmacro
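
; The prologue divides by 6 without a div instruction: qp*0x2B>>8 equals
; qp/6 for all 0 <= qp <= 130 (it first diverges at qp=131), far beyond any
; valid qp, and the remainder then falls out of a lea and two subs:
;
;   static void qp_split( int qp, int *i_qbits, int *i_mf )
;   {
;       *i_qbits = qp * 0x2B >> 8;                   /* qp / 6 */
;       *i_mf    = qp - 3 * *i_qbits - 3 * *i_qbits; /* qp % 6 */
;   }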

;-----------------------------------------------------------------------------
; void dequant_4x4( dctcoef dct[4][4], int dequant_mf[6][4][4], int i_qp )
;-----------------------------------------------------------------------------
%macro DEQUANT 3
cglobal dequant_%1x%1, 0,3,6
.skip_prologue:
    DEQUANT_START %2+2, %2

.lshift:
    movd m2, t0d
    DEQUANT_LOOP DEQUANT16_L, %1*%1/4, %3

.rshift32:
    neg   t0d
    movd  m2, t0d
    mova  m3, [pd_1]
    pxor  m4, m4
    pslld m3, m2
    psrld m3, 1
    DEQUANT_LOOP DEQUANT32_R, %1*%1/4, %3

%ifndef HIGH_BIT_DEPTH
%if notcpuflag(avx)
cglobal dequant_%1x%1_flat16, 0,3
    movifnidn t2d, r2m
%if %1 == 8
    cmp  t2d, 12
    jl dequant_%1x%1 %+ SUFFIX %+ .skip_prologue
    sub  t2d, 12
%endif
    imul t0d, t2d, 0x2b
    shr  t0d, 8     ; i_qbits = i_qp / 6
    lea  t1, [t0*3]
    sub  t2d, t1d
    sub  t2d, t1d   ; i_mf = i_qp % 6
    shl  t2d, %2
%ifdef PIC
    lea  r1, [dequant%1_scale]
    add  r1, t2
%else
    lea  r1, [dequant%1_scale + t2]
%endif
    movifnidn r0, r0mp
    movd m4, t0d
%if %1 == 4
%if mmsize == 8
    DEQUANT16_FLAT [r1], 0, 16
    DEQUANT16_FLAT [r1+8], 8, 24
%else
    DEQUANT16_FLAT [r1], 0, 16
%endif
%elif mmsize == 8
    DEQUANT16_FLAT [r1], 0, 8, 64, 72
    DEQUANT16_FLAT [r1+16], 16, 24, 48, 56
    DEQUANT16_FLAT [r1+16], 80, 88, 112, 120
    DEQUANT16_FLAT [r1+32], 32, 40, 96, 104
%else
    DEQUANT16_FLAT [r1], 0, 64
    DEQUANT16_FLAT [r1+16], 16, 48, 80, 112
    DEQUANT16_FLAT [r1+32], 32, 96
%endif
    RET
%endif ; !AVX
%endif ; !HIGH_BIT_DEPTH
%endmacro ; DEQUANT

%ifdef HIGH_BIT_DEPTH
INIT_XMM sse2
DEQUANT 4, 4, 1
DEQUANT 8, 6, 1
INIT_XMM xop
DEQUANT 4, 4, 1
DEQUANT 8, 6, 1
%else
%ifndef ARCH_X86_64
INIT_MMX mmx
DEQUANT 4, 4, 1
DEQUANT 8, 6, 1
%endif
INIT_XMM sse2
DEQUANT 4, 4, 2
DEQUANT 8, 6, 2
INIT_XMM avx
DEQUANT 4, 4, 2
DEQUANT 8, 6, 2
INIT_XMM xop
DEQUANT 4, 4, 2
DEQUANT 8, 6, 2
%endif

%macro DEQUANT_DC 2
cglobal dequant_4x4dc, 0,3,6
    DEQUANT_START 6, 6

.lshift:
    movd     m3, [r1]
    movd     m2, t0d
    pslld    m3, m2
    SPLAT%1  m3, m3, 0
%assign x 0
%rep SIZEOF_PIXEL*16/mmsize
    mova     m0, [r0+mmsize*0+x]
    mova     m1, [r0+mmsize*1+x]
    %2       m0, m3
    %2       m1, m3
    mova     [r0+mmsize*0+x], m0
    mova     [r0+mmsize*1+x], m1
%assign x x+mmsize*2
%endrep
    RET

.rshift32:
    neg   t0d
    movd  m3, t0d
    mova  m4, [p%1_1]
    mova  m5, m4
    pslld m4, m3
    psrld m4, 1
    movd  m2, [r1]
%assign x 0
%ifdef HIGH_BIT_DEPTH
    pshufd m2, m2, 0
%rep SIZEOF_PIXEL*32/mmsize
    mova      m0, [r0+x]
    pmadcswd  m0, m0, m2, m4
    psrad     m0, m3
    mova      [r0+x], m0
%assign x x+mmsize
%endrep

%else ; !HIGH_BIT_DEPTH
    PSHUFLW   m2, m2, 0
    punpcklwd m2, m4
%rep SIZEOF_PIXEL*32/mmsize
    mova      m0, [r0+x]
    punpckhwd m1, m0, m5
    punpcklwd m0, m5
    pmaddwd   m0, m2
    pmaddwd   m1, m2
    psrad     m0, m3
    psrad     m1, m3
    packssdw  m0, m1
    mova      [r0+x], m0
%assign x x+mmsize
%endrep
%endif ; !HIGH_BIT_DEPTH
    RET
%endmacro

%ifdef HIGH_BIT_DEPTH
INIT_XMM sse2
DEQUANT_DC d, pmaddwd
INIT_XMM xop
DEQUANT_DC d, pmaddwd
%else
%ifndef ARCH_X86_64
INIT_MMX mmx2
DEQUANT_DC w, pmullw
%endif
INIT_XMM sse2
DEQUANT_DC w, pmullw
INIT_XMM avx
DEQUANT_DC w, pmullw
%endif

; t4 is eax for return value.
%ifdef ARCH_X86_64
    DECLARE_REG_TMP 0,1,2,3,6,4  ; Identical for both Windows and *NIX
%else
    DECLARE_REG_TMP 4,1,2,3,0,5
%endif

;-----------------------------------------------------------------------------
; int x264_optimize_chroma_2x2_dc( dctcoef dct[4], int dequant_mf )
;-----------------------------------------------------------------------------

%macro OPTIMIZE_CHROMA_2x2_DC 0
%assign %%regs 5
%if cpuflag(sse4)
    %assign %%regs %%regs-1
%endif
%ifndef ARCH_X86_64
    %assign %%regs %%regs+1      ; t0-t4 are volatile on x86-64
%endif
cglobal optimize_chroma_2x2_dc, 0,%%regs,7
    movifnidn t0, r0mp
    movd      m2, r1m
    movq      m1, [t0]
%if cpuflag(sse4)
    pcmpeqb   m4, m4
    pslld     m4, 11
%else
    pxor      m4, m4
%endif
%if cpuflag(ssse3)
    mova      m3, [chroma_dc_dct_mask]
    mova      m5, [chroma_dc_dmf_mask]
%else
    mova      m3, [chroma_dc_dct_mask_mmx]
    mova      m5, [chroma_dc_dmf_mask_mmx]
%endif
    pshuflw   m2, m2, 0
    pshufd    m0, m1, q0101      ;  1  0  3  2  1  0  3  2
    punpcklqdq m2, m2
    punpcklqdq m1, m1            ;  3  2  1  0  3  2  1  0
    mova      m6, [pd_1024]      ; 32<<5, elements are shifted 5 bits to the left
    PSIGNW    m0, m3             ; -1 -0  3  2 -1 -0  3  2
    PSIGNW    m2, m5             ;  +  -  -  +  -  -  +  +
    paddw     m0, m1             ; -1+3 -0+2  1+3  0+2 -1+3 -0+2  1+3  0+2
    pmaddwd   m0, m2             ;  0-1-2+3  0-1+2-3  0+1-2-3  0+1+2+3  * dmf
    punpcklwd m1, m1
    psrad     m2, 16             ;  +  -  -  +
    mov      t1d, 3
    paddd     m0, m6
    xor      t4d, t4d
%if notcpuflag(ssse3)
    psrad     m1, 31             ; has to be 0 or -1 in order for PSIGND_MMX to work correctly
%endif
%if cpuflag(sse4)
    ptest     m0, m4
%else
    mova      m6, m0
    SWAP       0, 6
    psrad     m6, 11
    pcmpeqd   m6, m4
    pmovmskb t5d, m6
    cmp      t5d, 0xffff
%endif
    jz .ret                      ; if the DC coefficients already round to zero, terminate early
    mova      m3, m0
.outer_loop:
    movsx    t3d, word [t0+2*t1] ; dct[coeff]
    pshufd    m6, m1, q3333
    pshufd    m1, m1, q2100      ; move the next element to high dword
    PSIGND    m5, m2, m6
    test     t3d, t3d
    jz .loop_end
.outer_loop_0:
    mov      t2d, t3d
    sar      t3d, 31
    or       t3d, 1
.inner_loop:
    psubd     m3, m5             ; coeff -= sign
    pxor      m6, m0, m3
%if cpuflag(sse4)
    ptest     m6, m4
%else
    psrad     m6, 11
    pcmpeqd   m6, m4
    pmovmskb t5d, m6
    cmp      t5d, 0xffff
%endif
    jz .round_coeff
    paddd     m3, m5             ; coeff += sign
    mov      t4d, 1
.loop_end:
    dec      t1d
    jz .last_coeff
    pshufd    m2, m2, q1320      ;  -  +  -  +  /  -  -  +  +
    jg .outer_loop
.ret:
    REP_RET
.round_coeff:
    sub      t2d, t3d
    mov [t0+2*t1], t2w
    jnz .inner_loop
    jmp .loop_end
.last_coeff:
    movsx    t3d, word [t0]
    punpcklqdq m2, m2            ;  +  +  +  +
    PSIGND    m5, m2, m1
    test     t3d, t3d
    jnz .outer_loop_0
    REP_RET
%endmacro

%ifndef HIGH_BIT_DEPTH
INIT_XMM sse2
OPTIMIZE_CHROMA_2x2_DC
INIT_XMM ssse3
OPTIMIZE_CHROMA_2x2_DC
INIT_XMM sse4
OPTIMIZE_CHROMA_2x2_DC
INIT_XMM avx
OPTIMIZE_CHROMA_2x2_DC
%endif ; !HIGH_BIT_DEPTH

%ifdef HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void denoise_dct( int32_t *dct, uint32_t *sum, uint32_t *offset, int size )
;-----------------------------------------------------------------------------
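; Per coefficient: take |coeff|, accumulate it into the running noise
; histogram 'sum', subtract the learned 'offset' (clamping at zero), and
; restore the sign. Scalar sketch of the same operation (cf. the C
; reference in common/quant.c):
;
;   for( int i = 0; i < size; i++ )
;   {
;       int level = dct[i];
;       int sign = level >> 31;            /* 0 or -1 */
;       level = (level + sign) ^ sign;     /* |level| */
;       sum[i] += level;
;       level -= offset[i];
;       dct[i] = level < 0 ? 0 : (level ^ sign) - sign;  /* re-sign */
;   }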
%macro DENOISE_DCT 0
cglobal denoise_dct, 4,4,8
    pxor      m6, m6
.loop:
    mova      m2, [r0+r3*4-2*mmsize]
    mova      m3, [r0+r3*4-1*mmsize]
    ABSD      m0, m2
    ABSD      m1, m3
    mova      m4, m0
    mova      m5, m1
    psubd     m0, [r2+r3*4-2*mmsize]
    psubd     m1, [r2+r3*4-1*mmsize]
    pcmpgtd   m7, m0, m6
    pand      m0, m7
    pcmpgtd   m7, m1, m6
    pand      m1, m7
    PSIGND    m0, m2
    PSIGND    m1, m3
    mova      [r0+r3*4-2*mmsize], m0
    mova      [r0+r3*4-1*mmsize], m1
    paddd     m4, [r1+r3*4-2*mmsize]
    paddd     m5, [r1+r3*4-1*mmsize]
    mova      [r1+r3*4-2*mmsize], m4
    mova      [r1+r3*4-1*mmsize], m5
    sub       r3, mmsize/2
    jg .loop
    REP_RET
%endmacro

%ifndef ARCH_X86_64
INIT_MMX mmx
DENOISE_DCT
%endif
INIT_XMM sse2
DENOISE_DCT
INIT_XMM ssse3
DENOISE_DCT
INIT_XMM avx
DENOISE_DCT

%else ; !HIGH_BIT_DEPTH

;-----------------------------------------------------------------------------
; void denoise_dct( int16_t *dct, uint32_t *sum, uint16_t *offset, int size )
;-----------------------------------------------------------------------------
%macro DENOISE_DCT 0
cglobal denoise_dct, 4,4,7
    pxor      m6, m6
.loop:
    mova      m2, [r0+r3*2-2*mmsize]
    mova      m3, [r0+r3*2-1*mmsize]
    ABSW      m0, m2, sign
    ABSW      m1, m3, sign
    psubusw   m4, m0, [r2+r3*2-2*mmsize]
    psubusw   m5, m1, [r2+r3*2-1*mmsize]
    PSIGNW    m4, m2
    PSIGNW    m5, m3
    mova      [r0+r3*2-2*mmsize], m4
    mova      [r0+r3*2-1*mmsize], m5
    punpcklwd m2, m0, m6
    punpcklwd m3, m1, m6
    punpckhwd m0, m6
    punpckhwd m1, m6
    paddd     m2, [r1+r3*4-4*mmsize]
    paddd     m0, [r1+r3*4-3*mmsize]
    paddd     m3, [r1+r3*4-2*mmsize]
    paddd     m1, [r1+r3*4-1*mmsize]
    mova      [r1+r3*4-4*mmsize], m2
    mova      [r1+r3*4-3*mmsize], m0
    mova      [r1+r3*4-2*mmsize], m3
    mova      [r1+r3*4-1*mmsize], m1
    sub       r3, mmsize
    jg .loop
    REP_RET
%endmacro

%ifndef ARCH_X86_64
INIT_MMX mmx
DENOISE_DCT
%endif
INIT_XMM sse2
DENOISE_DCT
INIT_XMM ssse3
DENOISE_DCT
INIT_XMM avx
DENOISE_DCT

%endif ; !HIGH_BIT_DEPTH

;-----------------------------------------------------------------------------
; int decimate_score( dctcoef *dct )
;-----------------------------------------------------------------------------
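; A block is cheap enough to decimate when all its nonzero coefficients
; are +/-1 and they are sparse: any |coeff| > 1 immediately scores 9,
; otherwise each +/-1 adds decimate_table[run], where 'run' counts the
; zeros between it and the next nonzero coefficient below it in scan
; order. DECIMATE_MASK builds the two bitmasks (coeff == 0, |coeff| > 1)
; this needs. Scalar sketch (cf. the C version in common/quant.c):
;
;   int score = 0, idx = max - 1;
;   while( idx >= 0 && dct[idx] == 0 )
;       idx--;
;   while( idx >= 0 )
;   {
;       if( (unsigned)(dct[idx--] + 1) > 2 )  /* |coeff| > 1 */
;           return 9;
;       int run = 0;
;       while( idx >= 0 && dct[idx] == 0 )
;           run++, idx--;
;       score += decimate_table[run];
;   }
;   return score;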

%macro DECIMATE_MASK 5
%if mmsize==16
%ifdef HIGH_BIT_DEPTH
    movdqa   xmm0, [%3+ 0]
    movdqa   xmm1, [%3+32]
    packssdw xmm0, [%3+16]
    packssdw xmm1, [%3+48]
    ABSW2    xmm0, xmm1, xmm0, xmm1, xmm3, xmm4
%else
    ABSW     xmm0, [%3+ 0], xmm3
    ABSW     xmm1, [%3+16], xmm4
%endif
    packsswb xmm0, xmm1
    pxor     xmm2, xmm2
    pcmpeqb  xmm2, xmm0
    pcmpgtb  xmm0, %4
    pmovmskb %1, xmm2
    pmovmskb %2, xmm0

%else ; mmsize==8
%ifdef HIGH_BIT_DEPTH
    movq      mm0, [%3+ 0]
    movq      mm1, [%3+16]
    movq      mm2, [%3+32]
    movq      mm3, [%3+48]
    packssdw  mm0, [%3+ 8]
    packssdw  mm1, [%3+24]
    packssdw  mm2, [%3+40]
    packssdw  mm3, [%3+56]
%else
    movq      mm0, [%3+ 0]
    movq      mm1, [%3+ 8]
    movq      mm2, [%3+16]
    movq      mm3, [%3+24]
%endif
    ABSW2     mm0, mm1, mm0, mm1, mm6, mm7
    ABSW2     mm2, mm3, mm2, mm3, mm6, mm7
    packsswb  mm0, mm1
    packsswb  mm2, mm3
    pxor      mm4, mm4
    pxor      mm6, mm6
    pcmpeqb   mm4, mm0
    pcmpeqb   mm6, mm2
    pcmpgtb   mm0, %4
    pcmpgtb   mm2, %4
    pmovmskb   %5, mm4
    pmovmskb   %1, mm6
    shl        %1, 8
    or         %1, %5
    pmovmskb   %5, mm0
    pmovmskb   %2, mm2
    shl        %2, 8
    or         %2, %5
%endif
%endmacro

cextern decimate_table4
cextern decimate_table8

%macro DECIMATE4x4 1

;A LUT is faster than bsf on AMD processors.
;This is not true for score64.
cglobal decimate_score%1, 1,3
%ifdef PIC
    lea r10, [decimate_table4]
    lea r11, [decimate_mask_table4]
    %define table r10
    %define mask_table r11
%else
    %define table decimate_table4
    %define mask_table decimate_mask_table4
%endif
    DECIMATE_MASK edx, eax, r0, [pb_1], ecx
    xor   edx, 0xffff
    je   .ret
    test  eax, eax
    jne  .ret9
%if %1==15
    shr   edx, 1
%endif
%if cpuflag(slowctz)
    movzx ecx, dl
    movzx eax, byte [mask_table + rcx]
    cmp   edx, ecx
    je   .ret
    bsr   ecx, ecx
    shr   edx, 1
    shr   edx, cl
    bsf   ecx, edx
    shr   edx, 1
    shr   edx, cl
    add    al, byte [table + rcx]
    add    al, byte [mask_table + rdx]
%else
.loop:
    bsf   ecx, edx
    shr   edx, cl
    add    al, byte [table + rcx]
    shr   edx, 1
    jne  .loop
%endif
.ret:
    RET
.ret9:
    mov   eax, 9
    RET

%endmacro

%ifndef ARCH_X86_64
INIT_MMX mmx2
DECIMATE4x4 15
DECIMATE4x4 16
INIT_MMX mmx2, slowctz
DECIMATE4x4 15
DECIMATE4x4 16
%endif
INIT_XMM sse2
DECIMATE4x4 15
DECIMATE4x4 16
INIT_XMM sse2, slowctz
DECIMATE4x4 15
DECIMATE4x4 16
INIT_XMM ssse3
DECIMATE4x4 15
DECIMATE4x4 16
INIT_XMM ssse3, slowctz
DECIMATE4x4 15
DECIMATE4x4 16

%macro DECIMATE8x8 0

%ifdef ARCH_X86_64
cglobal decimate_score64, 1,4
%ifdef PIC
    lea r10, [decimate_table8]
    %define table r10
%else
    %define table decimate_table8
%endif
    mova  m5, [pb_1]
    DECIMATE_MASK r1d, eax, r0+SIZEOF_DCTCOEF* 0, m5, null
    test  eax, eax
    jne  .ret9
    DECIMATE_MASK r2d, eax, r0+SIZEOF_DCTCOEF*16, m5, null
    shl   r2d, 16
    or    r1d, r2d
    DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*32, m5, null
    shl   r2, 32
    or    eax, r3d
    or    r1, r2
    DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*48, m5, null
    shl   r2, 48
    or    r1, r2
    xor   r1, -1
    je   .ret
    add   eax, r3d
    jne  .ret9
.loop:
    bsf   rcx, r1
    shr   r1, cl
    add   al, byte [table + rcx]
    shr   r1, 1
    jne  .loop
.ret:
    REP_RET
.ret9:
    mov   eax, 9
    RET

%else ; ARCH
%if mmsize == 8
cglobal decimate_score64, 1,6
%else
cglobal decimate_score64, 1,5
%endif
    mova  m5, [pb_1]
    DECIMATE_MASK r3, r2, r0+SIZEOF_DCTCOEF* 0, m5, r5
    test  r2, r2
    jne  .ret9
    DECIMATE_MASK r4, r2, r0+SIZEOF_DCTCOEF*16, m5, r5
    shl   r4, 16
    or    r3, r4
    DECIMATE_MASK r4, r1, r0+SIZEOF_DCTCOEF*32, m5, r5
    or    r2, r1
    DECIMATE_MASK r1, r0, r0+SIZEOF_DCTCOEF*48, m5, r5
    shl   r1, 16
    or    r4, r1
    xor   r3, -1
    je   .tryret
    xor   r4, -1
.cont:
    add   r0, r2
    jne  .ret9      ;r0 is zero at this point, so we don't need to zero it
.loop:
    bsf   ecx, r3
    test  r3, r3
    je   .largerun
    shrd  r3, r4, cl
    shr   r4, cl
    add   r0b, byte [decimate_table8 + ecx]
    shrd  r3, r4, 1
    shr   r4, 1
    cmp   r0, 6     ;score64's threshold is never higher than 6
    jge  .ret9      ;this early termination is only useful on 32-bit because it can be done in the latency after shrd
    test  r3, r3
    jne  .loop
    test  r4, r4
    jne  .loop
.ret:
    REP_RET
.tryret:
    xor   r4, -1
    jne  .cont
    REP_RET
.ret9:
    mov   eax, 9
    RET
.largerun:
    mov   r3, r4
    xor   r4, r4
    bsf   ecx, r3
    shr   r3, cl
    shr   r3, 1
    jne  .loop
    REP_RET
%endif ; ARCH

%endmacro

%ifndef ARCH_X86_64
INIT_MMX mmx2
DECIMATE8x8
%endif
INIT_XMM sse2
DECIMATE8x8
INIT_XMM ssse3
DECIMATE8x8

;-----------------------------------------------------------------------------
; int coeff_last( dctcoef *dct )
;-----------------------------------------------------------------------------
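; Returns the index of the last nonzero coefficient in the block: the
; SIMD code packs the coefficients to bytes, compares against zero,
; inverts the resulting bitmask and takes its highest set bit. Scalar
; sketch:
;
;   int coeff_last( dctcoef *l, int count )
;   {
;       int i = count - 1;
;       while( i >= 0 && l[i] == 0 )
;           i--;
;       return i;
;   }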

%macro BSR 3
%if cpuflag(lzcnt)
    lzcnt %1, %2
    xor %1, %3
%else
    bsr %1, %2
%endif
%endmacro

%macro LZCOUNT 3
%if cpuflag(lzcnt)
    lzcnt %1, %2
%else
    bsr %1, %2
    xor %1, %3
%endif
%endmacro
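
; For nonzero x, lzcnt(x) == (width-1) - bsr(x), and since width-1 is
; all-ones (0x1f for 32-bit operands, 0x3f for 64-bit) the subtraction
; is just an XOR. Each macro therefore emulates the other instruction
; with one XOR against %3, so the same code can use whichever form the
; target CPU runs best (lzcnt is also well-defined for x == 0).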

%ifdef HIGH_BIT_DEPTH
%macro LAST_MASK 3-4
%if %1 == 4
    movq     mm0, [%3]
    packssdw mm0, [%3+8]
    packsswb mm0, mm0
    pcmpeqb  mm0, mm2
    pmovmskb  %2, mm0
%elif mmsize == 16
    movdqa   xmm0, [%3+ 0]
%if %1 == 8
    packssdw xmm0, [%3+16]
    packsswb xmm0, xmm0
%else
    movdqa   xmm1, [%3+32]
    packssdw xmm0, [%3+16]
    packssdw xmm1, [%3+48]
    packsswb xmm0, xmm1
%endif
    pcmpeqb  xmm0, xmm2
    pmovmskb   %2, xmm0
%elif %1 == 8
    movq     mm0, [%3+ 0]
    movq     mm1, [%3+16]
    packssdw mm0, [%3+ 8]
    packssdw mm1, [%3+24]
    packsswb mm0, mm1
    pcmpeqb  mm0, mm2
    pmovmskb  %2, mm0
%else
    movq     mm0, [%3+ 0]
    movq     mm1, [%3+16]
    packssdw mm0, [%3+ 8]
    packssdw mm1, [%3+24]
    movq     mm3, [%3+32]
    movq     mm4, [%3+48]
    packssdw mm3, [%3+40]
    packssdw mm4, [%3+56]
    packsswb mm0, mm1
    packsswb mm3, mm4
    pcmpeqb  mm0, mm2
    pcmpeqb  mm3, mm2
    pmovmskb  %2, mm0
    pmovmskb  %4, mm3
    shl       %4, 8
    or        %2, %4
%endif
%endmacro

%macro COEFF_LAST4 0
cglobal coeff_last4, 1,3
    pxor mm2, mm2
    LAST_MASK 4, r1d, r0
    xor  r1d, 0xff
    shr  r1d, 4
    BSR  eax, r1d, 0x1f
    RET
%endmacro

INIT_MMX mmx2
COEFF_LAST4
INIT_MMX mmx2, lzcnt
COEFF_LAST4

%macro COEFF_LAST8 0
cglobal coeff_last8, 1,3
    pxor m2, m2
    LAST_MASK 8, r1d, r0
%if mmsize == 16
    xor r1d, 0xffff
    shr r1d, 8
%else
    xor r1d, 0xff
%endif
    BSR eax, r1d, 0x1f
    RET
%endmacro

%ifndef ARCH_X86_64
INIT_MMX mmx2
COEFF_LAST8
%endif
INIT_XMM sse2
COEFF_LAST8
INIT_XMM sse2, lzcnt
COEFF_LAST8

%else ; !HIGH_BIT_DEPTH
%macro LAST_MASK 3-4
%if %1 <= 8
    movq     mm0, [%3+ 0]
%if %1 == 4
    packsswb mm0, mm0
%else
    packsswb mm0, [%3+ 8]
%endif
    pcmpeqb  mm0, mm2
    pmovmskb  %2, mm0
%elif mmsize == 16
    movdqa   xmm0, [%3+ 0]
    packsswb xmm0, [%3+16]
    pcmpeqb  xmm0, xmm2
    pmovmskb   %2, xmm0
%else
    movq     mm0, [%3+ 0]
    movq     mm1, [%3+16]
    packsswb mm0, [%3+ 8]
    packsswb mm1, [%3+24]
    pcmpeqb  mm0, mm2
    pcmpeqb  mm1, mm2
    pmovmskb  %2, mm0
    pmovmskb  %4, mm1
    shl       %4, 8
    or        %2, %4
%endif
%endmacro

%macro COEFF_LAST48 0
%ifdef ARCH_X86_64
cglobal coeff_last4, 1,1
    BSR  rax, [r0], 0x3f
    shr  eax, 4
    RET
%else
cglobal coeff_last4, 0,3
    mov   edx, r0mp
    mov   eax, [edx+4]
    xor   ecx, ecx
    test  eax, eax
    cmovz eax, [edx]
    setnz cl
    BSR   eax, eax, 0x1f
    shr   eax, 4
    lea   eax, [eax+ecx*2]
    RET
%endif

cglobal coeff_last8, 1,3
    pxor m2, m2
    LAST_MASK 8, r1d, r0, r2d
    xor r1d, 0xff
    BSR eax, r1d, 0x1f
    RET
%endmacro

INIT_MMX mmx2
COEFF_LAST48
INIT_MMX mmx2, lzcnt
COEFF_LAST48
%endif ; HIGH_BIT_DEPTH

%macro COEFF_LAST 0
cglobal coeff_last15, 1,3
    pxor m2, m2
    LAST_MASK 15, r1d, r0-SIZEOF_DCTCOEF, r2d
    xor r1d, 0xffff
    BSR eax, r1d, 0x1f
    dec eax
    RET

cglobal coeff_last16, 1,3
    pxor m2, m2
    LAST_MASK 16, r1d, r0, r2d
    xor r1d, 0xffff
    BSR eax, r1d, 0x1f
    RET

%ifndef ARCH_X86_64
cglobal coeff_last64, 1, 5-mmsize/16
    pxor m2, m2
    LAST_MASK 16, r2d, r0+SIZEOF_DCTCOEF* 32, r4d
    LAST_MASK 16, r3d, r0+SIZEOF_DCTCOEF* 48, r4d
    shl r3d, 16
    or  r2d, r3d
    xor r2d, -1
    jne .secondhalf
    LAST_MASK 16, r1d, r0+SIZEOF_DCTCOEF* 0, r4d
    LAST_MASK 16, r3d, r0+SIZEOF_DCTCOEF*16, r4d
    shl r3d, 16
    or  r1d, r3d
    not r1d
    BSR eax, r1d, 0x1f
    RET
.secondhalf:
    BSR eax, r2d, 0x1f
    add eax, 32
    RET
%else
cglobal coeff_last64, 1,4
    pxor m2, m2
    LAST_MASK 16, r1d, r0+SIZEOF_DCTCOEF* 0
    LAST_MASK 16, r2d, r0+SIZEOF_DCTCOEF*16
    LAST_MASK 16, r3d, r0+SIZEOF_DCTCOEF*32
    LAST_MASK 16, r0d, r0+SIZEOF_DCTCOEF*48
    shl r2d, 16
    shl r0d, 16
    or  r1d, r2d
    or  r3d, r0d
    shl r3,  32
    or  r1,  r3
    not r1
    BSR rax, r1, 0x3f
    RET
%endif
%endmacro

%ifndef ARCH_X86_64
INIT_MMX mmx2
COEFF_LAST
%endif
INIT_XMM sse2
COEFF_LAST
INIT_XMM sse2, lzcnt
COEFF_LAST

;-----------------------------------------------------------------------------
; int coeff_level_run( dctcoef *dct, run_level_t *runlevel )
;-----------------------------------------------------------------------------
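; Combines coeff_last with run/level extraction: starting at the last
; nonzero coefficient, record each level and the run of zeros between it
; and the next nonzero coefficient below it, and return the number of
; levels written. Rough scalar equivalent, assuming the layout
; struct { int last; dctcoef level[16]; uint8_t run[16]; } that the
; stores through t1 below imply:
;
;   int total = 0;
;   int i = coeff_last( dct );            /* index of last nonzero */
;   runlevel->last = i;
;   do
;   {
;       int run = 0;
;       runlevel->level[total] = dct[i];
;       while( --i >= 0 && dct[i] == 0 )
;           run++;
;       runlevel->run[total++] = run;
;   } while( i >= 0 );
;   return total;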

; t6 = eax for return, t3 = ecx for shift, t[01] = r[01] for x86_64 args
%ifdef WIN64
    DECLARE_REG_TMP 3,1,2,0,4,5,6
%elifdef ARCH_X86_64
    DECLARE_REG_TMP 0,1,2,3,4,5,6
%else
    DECLARE_REG_TMP 6,3,2,1,4,5,0
%endif

%macro COEFF_LEVELRUN 1
cglobal coeff_level_run%1,0,7
    movifnidn t0, r0mp
    movifnidn t1, r1mp
    pxor    m2, m2
    LAST_MASK %1, t5d, t0-(%1&1)*SIZEOF_DCTCOEF, t4d
    not    t5d
    shl    t5d, 32-((%1+1)&~1)
    mov    t4d, %1-1
    LZCOUNT t3d, t5d, 0x1f
    xor    t6d, t6d
    add    t5d, t5d
    sub    t4d, t3d
    shl    t5d, t3b
    mov   [t1], t4d
.loop:
    LZCOUNT t3d, t5d, 0x1f
%ifdef HIGH_BIT_DEPTH
    mov    t2d, [t0+t4*4]
    mov   [t1+t6  +4+16*4], t3b
    mov   [t1+t6*4+ 4], t2d
%else
    mov    t2w, [t0+t4*2]
    mov   [t1+t6  +4+16*2], t3b
    mov   [t1+t6*2+ 4], t2w
%endif
    inc    t3d
    shl    t5d, t3b
    inc    t6d
    sub    t4d, t3d
    jge .loop
    REP_RET
%endmacro

INIT_MMX mmx2
%ifndef ARCH_X86_64
COEFF_LEVELRUN 15
COEFF_LEVELRUN 16
%endif
COEFF_LEVELRUN 4
COEFF_LEVELRUN 8
INIT_XMM sse2
%ifdef HIGH_BIT_DEPTH
COEFF_LEVELRUN 8
%endif
COEFF_LEVELRUN 15
COEFF_LEVELRUN 16
INIT_XMM sse2, lzcnt
%ifdef HIGH_BIT_DEPTH
COEFF_LEVELRUN 8
%endif
COEFF_LEVELRUN 15
COEFF_LEVELRUN 16
INIT_MMX mmx2, lzcnt
COEFF_LEVELRUN 4
COEFF_LEVELRUN 8
x264-snapshot-20120103-2245-stable/common/x86/predict.h
/*****************************************************************************
 * predict.h: x86 intra prediction
 *****************************************************************************
 * Copyright (C) 2003-2011 x264 project
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#ifndef X264_I386_PREDICT_H
#define X264_I386_PREDICT_H

void x264_predict_16x16_init_mmx ( int cpu, x264_predict_t pf[7] );
void x264_predict_8x16c_init_mmx  ( int cpu, x264_predict_t pf[7] );
void x264_predict_8x8c_init_mmx  ( int cpu, x264_predict_t pf[7] );
void x264_predict_4x4_init_mmx   ( int cpu, x264_predict_t pf[12] );
void x264_predict_8x8_init_mmx   ( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_8x8_filter );

void x264_predict_16x16_v_mmx2( pixel *src );
void x264_predict_16x16_v_sse2( pixel *src );
void x264_predict_16x16_h_mmx2( pixel *src );
void x264_predict_16x16_h_sse2( uint16_t *src );
void x264_predict_16x16_h_ssse3( uint8_t *src );
void x264_predict_16x16_dc_mmx2( pixel *src );
void x264_predict_16x16_dc_sse2( pixel *src );
void x264_predict_16x16_dc_core_mmx2( pixel *src, int i_dc_left );
void x264_predict_16x16_dc_core_sse2( pixel *src, int i_dc_left );
void x264_predict_16x16_dc_left_core_mmx2( pixel *src, int i_dc_left );
void x264_predict_16x16_dc_left_core_sse2( pixel *src, int i_dc_left );
void x264_predict_16x16_dc_top_mmx2( pixel *src );
void x264_predict_16x16_dc_top_sse2( pixel *src );
void x264_predict_16x16_dc_top_ssse3( uint16_t *src );
void x264_predict_16x16_p_core_mmx2( uint8_t *src, int i00, int b, int c );
void x264_predict_16x16_p_core_sse2( pixel *src, int i00, int b, int c );
void x264_predict_16x16_p_core_avx( pixel *src, int i00, int b, int c );
void x264_predict_8x16c_dc_mmx2( pixel *src );
void x264_predict_8x16c_dc_sse2( uint16_t *src );
void x264_predict_8x16c_dc_top_mmx2( uint8_t *src );
void x264_predict_8x16c_dc_top_sse2( uint16_t *src );
void x264_predict_8x16c_v_mmx( uint8_t *src );
void x264_predict_8x16c_v_sse2( uint16_t *src );
void x264_predict_8x16c_h_mmx2( uint8_t *src );
void x264_predict_8x16c_h_sse2( pixel *src );
void x264_predict_8x16c_h_ssse3( uint8_t *src );
void x264_predict_8x8c_p_core_mmx2( uint8_t *src, int i00, int b, int c );
void x264_predict_8x8c_p_core_sse2( pixel *src, int i00, int b, int c );
void x264_predict_8x8c_p_core_avx( pixel *src, int i00, int b, int c );
void x264_predict_8x8c_dc_mmx2( pixel *src );
void x264_predict_8x8c_dc_sse2( uint16_t *src );
void x264_predict_8x8c_dc_top_mmx2( uint8_t *src );
void x264_predict_8x8c_dc_top_sse2( uint16_t *src );
void x264_predict_8x8c_v_mmx( pixel *src );
void x264_predict_8x8c_v_sse2( uint16_t *src );
void x264_predict_8x8c_h_mmx2( uint8_t *src );
void x264_predict_8x8c_h_sse2( pixel *src );
void x264_predict_8x8c_h_ssse3( uint8_t *src );
void x264_predict_8x8_v_mmx2( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_v_sse2( uint16_t *src, uint16_t edge[36] );
void x264_predict_8x8_h_mmx2( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_h_sse2( uint16_t *src, uint16_t edge[36] );
void x264_predict_8x8_hd_mmx2( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_hu_mmx2( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_dc_mmx2( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_dc_sse2( uint16_t *src, uint16_t edge[36] );
void x264_predict_8x8_dc_top_mmx2( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_dc_top_sse2( uint16_t *src, uint16_t edge[36] );
void x264_predict_8x8_dc_left_mmx2( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_dc_left_sse2( uint16_t *src, uint16_t edge[36] );
void x264_predict_8x8_ddl_mmx2( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_ddl_sse2( pixel *src, pixel edge[36] );
void x264_predict_8x8_ddl_ssse3( pixel *src, pixel edge[36] );
void x264_predict_8x8_ddl_ssse3_cache64( pixel *src, pixel edge[36] );
void x264_predict_8x8_ddl_avx( pixel *src, pixel edge[36] );
void x264_predict_8x8_ddr_mmx2( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_ddr_sse2( pixel *src, pixel edge[36] );
void x264_predict_8x8_ddr_ssse3( pixel *src, pixel edge[36] );
void x264_predict_8x8_ddr_ssse3_cache64( pixel *src, pixel edge[36] );
void x264_predict_8x8_ddr_avx( pixel *src, pixel edge[36] );
void x264_predict_8x8_vl_sse2( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_vl_avx( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_vr_mmx2( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_vr_sse2( pixel *src, pixel edge[36] );
void x264_predict_8x8_vr_ssse3( pixel *src, pixel edge[36] );
void x264_predict_8x8_vr_avx( pixel *src, pixel edge[36] );
void x264_predict_8x8_hu_sse2( pixel *src, pixel edge[36] );
void x264_predict_8x8_hu_ssse3( pixel *src, pixel edge[36] );
void x264_predict_8x8_hu_avx( pixel *src, pixel edge[36] );
void x264_predict_8x8_hd_sse2( pixel *src, pixel edge[36] );
void x264_predict_8x8_hd_ssse3( pixel *src, pixel edge[36] );
void x264_predict_8x8_hd_avx( pixel *src, pixel edge[36] );
void x264_predict_8x8_filter_mmx2( uint8_t *src, uint8_t edge[36], int i_neighbor, int i_filters );
void x264_predict_8x8_filter_sse2( uint16_t *src, uint16_t edge[36], int i_neighbor, int i_filters );
void x264_predict_8x8_filter_ssse3( pixel *src, pixel edge[36], int i_neighbor, int i_filters );
void x264_predict_8x8_filter_avx( uint16_t *src, uint16_t edge[36], int i_neighbor, int i_filters );
void x264_predict_4x4_ddl_mmx2( pixel *src );
void x264_predict_4x4_ddl_sse2( uint16_t *src );
void x264_predict_4x4_ddl_avx( uint16_t *src );
void x264_predict_4x4_ddr_mmx2( pixel *src );
void x264_predict_4x4_vl_mmx2( pixel *src );
void x264_predict_4x4_vl_sse2( uint16_t *src );
void x264_predict_4x4_vl_avx( uint16_t *src );
void x264_predict_4x4_vr_mmx2( uint8_t *src );
void x264_predict_4x4_vr_sse2( uint16_t *src );
void x264_predict_4x4_vr_ssse3( pixel *src );
void x264_predict_4x4_vr_ssse3_cache64( uint8_t *src );
void x264_predict_4x4_vr_avx( uint16_t *src );
void x264_predict_4x4_hd_mmx2( pixel *src );
void x264_predict_4x4_hd_sse2( uint16_t *src );
void x264_predict_4x4_hd_ssse3( pixel *src );
void x264_predict_4x4_hd_avx( uint16_t *src );
void x264_predict_4x4_dc_mmx2( pixel *src );
void x264_predict_4x4_ddr_sse2( uint16_t *src );
void x264_predict_4x4_ddr_ssse3( pixel *src );
void x264_predict_4x4_ddr_avx( uint16_t *src );
void x264_predict_4x4_hu_mmx2( pixel *src );

#endif
x264-snapshot-20120103-2245-stable/common/x86/predict-c.c
/*****************************************************************************
 * predict-c.c: intra prediction
 *****************************************************************************
 * Copyright (C) 2003-2011 x264 project
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
 *          Jason Garrett-Glaser <darkshikari@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "common/common.h"
#include "predict.h"
#include "pixel.h"

#define PREDICT_16x16_DC(name)\
void x264_predict_16x16_dc_##name( pixel *src )\
{\
    uint32_t dc = 16;\
    for( int i = 0; i < 16; i += 2 )\
    {\
        dc += src[-1 + i * FDEC_STRIDE];\
        dc += src[-1 + (i+1) * FDEC_STRIDE];\
    }\
    x264_predict_16x16_dc_core_##name( src, dc );\
}

PREDICT_16x16_DC( mmx2 )
PREDICT_16x16_DC( sse2 )
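
/* The wrappers above only sum the 16 left-edge neighbours (dc starts at
 * the rounding constant 16); the asm cores add the 16 top-edge samples
 * and do the final shift, i.e. the full computation is
 *
 *   dc = ( sum(left[0..15]) + sum(top[0..15]) + 16 ) >> 5;
 *
 * replicated across the 16x16 block. The _dc_left variants below pass
 * dc>>4 (the left-edge average, rounding constant 8) to a core that
 * replicates it directly. */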

#define PREDICT_16x16_DC_LEFT(name)\
static void x264_predict_16x16_dc_left_##name( pixel *src )\
{\
    uint32_t dc = 8;\
    for( int i = 0; i < 16; i += 2 )\
    {\
        dc += src[-1 + i * FDEC_STRIDE];\
        dc += src[-1 + (i+1) * FDEC_STRIDE];\
    }\
    x264_predict_16x16_dc_left_core_##name( src, dc>>4 );\
}

PREDICT_16x16_DC_LEFT( mmx2 )
PREDICT_16x16_DC_LEFT( sse2 )

#define PREDICT_P_SUM(j,i)\
    H += i * ( src[j+i - FDEC_STRIDE ]  - src[j-i - FDEC_STRIDE ] );\
    V += i * ( src[(j+i)*FDEC_STRIDE -1] - src[(j-i)*FDEC_STRIDE -1] );\

ALIGNED_16( static const int16_t pw_12345678[8] ) = {1,2,3,4,5,6,7,8};
ALIGNED_16( static const int16_t pw_m87654321[8] ) = {-8,-7,-6,-5,-4,-3,-2,-1};
ALIGNED_16( static const int16_t pw_m32101234[8] ) = {-3,-2,-1,0,1,2,3,4};
ALIGNED_8( static const int8_t pb_12345678[8] ) = {1,2,3,4,5,6,7,8};
ALIGNED_8( static const int8_t pb_m87654321[8] ) = {-8,-7,-6,-5,-4,-3,-2,-1};
ALIGNED_8( static const int8_t pb_m32101234[8] ) = {-3,-2,-1,0,1,2,3,4};
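
/* H.264 16x16 plane prediction: with top[] the row above and left[] the
 * column to the left of the block, the wrappers below compute
 *
 *   H = sum_{i=1..8} i * ( top[7+i]  - top[7-i]  )
 *   V = sum_{i=1..8} i * ( left[7+i] - left[7-i] )
 *   a = 16 * ( left[15] + top[15] )
 *   b = ( 5*H + 32 ) >> 6,   c = ( 5*V + 32 ) >> 6
 *
 * and each output pixel is clip(( a + b*(x-7) + c*(y-7) + 16 ) >> 5).
 * i00 = a - 7*b - 7*c + 16 is the accumulator at (0,0) before the shift;
 * the asm cores rebuild the whole plane from (i00, b, c) by repeated
 * adds. The 8x8 chroma variant further down uses the same scheme with 4
 * taps and b = ( 17*H + 16 ) >> 5. The constant tables above feed the
 * pmaddwd/pmaddubsw evaluations of H in the inline-asm versions. */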

#if !HIGH_BIT_DEPTH
#define PREDICT_16x16_P(name)\
static void x264_predict_16x16_p_##name( pixel *src )\
{\
    int a, b, c;\
    int H = 0;\
    int V = 0;\
    int i00;\
    PREDICT_P_SUM(7,1) \
    PREDICT_P_SUM(7,2) \
    PREDICT_P_SUM(7,3) \
    PREDICT_P_SUM(7,4) \
    PREDICT_P_SUM(7,5) \
    PREDICT_P_SUM(7,6) \
    PREDICT_P_SUM(7,7) \
    PREDICT_P_SUM(7,8) \
    a = 16 * ( src[15*FDEC_STRIDE -1] + src[15 - FDEC_STRIDE] );\
    b = ( 5 * H + 32 ) >> 6;\
    c = ( 5 * V + 32 ) >> 6;\
    i00 = a - b * 7 - c * 7 + 16;\
    x264_predict_16x16_p_core_##name( src, i00, b, c );\
}
#ifndef ARCH_X86_64
PREDICT_16x16_P( mmx2 )
#endif
PREDICT_16x16_P( sse2   )
PREDICT_16x16_P( avx    )
#endif //!HIGH_BIT_DEPTH

#if HAVE_X86_INLINE_ASM
#if HIGH_BIT_DEPTH
static void x264_predict_16x16_p_sse2( uint16_t *src )
#else
static void x264_predict_16x16_p_ssse3( uint8_t *src )
#endif
{
    int a, b, c, i00;
    int H, V;
#if HIGH_BIT_DEPTH
    asm (
        "movdqu           %1, %%xmm1 \n"
        "movdqa           %2, %%xmm0 \n"
        "pmaddwd          %3, %%xmm0 \n"
        "pmaddwd          %4, %%xmm1 \n"
        "paddd        %%xmm1, %%xmm0 \n"
        "movhlps      %%xmm0, %%xmm1 \n"
        "paddd        %%xmm1, %%xmm0 \n"
        "pshuflw $14, %%xmm0, %%xmm1 \n"
        "paddd        %%xmm1, %%xmm0 \n"
        "movd         %%xmm0, %0     \n"
        :"=r"(H)
        :"m"(src[-FDEC_STRIDE-1]), "m"(src[-FDEC_STRIDE+8]),
         "m"(*pw_12345678), "m"(*pw_m87654321)
    );
#else
    asm (
        "movq           %1, %%mm1 \n"
        "movq           %2, %%mm0 \n"
        "palignr $7,    %3, %%mm1 \n"
        "pmaddubsw      %4, %%mm0 \n"
        "pmaddubsw      %5, %%mm1 \n"
        "paddw       %%mm1, %%mm0 \n"
        "pshufw $14, %%mm0, %%mm1 \n"
        "paddw       %%mm1, %%mm0 \n"
        "pshufw  $1, %%mm0, %%mm1 \n"
        "paddw       %%mm1, %%mm0 \n"
        "movd        %%mm0, %0    \n"
        "movswl        %w0, %0    \n"
        :"=r"(H)
        :"m"(src[-FDEC_STRIDE]), "m"(src[-FDEC_STRIDE+8]),
         "m"(src[-FDEC_STRIDE-8]), "m"(*pb_12345678), "m"(*pb_m87654321)
    );
#endif
    V = 8 * ( src[15*FDEC_STRIDE-1] - src[-1*FDEC_STRIDE-1] )
      + 7 * ( src[14*FDEC_STRIDE-1] - src[ 0*FDEC_STRIDE-1] )
      + 6 * ( src[13*FDEC_STRIDE-1] - src[ 1*FDEC_STRIDE-1] )
      + 5 * ( src[12*FDEC_STRIDE-1] - src[ 2*FDEC_STRIDE-1] )
      + 4 * ( src[11*FDEC_STRIDE-1] - src[ 3*FDEC_STRIDE-1] )
      + 3 * ( src[10*FDEC_STRIDE-1] - src[ 4*FDEC_STRIDE-1] )
      + 2 * ( src[ 9*FDEC_STRIDE-1] - src[ 5*FDEC_STRIDE-1] )
      + 1 * ( src[ 8*FDEC_STRIDE-1] - src[ 6*FDEC_STRIDE-1] );
    a = 16 * ( src[15*FDEC_STRIDE -1] + src[15 - FDEC_STRIDE] );
    b = ( 5 * H + 32 ) >> 6;
    c = ( 5 * V + 32 ) >> 6;
    i00 = a - b * 7 - c * 7 + 16;
    /* b*15 + c*15 can overflow: it's easier to just branch away in this rare case
     * than to try to consider it in the asm. */
    if( BIT_DEPTH > 8 && (i00 > 0x7fff || abs(b) > 1092 || abs(c) > 1092) )
        x264_predict_16x16_p_c( src );
    else
        x264_predict_16x16_p_core_sse2( src, i00, b, c );
}
#endif

#if !HIGH_BIT_DEPTH

#define PREDICT_8x8_P(name)\
static void x264_predict_8x8c_p_##name( uint8_t *src )\
{\
    int a, b, c;\
    int H = 0;\
    int V = 0;\
    int i00;\
    PREDICT_P_SUM(3,1)\
    PREDICT_P_SUM(3,2)\
    PREDICT_P_SUM(3,3)\
    PREDICT_P_SUM(3,4)\
    a = 16 * ( src[7*FDEC_STRIDE -1] + src[7 - FDEC_STRIDE] );\
    b = ( 17 * H + 16 ) >> 5;\
    c = ( 17 * V + 16 ) >> 5;\
    i00 = a -3*b -3*c + 16;\
    x264_predict_8x8c_p_core_##name( src, i00, b, c );\
}
#ifndef ARCH_X86_64
PREDICT_8x8_P( mmx2 )
#endif
PREDICT_8x8_P( sse2   )

#endif //!HIGH_BIT_DEPTH

#if HAVE_X86_INLINE_ASM

#define PREDICT_8x8C_P_CORE\
    V = 1 * ( src[4*FDEC_STRIDE -1] - src[ 2*FDEC_STRIDE -1] )\
      + 2 * ( src[5*FDEC_STRIDE -1] - src[ 1*FDEC_STRIDE -1] )\
      + 3 * ( src[6*FDEC_STRIDE -1] - src[ 0*FDEC_STRIDE -1] )\
      + 4 * ( src[7*FDEC_STRIDE -1] - src[-1*FDEC_STRIDE -1] );\
    H += -4 * src[-1*FDEC_STRIDE -1];\
    int a = 16 * ( src[7*FDEC_STRIDE -1] + src[7 - FDEC_STRIDE] );\
    int b = ( 17 * H + 16 ) >> 5;\
    int c = ( 17 * V + 16 ) >> 5;

#if HIGH_BIT_DEPTH
#define PREDICT_8x8_P2(cpu1, cpu2)\
static void x264_predict_8x8c_p_ ## cpu1( pixel *src )\
{\
    int H, V;\
    asm (\
        "movdqa           %1, %%xmm0 \n"\
        "pmaddwd          %2, %%xmm0 \n"\
        "movhlps      %%xmm0, %%xmm1 \n"\
        "paddd        %%xmm1, %%xmm0 \n"\
        "pshuflw $14, %%xmm0, %%xmm1 \n"\
        "paddd        %%xmm1, %%xmm0 \n"\
        "movd         %%xmm0, %0     \n"\
        :"=r"(H)\
        :"m"(src[-FDEC_STRIDE]), "m"(*pw_m32101234)\
    );\
    PREDICT_8x8C_P_CORE\
    x264_predict_8x8c_p_core_ ## cpu2( src, a, b, c );\
}

PREDICT_8x8_P2(sse2, sse2)
PREDICT_8x8_P2( avx,  avx)

#else  //!HIGH_BIT_DEPTH
#define PREDICT_8x8_P2(cpu1, cpu2)\
static void x264_predict_8x8c_p_ ## cpu1( pixel *src )\
{\
    int H, V;\
    asm (\
        "movq           %1, %%mm0 \n"\
        "pmaddubsw      %2, %%mm0 \n"\
        "pshufw $14, %%mm0, %%mm1 \n"\
        "paddw       %%mm1, %%mm0 \n"\
        "pshufw  $1, %%mm0, %%mm1 \n"\
        "paddw       %%mm1, %%mm0 \n"\
        "movd        %%mm0, %0    \n"\
        "movswl        %w0, %0    \n"\
        :"=r"(H)\
        :"m"(src[-FDEC_STRIDE]), "m"(*pb_m32101234)\
    );\
    PREDICT_8x8C_P_CORE\
    int i00 = a -3*b -3*c + 16;\
    x264_predict_8x8c_p_core_ ## cpu2( src, i00, b, c );\
}

PREDICT_8x8_P2(ssse3, sse2)
PREDICT_8x8_P2(  avx,  avx)
#endif
#endif

#if ARCH_X86_64 && !HIGH_BIT_DEPTH
static void x264_predict_8x8c_dc_left( uint8_t *src )
{
    int y;
    uint32_t s0 = 0, s1 = 0;
    uint64_t dc0, dc1;

    for( y = 0; y < 4; y++ )
    {
        s0 += src[y * FDEC_STRIDE     - 1];
        s1 += src[(y+4) * FDEC_STRIDE - 1];
    }
    dc0 = (( s0 + 2 ) >> 2) * 0x0101010101010101ULL;
    dc1 = (( s1 + 2 ) >> 2) * 0x0101010101010101ULL;

    for( y = 0; y < 4; y++ )
    {
        M64( src ) = dc0;
        src += FDEC_STRIDE;
    }
    for( y = 0; y < 4; y++ )
    {
        M64( src ) = dc1;
        src += FDEC_STRIDE;
    }

}
#endif // ARCH_X86_64 && !HIGH_BIT_DEPTH

/****************************************************************************
 * Exported functions:
 ****************************************************************************/
void x264_predict_16x16_init_mmx( int cpu, x264_predict_t pf[7] )
{
    if( !(cpu&X264_CPU_MMX2) )
        return;
    pf[I_PRED_16x16_DC]      = x264_predict_16x16_dc_mmx2;
    pf[I_PRED_16x16_DC_TOP]  = x264_predict_16x16_dc_top_mmx2;
    pf[I_PRED_16x16_DC_LEFT] = x264_predict_16x16_dc_left_mmx2;
    pf[I_PRED_16x16_V]       = x264_predict_16x16_v_mmx2;
    pf[I_PRED_16x16_H]       = x264_predict_16x16_h_mmx2;
#if HIGH_BIT_DEPTH
    if( !(cpu&X264_CPU_SSE2) )
        return;
    pf[I_PRED_16x16_DC]      = x264_predict_16x16_dc_sse2;
    pf[I_PRED_16x16_DC_TOP]  = x264_predict_16x16_dc_top_sse2;
    pf[I_PRED_16x16_DC_LEFT] = x264_predict_16x16_dc_left_sse2;
    pf[I_PRED_16x16_V]       = x264_predict_16x16_v_sse2;
    pf[I_PRED_16x16_H]       = x264_predict_16x16_h_sse2;
#if HAVE_X86_INLINE_ASM
    pf[I_PRED_16x16_P]       = x264_predict_16x16_p_sse2;
#endif
#else
#if !ARCH_X86_64
    pf[I_PRED_16x16_P]       = x264_predict_16x16_p_mmx2;
#endif
    if( !(cpu&X264_CPU_SSE2) )
        return;
    pf[I_PRED_16x16_DC]      = x264_predict_16x16_dc_sse2;
    pf[I_PRED_16x16_V]       = x264_predict_16x16_v_sse2;
    if( cpu&X264_CPU_SSE2_IS_SLOW )
        return;
    pf[I_PRED_16x16_DC_TOP]  = x264_predict_16x16_dc_top_sse2;
    pf[I_PRED_16x16_DC_LEFT] = x264_predict_16x16_dc_left_sse2;
    pf[I_PRED_16x16_P]       = x264_predict_16x16_p_sse2;
    if( !(cpu&X264_CPU_SSSE3) )
        return;
    pf[I_PRED_16x16_H]       = x264_predict_16x16_h_ssse3;
#if HAVE_X86_INLINE_ASM
    pf[I_PRED_16x16_P]       = x264_predict_16x16_p_ssse3;
#endif
    if( !(cpu&X264_CPU_AVX) )
        return;
    pf[I_PRED_16x16_P]       = x264_predict_16x16_p_avx;
#endif // HIGH_BIT_DEPTH
}

void x264_predict_8x8c_init_mmx( int cpu, x264_predict_t pf[7] )
{
    if( !(cpu&X264_CPU_MMX) )
        return;
#if HIGH_BIT_DEPTH
    pf[I_PRED_CHROMA_V]       = x264_predict_8x8c_v_mmx;
    if( !(cpu&X264_CPU_MMX2) )
        return;
    pf[I_PRED_CHROMA_DC]      = x264_predict_8x8c_dc_mmx2;
    if( !(cpu&X264_CPU_SSE2) )
        return;
    pf[I_PRED_CHROMA_V]       = x264_predict_8x8c_v_sse2;
    pf[I_PRED_CHROMA_DC]      = x264_predict_8x8c_dc_sse2;
    pf[I_PRED_CHROMA_DC_TOP]  = x264_predict_8x8c_dc_top_sse2;
    pf[I_PRED_CHROMA_H]       = x264_predict_8x8c_h_sse2;
#if HAVE_X86_INLINE_ASM
    pf[I_PRED_CHROMA_P]       = x264_predict_8x8c_p_sse2;
    if( !(cpu&X264_CPU_AVX) )
        return;
    pf[I_PRED_CHROMA_P]       = x264_predict_8x8c_p_avx;
#endif
#else
#if ARCH_X86_64
    pf[I_PRED_CHROMA_DC_LEFT] = x264_predict_8x8c_dc_left;
#endif
    pf[I_PRED_CHROMA_V]       = x264_predict_8x8c_v_mmx;
    if( !(cpu&X264_CPU_MMX2) )
        return;
    pf[I_PRED_CHROMA_DC_TOP]  = x264_predict_8x8c_dc_top_mmx2;
    pf[I_PRED_CHROMA_H]       = x264_predict_8x8c_h_mmx2;
#if !ARCH_X86_64
    pf[I_PRED_CHROMA_P]       = x264_predict_8x8c_p_mmx2;
#endif
    pf[I_PRED_CHROMA_DC]      = x264_predict_8x8c_dc_mmx2;
    if( !(cpu&X264_CPU_SSE2) )
        return;
    pf[I_PRED_CHROMA_P]       = x264_predict_8x8c_p_sse2;
    if( !(cpu&X264_CPU_SSSE3) )
        return;
    pf[I_PRED_CHROMA_H]       = x264_predict_8x8c_h_ssse3;
#if HAVE_X86_INLINE_ASM
    pf[I_PRED_CHROMA_P]       = x264_predict_8x8c_p_ssse3;
    if( !(cpu&X264_CPU_AVX) )
        return;
    pf[I_PRED_CHROMA_P]       = x264_predict_8x8c_p_avx;
#endif
#endif // HIGH_BIT_DEPTH
}

void x264_predict_8x16c_init_mmx( int cpu, x264_predict_t pf[7] )
{
    if( !(cpu&X264_CPU_MMX) )
        return;
#if HIGH_BIT_DEPTH
    if( !(cpu&X264_CPU_MMX2) )
        return;
    pf[I_PRED_CHROMA_DC]      = x264_predict_8x16c_dc_mmx2;
    if( !(cpu&X264_CPU_SSE2) )
        return;
    pf[I_PRED_CHROMA_V]       = x264_predict_8x16c_v_sse2;
    pf[I_PRED_CHROMA_DC_TOP]  = x264_predict_8x16c_dc_top_sse2;
    pf[I_PRED_CHROMA_DC]      = x264_predict_8x16c_dc_sse2;
    pf[I_PRED_CHROMA_H]       = x264_predict_8x16c_h_sse2;
#else
    pf[I_PRED_CHROMA_V]       = x264_predict_8x16c_v_mmx;
    if( !(cpu&X264_CPU_MMX2) )
        return;
    pf[I_PRED_CHROMA_DC_TOP]  = x264_predict_8x16c_dc_top_mmx2;
    pf[I_PRED_CHROMA_DC]      = x264_predict_8x16c_dc_mmx2;
    pf[I_PRED_CHROMA_H]       = x264_predict_8x16c_h_mmx2;
    if( !(cpu&X264_CPU_SSSE3) )
        return;
    pf[I_PRED_CHROMA_H]       = x264_predict_8x16c_h_ssse3;
#endif // HIGH_BIT_DEPTH
}

void x264_predict_8x8_init_mmx( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_8x8_filter )
{
    if( !(cpu&X264_CPU_MMX2) )
        return;
#if HIGH_BIT_DEPTH
    if( !(cpu&X264_CPU_SSE2) )
        return;
    pf[I_PRED_8x8_V]      = x264_predict_8x8_v_sse2;
    pf[I_PRED_8x8_H]      = x264_predict_8x8_h_sse2;
    pf[I_PRED_8x8_DC]     = x264_predict_8x8_dc_sse2;
    pf[I_PRED_8x8_DC_TOP] = x264_predict_8x8_dc_top_sse2;
    pf[I_PRED_8x8_DC_LEFT]= x264_predict_8x8_dc_left_sse2;
    pf[I_PRED_8x8_DDL]    = x264_predict_8x8_ddl_sse2;
    pf[I_PRED_8x8_DDR]    = x264_predict_8x8_ddr_sse2;
    pf[I_PRED_8x8_VR]     = x264_predict_8x8_vr_sse2;
    pf[I_PRED_8x8_HD]     = x264_predict_8x8_hd_sse2;
    pf[I_PRED_8x8_HU]     = x264_predict_8x8_hu_sse2;
    *predict_8x8_filter   = x264_predict_8x8_filter_sse2;
    if( !(cpu&X264_CPU_SSSE3) )
        return;
    pf[I_PRED_8x8_DDL]    = x264_predict_8x8_ddl_ssse3;
    pf[I_PRED_8x8_DDR]    = x264_predict_8x8_ddr_ssse3;
    pf[I_PRED_8x8_HD]     = x264_predict_8x8_hd_ssse3;
    pf[I_PRED_8x8_HU]     = x264_predict_8x8_hu_ssse3;
    pf[I_PRED_8x8_VR]     = x264_predict_8x8_vr_ssse3;
    *predict_8x8_filter   = x264_predict_8x8_filter_ssse3;
    if( cpu&X264_CPU_CACHELINE_64 )
    {
        pf[I_PRED_8x8_DDL]= x264_predict_8x8_ddl_ssse3_cache64;
        pf[I_PRED_8x8_DDR]= x264_predict_8x8_ddr_ssse3_cache64;
    }
    if( !(cpu&X264_CPU_AVX) )
        return;
    pf[I_PRED_8x8_HD]     = x264_predict_8x8_hd_avx;
    pf[I_PRED_8x8_HU]     = x264_predict_8x8_hu_avx;
    pf[I_PRED_8x8_VR]     = x264_predict_8x8_vr_avx;
    *predict_8x8_filter   = x264_predict_8x8_filter_avx;
#else
    pf[I_PRED_8x8_V]      = x264_predict_8x8_v_mmx2;
    pf[I_PRED_8x8_H]      = x264_predict_8x8_h_mmx2;
    pf[I_PRED_8x8_DC]     = x264_predict_8x8_dc_mmx2;
    pf[I_PRED_8x8_DC_TOP] = x264_predict_8x8_dc_top_mmx2;
    pf[I_PRED_8x8_DC_LEFT]= x264_predict_8x8_dc_left_mmx2;
    pf[I_PRED_8x8_HD]     = x264_predict_8x8_hd_mmx2;
    *predict_8x8_filter   = x264_predict_8x8_filter_mmx2;
#if ARCH_X86
    pf[I_PRED_8x8_DDL]  = x264_predict_8x8_ddl_mmx2;
    pf[I_PRED_8x8_DDR]  = x264_predict_8x8_ddr_mmx2;
    pf[I_PRED_8x8_VR]   = x264_predict_8x8_vr_mmx2;
    pf[I_PRED_8x8_HU]   = x264_predict_8x8_hu_mmx2;
#endif
    if( !(cpu&X264_CPU_SSE2) )
        return;
    pf[I_PRED_8x8_DDL]  = x264_predict_8x8_ddl_sse2;
    pf[I_PRED_8x8_VL]   = x264_predict_8x8_vl_sse2;
    pf[I_PRED_8x8_VR]   = x264_predict_8x8_vr_sse2;
    pf[I_PRED_8x8_DDR]  = x264_predict_8x8_ddr_sse2;
    pf[I_PRED_8x8_HD]   = x264_predict_8x8_hd_sse2;
    pf[I_PRED_8x8_HU]   = x264_predict_8x8_hu_sse2;
    if( !(cpu&X264_CPU_SSSE3) )
        return;
    pf[I_PRED_8x8_DDL]  = x264_predict_8x8_ddl_ssse3;
    pf[I_PRED_8x8_VR]   = x264_predict_8x8_vr_ssse3;
    pf[I_PRED_8x8_HU]   = x264_predict_8x8_hu_ssse3;
    *predict_8x8_filter = x264_predict_8x8_filter_ssse3;
    if( !(cpu&X264_CPU_AVX) )
        return;
    pf[I_PRED_8x8_DDL]  = x264_predict_8x8_ddl_avx;
    pf[I_PRED_8x8_DDR]  = x264_predict_8x8_ddr_avx;
    pf[I_PRED_8x8_VL]   = x264_predict_8x8_vl_avx;
    pf[I_PRED_8x8_VR]   = x264_predict_8x8_vr_avx;
    pf[I_PRED_8x8_HD]   = x264_predict_8x8_hd_avx;
#endif // HIGH_BIT_DEPTH
}

void x264_predict_4x4_init_mmx( int cpu, x264_predict_t pf[12] )
{
    if( !(cpu&X264_CPU_MMX2) )
        return;
    pf[I_PRED_4x4_DC]  = x264_predict_4x4_dc_mmx2;
    pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_mmx2;
    pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_mmx2;
    pf[I_PRED_4x4_VL]  = x264_predict_4x4_vl_mmx2;
    pf[I_PRED_4x4_HD]  = x264_predict_4x4_hd_mmx2;
    pf[I_PRED_4x4_HU]  = x264_predict_4x4_hu_mmx2;
#if HIGH_BIT_DEPTH
    if( !(cpu&X264_CPU_SSE2) )
        return;
    pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_sse2;
    pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_sse2;
    pf[I_PRED_4x4_HD]  = x264_predict_4x4_hd_sse2;
    pf[I_PRED_4x4_VL]  = x264_predict_4x4_vl_sse2;
    pf[I_PRED_4x4_VR]  = x264_predict_4x4_vr_sse2;
    if( !(cpu&X264_CPU_SSSE3) )
        return;
    pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_ssse3;
    pf[I_PRED_4x4_VR]  = x264_predict_4x4_vr_ssse3;
    pf[I_PRED_4x4_HD]  = x264_predict_4x4_hd_ssse3;
    if( !(cpu&X264_CPU_AVX) )
        return;
    pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_avx;
    pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_avx;
    pf[I_PRED_4x4_HD]  = x264_predict_4x4_hd_avx;
    pf[I_PRED_4x4_VL]  = x264_predict_4x4_vl_avx;
    pf[I_PRED_4x4_VR]  = x264_predict_4x4_vr_avx;
#else
    pf[I_PRED_4x4_VR]  = x264_predict_4x4_vr_mmx2;
    if( !(cpu&X264_CPU_SSSE3) )
        return;
    pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_ssse3;
    pf[I_PRED_4x4_VR]  = x264_predict_4x4_vr_ssse3;
    pf[I_PRED_4x4_HD]  = x264_predict_4x4_hd_ssse3;
    if( cpu&X264_CPU_CACHELINE_64 )
        pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_ssse3_cache64;
#endif // HIGH_BIT_DEPTH
}
x264-snapshot-20120103-2245-stable/common/x86/predict-a.asm
;*****************************************************************************
;* predict-a.asm: x86 intra prediction
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Holger Lubitz <holger@lubitz.org>
;*          Jason Garrett-Glaser <darkshikari@gmail.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA

pw_76543210:
pw_3210:     dw 0, 1, 2, 3, 4, 5, 6, 7
pw_43210123: dw -3, -2, -1, 0, 1, 2, 3, 4
pw_m3:       times 8 dw -3
pb_00s_ff:   times 8 db 0
pb_0s_ff:    times 7 db 0
             db 0xff
shuf_fixtr:  db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7
shuf_nop:    db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
shuf_hu:     db 7,6,5,4,3,2,1,0,0,0,0,0,0,0,0,0
shuf_vr:     db 2,4,6,8,9,10,11,12,13,14,15,0,1,3,5,7
pw_reverse:  db 14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1

SECTION .text

cextern pb_0
cextern pb_1
cextern pb_3
cextern pw_1
cextern pw_2
cextern pw_4
cextern pw_8
cextern pw_16
cextern pw_00ff
cextern pw_pixel_max

%macro STORE8x8 2-4
    add r0, 4*FDEC_STRIDEB
    mova        [r0 + -4*FDEC_STRIDEB], %1
    mova        [r0 + -3*FDEC_STRIDEB], %1
    mova        [r0 + -2*FDEC_STRIDEB], %1
    mova        [r0 + -1*FDEC_STRIDEB], %1
    mova        [r0 +  0*FDEC_STRIDEB], %2
    mova        [r0 +  1*FDEC_STRIDEB], %2
    mova        [r0 +  2*FDEC_STRIDEB], %2
    mova        [r0 +  3*FDEC_STRIDEB], %2
%endmacro

%macro STORE8x16 4
    add r0, 4*FDEC_STRIDEB
    mova        [r0 + -4*FDEC_STRIDEB], %1
    mova        [r0 + -3*FDEC_STRIDEB], %1
    mova        [r0 + -2*FDEC_STRIDEB], %1
    mova        [r0 + -1*FDEC_STRIDEB], %1
    add r0, 4*FDEC_STRIDEB
    mova        [r0 + -4*FDEC_STRIDEB], %2
    mova        [r0 + -3*FDEC_STRIDEB], %2
    mova        [r0 + -2*FDEC_STRIDEB], %2
    mova        [r0 + -1*FDEC_STRIDEB], %2
    add r0, 4*FDEC_STRIDEB
    mova        [r0 + -4*FDEC_STRIDEB], %3
    mova        [r0 + -3*FDEC_STRIDEB], %3
    mova        [r0 + -2*FDEC_STRIDEB], %3
    mova        [r0 + -1*FDEC_STRIDEB], %3
    mova        [r0 +  0*FDEC_STRIDEB], %4
    mova        [r0 +  1*FDEC_STRIDEB], %4
    mova        [r0 +  2*FDEC_STRIDEB], %4
    mova        [r0 +  3*FDEC_STRIDEB], %4
%endmacro

%macro STORE16x16 2-4
%ifidn %0, 4
    mov         r1d, 8
.loop:
    mova        [r0 + 0*FDEC_STRIDEB + 0], %1
    mova        [r0 + 1*FDEC_STRIDEB + 0], %1
    mova        [r0 + 0*FDEC_STRIDEB + 8], %2
    mova        [r0 + 1*FDEC_STRIDEB + 8], %2
    mova        [r0 + 0*FDEC_STRIDEB +16], %3
    mova        [r0 + 1*FDEC_STRIDEB +16], %3
    mova        [r0 + 0*FDEC_STRIDEB +24], %4
    mova        [r0 + 1*FDEC_STRIDEB +24], %4
    add         r0, 2*FDEC_STRIDEB
    dec         r1d
    jg          .loop
%else
    mov         r1d, 4
.loop:
    mova        [r0 + 0*FDEC_STRIDE], %1
    mova        [r0 + 1*FDEC_STRIDE], %1
    mova        [r0 + 2*FDEC_STRIDE], %1
    mova        [r0 + 3*FDEC_STRIDE], %1
    mova        [r0 + 0*FDEC_STRIDE + 8], %2
    mova        [r0 + 1*FDEC_STRIDE + 8], %2
    mova        [r0 + 2*FDEC_STRIDE + 8], %2
    mova        [r0 + 3*FDEC_STRIDE + 8], %2
    add         r0, 4*FDEC_STRIDE
    dec         r1d
    jg          .loop
%endif
%endmacro

%macro STORE16x16_SSE2 1-2
%ifidn %0,2
    mov r1d, 4
.loop:
    mova      [r0+0*FDEC_STRIDEB+ 0], %1
    mova      [r0+0*FDEC_STRIDEB+16], %2
    mova      [r0+1*FDEC_STRIDEB+ 0], %1
    mova      [r0+1*FDEC_STRIDEB+16], %2
    mova      [r0+2*FDEC_STRIDEB+ 0], %1
    mova      [r0+2*FDEC_STRIDEB+16], %2
    mova      [r0+3*FDEC_STRIDEB+ 0], %1
    mova      [r0+3*FDEC_STRIDEB+16], %2
    add       r0, 4*FDEC_STRIDEB
    dec       r1d
    jg        .loop
%else
    add r0, 4*FDEC_STRIDEB
    mova      [r0 + -4*FDEC_STRIDEB], %1
    mova      [r0 + -3*FDEC_STRIDEB], %1
    mova      [r0 + -2*FDEC_STRIDEB], %1
    mova      [r0 + -1*FDEC_STRIDEB], %1
    mova      [r0 +  0*FDEC_STRIDEB], %1
    mova      [r0 +  1*FDEC_STRIDEB], %1
    mova      [r0 +  2*FDEC_STRIDEB], %1
    mova      [r0 +  3*FDEC_STRIDEB], %1
    add r0, 8*FDEC_STRIDEB
    mova      [r0 + -4*FDEC_STRIDEB], %1
    mova      [r0 + -3*FDEC_STRIDEB], %1
    mova      [r0 + -2*FDEC_STRIDEB], %1
    mova      [r0 + -1*FDEC_STRIDEB], %1
    mova      [r0 +  0*FDEC_STRIDEB], %1
    mova      [r0 +  1*FDEC_STRIDEB], %1
    mova      [r0 +  2*FDEC_STRIDEB], %1
    mova      [r0 +  3*FDEC_STRIDEB], %1
%endif
%endmacro

; dest, left, right, src, tmp
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
%macro PRED8x8_LOWPASS 4-5
%ifdef HIGH_BIT_DEPTH
    paddw       %2, %3
    psrlw       %2, 1
    pavgw       %1, %4, %2
%else
    mova        %5, %2
    pavgb       %2, %3
    pxor        %3, %5
    pand        %3, [pb_1]
    psubusb     %2, %3
    pavgb       %1, %4, %2
%endif
%endmacro
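
; The 8-bit path uses the classic pavgb identity: pavgb rounds up, so
; first subtracting ((left ^ right) & 1) turns pavgb(left,right) into
; floor((left+right)/2), and a second pavgb against the source sample
; then yields exactly (left + 2*src + right + 2) >> 2, all without
; widening past 8 bits. The high-depth path simply adds in 16 bits,
; where 9/10-bit pixels cannot overflow, and finishes with pavgw.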

;-----------------------------------------------------------------------------
; void predict_4x4_ddl( pixel *src )
;-----------------------------------------------------------------------------
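; Diagonal-down-left: with t[0..7] the eight samples above the block,
; each predicted pixel is
;   pred[y][x] = ( t[x+y] + 2*t[x+y+1] + t[x+y+2] + 2 ) >> 2
; (t[7] replicated for the taps past the right edge, which is what the
; pshufhw/pxor shuffles below arrange), i.e. every row is the low-passed
; top row shifted one sample further along.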
%macro PREDICT_4x4_DDL 0
cglobal predict_4x4_ddl, 1,1
    movu    m1, [r0-FDEC_STRIDEB]
    PSLLPIX m2, m1, 1
    mova    m0, m1
%ifdef HIGH_BIT_DEPTH
    PSRLPIX m1, m1, 1
    pshufhw m1, m1, q2210
%else
    pxor    m1, m2
    PSRLPIX m1, m1, 1
    pxor    m1, m0
%endif
    PRED8x8_LOWPASS m0, m2, m1, m0, m3

%assign Y 0
%rep 4
    PSRLPIX m0, m0, 1
    movh   [r0+Y*FDEC_STRIDEB], m0
%assign Y (Y+1)
%endrep

    RET
%endmacro

%ifdef HIGH_BIT_DEPTH
INIT_XMM sse2
PREDICT_4x4_DDL
INIT_XMM avx
PREDICT_4x4_DDL
INIT_MMX mmx2
cglobal predict_4x4_ddl, 1,2
    movu    m1, [r0-FDEC_STRIDEB+4]
    PRED8x8_LOWPASS m0, m1, [r0-FDEC_STRIDEB+0], [r0-FDEC_STRIDEB+2]
    mova    m3, [r0-FDEC_STRIDEB+8]
    mova    [r0+0*FDEC_STRIDEB], m0
    pshufw  m4, m3, q3321
    PRED8x8_LOWPASS m2, m4, [r0-FDEC_STRIDEB+6], m3
    mova    [r0+3*FDEC_STRIDEB], m2
    pshufw  m1, m0, q0021
    punpckldq m1, m2
    mova    [r0+1*FDEC_STRIDEB], m1
    psllq   m0, 16
    PALIGNR m2, m0, 6, m0
    mova    [r0+2*FDEC_STRIDEB], m2
    RET
%else ; !HIGH_BIT_DEPTH
INIT_MMX mmx2
PREDICT_4x4_DDL
%endif

;-----------------------------------------------------------------------------
; void predict_4x4_vr( pixel *src )
;-----------------------------------------------------------------------------
%ifndef HIGH_BIT_DEPTH
INIT_MMX ssse3
cglobal predict_4x4_vr, 1,1
    movd    m1, [r0-1*FDEC_STRIDEB]        ; ........t3t2t1t0
    mova    m4, m1
    palignr m1, [r0-1*FDEC_STRIDEB-8], 7   ; ......t3t2t1t0lt
    pavgb   m4, m1
    palignr m1, [r0+0*FDEC_STRIDEB-8], 7   ; ....t3t2t1t0ltl0
    mova    m0, m1
    palignr m1, [r0+1*FDEC_STRIDEB-8], 7   ; ..t3t2t1t0ltl0l1
    mova    m2, m1
    palignr m1, [r0+2*FDEC_STRIDEB-8], 7   ; t3t2t1t0ltl0l1l2
    PRED8x8_LOWPASS m2, m0, m1, m2, m3
    pshufw  m0, m2, 0
    psrlq   m2, 16
    movd    [r0+0*FDEC_STRIDEB], m4
    palignr m4, m0, 7
    movd    [r0+1*FDEC_STRIDEB], m2
    psllq   m0, 8
    movd    [r0+2*FDEC_STRIDEB], m4
    palignr m2, m0, 7
    movd    [r0+3*FDEC_STRIDEB], m2
    RET
%endif ; !HIGH_BIT_DEPTH

;-----------------------------------------------------------------------------
; void predict_4x4_ddr( pixel *src )
;-----------------------------------------------------------------------------
%macro PREDICT_4x4 4
cglobal predict_4x4_ddr, 1,1
%ifdef HIGH_BIT_DEPTH
    movu      m2, [r0-1*FDEC_STRIDEB-8]
    pinsrw    m2, [r0+0*FDEC_STRIDEB-2], 2
    pinsrw    m2, [r0+1*FDEC_STRIDEB-2], 1
    pinsrw    m2, [r0+2*FDEC_STRIDEB-2], 0
    movhps    m3, [r0+3*FDEC_STRIDEB-8]
%else ; !HIGH_BIT_DEPTH
    movd      m0, [r0+2*FDEC_STRIDEB-4]
    movd      m1, [r0+0*FDEC_STRIDEB-4]
    punpcklbw m0, [r0+1*FDEC_STRIDEB-4]
    punpcklbw m1, [r0-1*FDEC_STRIDEB-4]
    punpckhwd m0, m1
    movd      m2, [r0-1*FDEC_STRIDEB]
%if cpuflag(ssse3)
    palignr   m2, m0, 4
%else
    psllq     m2, 32
    punpckhdq m0, m2
    SWAP       2, 0
%endif
    movd      m3, [r0+3*FDEC_STRIDEB-4]
    psllq     m3, 32
%endif ; !HIGH_BIT_DEPTH

    PSRLPIX   m1, m2, 1
    mova      m0, m2
    PALIGNR   m2, m3, 7*SIZEOF_PIXEL, m3
    PRED8x8_LOWPASS m0, m2, m1, m0, m3
%assign Y 3
    movh      [r0+Y*FDEC_STRIDEB], m0
%rep 3
%assign Y (Y-1)
    PSRLPIX   m0, m0, 1
    movh      [r0+Y*FDEC_STRIDEB], m0
%endrep
    RET

;-----------------------------------------------------------------------------
; void predict_4x4_vr( pixel *src )
;-----------------------------------------------------------------------------
cglobal predict_4x4_vr, 1,1
%ifdef HIGH_BIT_DEPTH
    movu      m1, [r0-1*FDEC_STRIDEB-8]
    pinsrw    m1, [r0+0*FDEC_STRIDEB-2], 2
    pinsrw    m1, [r0+1*FDEC_STRIDEB-2], 1
    pinsrw    m1, [r0+2*FDEC_STRIDEB-2], 0
%else ; !HIGH_BIT_DEPTH
    movd      m0, [r0+2*FDEC_STRIDEB-4]
    movd      m1, [r0+0*FDEC_STRIDEB-4]
    punpcklbw m0, [r0+1*FDEC_STRIDEB-4]
    punpcklbw m1, [r0-1*FDEC_STRIDEB-4]
    punpckhwd m0, m1
    movd      m1, [r0-1*FDEC_STRIDEB]
%if cpuflag(ssse3)
    palignr   m1, m0, 4
%else
    psllq     m1, 32
    punpckhdq m0, m1
    SWAP       1, 0
%endif
%endif ; !HIGH_BIT_DEPTH
    PSRLPIX   m2, m1, 1
    PSRLPIX   m0, m1, 2
    pavg%1    m4, m1, m2
    PSRLPIX   m4, m4, 3
    PRED8x8_LOWPASS m2, m0, m1, m2, m3
    PSLLPIX   m0, m2, 6
    PSRLPIX   m2, m2, 2
    movh      [r0+0*FDEC_STRIDEB], m4
    PALIGNR   m4, m0, 7*SIZEOF_PIXEL, m3
    movh      [r0+1*FDEC_STRIDEB], m2
    PSLLPIX   m0, m0, 1
    movh      [r0+2*FDEC_STRIDEB], m4
    PALIGNR   m2, m0, 7*SIZEOF_PIXEL, m0
    movh      [r0+3*FDEC_STRIDEB], m2
    RET

;-----------------------------------------------------------------------------
; void predict_4x4_hd( pixel *src )
;-----------------------------------------------------------------------------
cglobal predict_4x4_hd, 1,1
%ifdef HIGH_BIT_DEPTH
    movu      m1, [r0-1*FDEC_STRIDEB-8]
    PSLLPIX   m1, m1, 1
    pinsrw    m1, [r0+0*FDEC_STRIDEB-2], 3
    pinsrw    m1, [r0+1*FDEC_STRIDEB-2], 2
    pinsrw    m1, [r0+2*FDEC_STRIDEB-2], 1
    pinsrw    m1, [r0+3*FDEC_STRIDEB-2], 0
%else
    movd      m0, [r0-1*FDEC_STRIDEB-4] ; lt ..
    punpckldq m0, [r0-1*FDEC_STRIDEB]   ; t3 t2 t1 t0 lt .. .. ..
    PSLLPIX   m0, m0, 1                 ; t2 t1 t0 lt .. .. .. ..
    movd      m1, [r0+3*FDEC_STRIDEB-4] ; l3
    punpcklbw m1, [r0+2*FDEC_STRIDEB-4] ; l2 l3
    movd      m2, [r0+1*FDEC_STRIDEB-4] ; l1
    punpcklbw m2, [r0+0*FDEC_STRIDEB-4] ; l0 l1
    punpckh%3 m1, m2                    ; l0 l1 l2 l3
    punpckh%4 m1, m0                    ; t2 t1 t0 lt l0 l1 l2 l3
%endif
    PSRLPIX   m2, m1, 1                 ; .. t2 t1 t0 lt l0 l1 l2
    PSRLPIX   m0, m1, 2                 ; .. .. t2 t1 t0 lt l0 l1
    pavg%1    m5, m1, m2
    PRED8x8_LOWPASS m3, m1, m0, m2, m4
    punpckl%2 m5, m3
    PSRLPIX   m3, m3, 4
    PALIGNR   m3, m5, 6*SIZEOF_PIXEL, m4
%assign Y 3
    movh      [r0+Y*FDEC_STRIDEB], m5
%rep 2
%assign Y (Y-1)
    PSRLPIX   m5, m5, 2
    movh      [r0+Y*FDEC_STRIDEB], m5
%endrep
    movh      [r0+0*FDEC_STRIDEB], m3
    RET
%endmacro ; PREDICT_4x4

;-----------------------------------------------------------------------------
; void predict_4x4_ddr( pixel *src )
;-----------------------------------------------------------------------------
%ifdef HIGH_BIT_DEPTH
INIT_MMX mmx2
cglobal predict_4x4_ddr, 1,1
    mova      m0, [r0+1*FDEC_STRIDEB-8]
    punpckhwd m0, [r0+0*FDEC_STRIDEB-8]
    mova      m3, [r0+3*FDEC_STRIDEB-8]
    punpckhwd m3, [r0+2*FDEC_STRIDEB-8]
    punpckhdq m3, m0

    pshufw  m0, m3, q3321
    pinsrw  m0, [r0-1*FDEC_STRIDEB-2], 3
    pshufw  m1, m0, q3321
    PRED8x8_LOWPASS m0, m1, m3, m0
    movq    [r0+3*FDEC_STRIDEB], m0

    movq    m2, [r0-1*FDEC_STRIDEB-0]
    pshufw  m4, m2, q2100
    pinsrw  m4, [r0-1*FDEC_STRIDEB-2], 0
    movq    m1, m4
    PALIGNR m4, m3, 6, m3
    PRED8x8_LOWPASS m1, m4, m2, m1
    movq    [r0+0*FDEC_STRIDEB], m1

    pshufw  m2, m0, q3321
    punpckldq m2, m1
    psllq   m0, 16
    PALIGNR m1, m0, 6, m0
    movq    [r0+1*FDEC_STRIDEB], m1
    movq    [r0+2*FDEC_STRIDEB], m2
    movd    [r0+3*FDEC_STRIDEB+4], m1
    RET

;-----------------------------------------------------------------------------
; void predict_4x4_hd( pixel *src )
;-----------------------------------------------------------------------------
cglobal predict_4x4_hd, 1,1
    mova      m0, [r0+1*FDEC_STRIDEB-8]
    punpckhwd m0, [r0+0*FDEC_STRIDEB-8]
    mova      m1, [r0+3*FDEC_STRIDEB-8]
    punpckhwd m1, [r0+2*FDEC_STRIDEB-8]
    punpckhdq m1, m0
    mova      m0, m1

    movu      m3, [r0-1*FDEC_STRIDEB-2]
    pshufw    m4, m1, q0032
    mova      m7, m3
    punpckldq m4, m3
    PALIGNR   m3, m1, 2, m2
    PRED8x8_LOWPASS m2, m4, m1, m3

    pavgw     m0, m3
    punpcklwd m5, m0, m2
    punpckhwd m4, m0, m2
    mova      [r0+3*FDEC_STRIDEB], m5
    mova      [r0+1*FDEC_STRIDEB], m4
    psrlq     m5, 32
    punpckldq m5, m4
    mova      [r0+2*FDEC_STRIDEB], m5

    pshufw    m4, m7, q2100
    mova      m6, [r0-1*FDEC_STRIDEB+0]
    pinsrw    m4, [r0+0*FDEC_STRIDEB-2], 0
    PRED8x8_LOWPASS m3, m4, m6, m7
    PALIGNR   m3, m0, 6, m0
    mova      [r0+0*FDEC_STRIDEB], m3
    RET

INIT_XMM sse2
PREDICT_4x4 w, wd, dq, qdq
INIT_XMM ssse3
PREDICT_4x4 w, wd, dq, qdq
INIT_XMM avx
PREDICT_4x4 w, wd, dq, qdq
%else ; !HIGH_BIT_DEPTH
INIT_MMX mmx2
PREDICT_4x4 b, bw, wd, dq
INIT_MMX ssse3
%define predict_4x4_vr_ssse3 predict_4x4_vr_ssse3_cache64
PREDICT_4x4 b, bw, wd, dq
%endif

;-----------------------------------------------------------------------------
; void predict_4x4_hu( pixel *src )
;-----------------------------------------------------------------------------
%ifdef HIGH_BIT_DEPTH
INIT_MMX
cglobal predict_4x4_hu_mmx2, 1,1
    movq      m0, [r0+0*FDEC_STRIDEB-8]
    punpckhwd m0, [r0+1*FDEC_STRIDEB-8]
    movq      m1, [r0+2*FDEC_STRIDEB-8]
    punpckhwd m1, [r0+3*FDEC_STRIDEB-8]
    punpckhdq m0, m1
    pshufw    m1, m1, q3333
    movq      [r0+3*FDEC_STRIDEB], m1
    pshufw    m3, m0, q3321
    pshufw    m4, m0, q3332
    pavgw     m2, m0, m3
    PRED8x8_LOWPASS m3, m0, m4, m3
    punpcklwd m4, m2, m3
    mova      [r0+0*FDEC_STRIDEB], m4
    psrlq     m2, 16
    psrlq     m3, 16
    punpcklwd m2, m3
    mova      [r0+1*FDEC_STRIDEB], m2
    punpckhdq m2, m1
    mova      [r0+2*FDEC_STRIDEB], m2
    RET

%else ; !HIGH_BIT_DEPTH
INIT_MMX
cglobal predict_4x4_hu_mmx2, 1,1
    movd      m1, [r0+0*FDEC_STRIDEB-4]
    punpcklbw m1, [r0+1*FDEC_STRIDEB-4]
    movd      m0, [r0+2*FDEC_STRIDEB-4]
    punpcklbw m0, [r0+3*FDEC_STRIDEB-4]
    punpckhwd m1, m0
    movq      m0, m1
    punpckhbw m1, m1
    pshufw    m1, m1, q3333
    punpckhdq m0, m1
    movq      m2, m0
    movq      m3, m0
    movq      m5, m0
    psrlq     m3, 8
    psrlq     m2, 16
    pavgb     m5, m3
    PRED8x8_LOWPASS m3, m0, m2, m3, m4
    movd      [r0+3*FDEC_STRIDEB], m1
    punpcklbw m5, m3
    movd      [r0+0*FDEC_STRIDEB], m5
    psrlq     m5, 16
    movd      [r0+1*FDEC_STRIDEB], m5
    psrlq     m5, 16
    movd      [r0+2*FDEC_STRIDEB], m5
    RET
%endif ; HIGH_BIT_DEPTH

;-----------------------------------------------------------------------------
; void predict_4x4_vl( pixel *src )
;-----------------------------------------------------------------------------
%macro PREDICT_4x4_V1 1
cglobal predict_4x4_vl, 1,1
    movu        m1, [r0-FDEC_STRIDEB]
    PSRLPIX     m3, m1, 1
    PSRLPIX     m2, m1, 2
    pavg%1      m4, m3, m1
    PRED8x8_LOWPASS m0, m1, m2, m3, m5

    movh        [r0+0*FDEC_STRIDEB], m4
    movh        [r0+1*FDEC_STRIDEB], m0
    PSRLPIX     m4, m4, 1
    PSRLPIX     m0, m0, 1
    movh        [r0+2*FDEC_STRIDEB], m4
    movh        [r0+3*FDEC_STRIDEB], m0
    RET
%endmacro

%ifdef HIGH_BIT_DEPTH
INIT_XMM sse2
PREDICT_4x4_V1 w
INIT_XMM avx
PREDICT_4x4_V1 w

INIT_MMX mmx2
cglobal predict_4x4_vl, 1,4
    mova    m1, [r0-FDEC_STRIDEB+0]
    mova    m2, [r0-FDEC_STRIDEB+8]
    mova    m0, m2
    PALIGNR m2, m1, 4, m4
    PALIGNR m0, m1, 2, m4
    mova    m3, m0
    pavgw   m3, m1
    mova    [r0+0*FDEC_STRIDEB], m3
    psrlq   m3, 16
    mova    [r0+2*FDEC_STRIDEB], m3
    PRED8x8_LOWPASS m0, m1, m2, m0
    mova    [r0+1*FDEC_STRIDEB], m0
    psrlq   m0, 16
    mova    [r0+3*FDEC_STRIDEB], m0

    movzx   r1d, word [r0-FDEC_STRIDEB+ 8]
    movzx   r2d, word [r0-FDEC_STRIDEB+10]
    movzx   r3d, word [r0-FDEC_STRIDEB+12]
    lea     r1d, [r1+r2+1]
    add     r3d, r2d
    lea     r3d, [r3+r1+1]
    shr     r1d, 1
    shr     r3d, 2
    mov     [r0+2*FDEC_STRIDEB+6], r1w
    mov     [r0+3*FDEC_STRIDEB+6], r3w
    RET
%else ; !HIGH_BIT_DEPTH
INIT_MMX mmx2
PREDICT_4x4_V1 b
%endif

;-----------------------------------------------------------------------------
; void predict_4x4_dc( pixel *src )
;-----------------------------------------------------------------------------
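; Rough C sketch of the rule implemented below (not from this file): with
; both neighbour groups present,
;   dc = ( t0+t1+t2+t3 + l0+l1+l2+l3 + 4 ) >> 3;
; and all 16 pixels of the block are set to dc.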
INIT_MMX mmx2
%ifdef HIGH_BIT_DEPTH
cglobal predict_4x4_dc, 1,1
    mova   m2, [r0+0*FDEC_STRIDEB-4*SIZEOF_PIXEL]
    paddw  m2, [r0+1*FDEC_STRIDEB-4*SIZEOF_PIXEL]
    paddw  m2, [r0+2*FDEC_STRIDEB-4*SIZEOF_PIXEL]
    paddw  m2, [r0+3*FDEC_STRIDEB-4*SIZEOF_PIXEL]
    psrlq  m2, 48
    mova   m0, [r0-FDEC_STRIDEB]
    HADDW  m0, m1
    paddw  m0, [pw_4]
    paddw  m0, m2
    psrlw  m0, 3
    SPLATW m0, m0
    mova   [r0+0*FDEC_STRIDEB], m0
    mova   [r0+1*FDEC_STRIDEB], m0
    mova   [r0+2*FDEC_STRIDEB], m0
    mova   [r0+3*FDEC_STRIDEB], m0
    RET

%else ; !HIGH_BIT_DEPTH
cglobal predict_4x4_dc, 1,4
    pxor   mm7, mm7
    movd   mm0, [r0-FDEC_STRIDEB]
    psadbw mm0, mm7
    movd   r3d, mm0
    movzx  r1d, byte [r0-1]
%assign Y 1
%rep 3
    movzx  r2d, byte [r0+FDEC_STRIDEB*Y-1]
    add    r1d, r2d
%assign Y Y+1
%endrep
    lea    r1d, [r1+r3+4]
    shr    r1d, 3
    imul   r1d, 0x01010101
    mov   [r0+FDEC_STRIDEB*0], r1d
    mov   [r0+FDEC_STRIDEB*1], r1d
    mov   [r0+FDEC_STRIDEB*2], r1d
    mov   [r0+FDEC_STRIDEB*3], r1d
    RET
%endif ; HIGH_BIT_DEPTH

%macro PREDICT_FILTER 4
;-----------------------------------------------------------------------------
; void predict_8x8_filter( pixel *src, pixel edge[36], int i_neighbor, int i_filters )
;-----------------------------------------------------------------------------
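; edge[] layout as produced below, inferred from this macro's stores (indices
; in units of SIZEOF_PIXEL): [6..14] filtered left column, bottom to top;
; [15] filtered top-left; [16..23] filtered top row; [24..31] filtered
; top-right; [32] last top-right pixel, replicated. i_neighbor says which
; neighbours exist; i_filters selects which groups (left/top/top-right) to fill.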
cglobal predict_8x8_filter, 4,6,6
    add          r0, 0x58*SIZEOF_PIXEL
%define src r0-0x58*SIZEOF_PIXEL
%ifndef ARCH_X86_64
    mov          r4, r1
%define t1 r4
%define t4 r1
%else
%define t1 r1
%define t4 r4
%endif
    test       r3b, 1
    je .check_top
    mov        t4d, r2d
    and        t4d, 8
    neg         t4
    mova        m0, [src+0*FDEC_STRIDEB-8*SIZEOF_PIXEL]
    punpckh%1%2 m0, [src+0*FDEC_STRIDEB-8*SIZEOF_PIXEL+t4*(FDEC_STRIDEB/8)]
    mova        m1, [src+2*FDEC_STRIDEB-8*SIZEOF_PIXEL]
    punpckh%1%2 m1, [src+1*FDEC_STRIDEB-8*SIZEOF_PIXEL]
    punpckh%2%3 m1, m0
    mova        m2, [src+4*FDEC_STRIDEB-8*SIZEOF_PIXEL]
    punpckh%1%2 m2, [src+3*FDEC_STRIDEB-8*SIZEOF_PIXEL]
    mova        m3, [src+6*FDEC_STRIDEB-8*SIZEOF_PIXEL]
    punpckh%1%2 m3, [src+5*FDEC_STRIDEB-8*SIZEOF_PIXEL]
    punpckh%2%3 m3, m2
    punpckh%3%4 m3, m1
    mova        m0, [src+7*FDEC_STRIDEB-8*SIZEOF_PIXEL]
    mova        m1, [src-1*FDEC_STRIDEB]
    PALIGNR     m4, m3, m0, 7*SIZEOF_PIXEL, m0
    PALIGNR     m1, m1, m3, 1*SIZEOF_PIXEL, m2
    PRED8x8_LOWPASS m3, m1, m4, m3, m5
    mova        [t1+8*SIZEOF_PIXEL], m3
    movzx      t4d, pixel [src+7*FDEC_STRIDEB-1*SIZEOF_PIXEL]
    movzx      r5d, pixel [src+6*FDEC_STRIDEB-1*SIZEOF_PIXEL]
    lea        t4d, [t4*3+2]
    add        t4d, r5d
    shr        t4d, 2
    mov         [t1+7*SIZEOF_PIXEL], t4%1
    mov         [t1+6*SIZEOF_PIXEL], t4%1
    test       r3b, 2
    je .done
.check_top:
%if SIZEOF_PIXEL==1 && cpuflag(ssse3)
INIT_XMM cpuname
    movu        m3, [src-1*FDEC_STRIDEB]
    movhps      m0, [src-1*FDEC_STRIDEB-8]
    test       r2b, 8
    je .fix_lt_2
.do_top:
    and        r2d, 4
%ifdef PIC
    lea         r3, [shuf_fixtr]
    pshufb      m3, [r3+r2*4]
%else
    pshufb      m3, [shuf_fixtr+r2*4] ; neighbor&MB_TOPRIGHT ? shuf_nop : shuf_fixtr
%endif
    psrldq      m1, m3, 15
    PALIGNR     m2, m3, m0, 15, m0
    PALIGNR     m1, m3, 1, m5
    PRED8x8_LOWPASS m0, m2, m1, m3, m5
    mova        [t1+16*SIZEOF_PIXEL], m0
    psrldq      m0, 15
    movd        [t1+32*SIZEOF_PIXEL], m0
.done:
    REP_RET
.fix_lt_2:
    pslldq      m0, m3, 15
    jmp .do_top

%else
    mova        m0, [src-1*FDEC_STRIDEB-8*SIZEOF_PIXEL]
    mova        m3, [src-1*FDEC_STRIDEB]
    mova        m1, [src-1*FDEC_STRIDEB+8*SIZEOF_PIXEL]
    test       r2b, 8
    je .fix_lt_2
    test       r2b, 4
    je .fix_tr_1
.do_top:
    PALIGNR     m2, m3, m0, 7*SIZEOF_PIXEL, m0
    PALIGNR     m0, m1, m3, 1*SIZEOF_PIXEL, m5
    PRED8x8_LOWPASS m4, m2, m0, m3, m5
    mova        [t1+16*SIZEOF_PIXEL], m4
    test       r3b, 4
    je .done
    PSRLPIX     m5, m1, 7
    PALIGNR     m2, m1, m3, 7*SIZEOF_PIXEL, m3
    PALIGNR     m5, m1, 1*SIZEOF_PIXEL, m4
    PRED8x8_LOWPASS m0, m2, m5, m1, m4
    mova        [t1+24*SIZEOF_PIXEL], m0
    PSRLPIX     m0, m0, 7
    movd        [t1+32*SIZEOF_PIXEL], m0
.done:
    REP_RET
.fix_lt_2:
    PSLLPIX     m0, m3, 7
    test       r2b, 4
    jne .do_top
.fix_tr_1:
    punpckh%1%2 m1, m3, m3
    pshuf%2     m1, m1, q3333
    jmp .do_top
%endif
%endmacro

%ifdef HIGH_BIT_DEPTH
INIT_XMM sse2
PREDICT_FILTER w, d, q, dq
INIT_XMM ssse3
PREDICT_FILTER w, d, q, dq
INIT_XMM avx
PREDICT_FILTER w, d, q, dq
%else
INIT_MMX mmx2
PREDICT_FILTER b, w, d, q
INIT_MMX ssse3
PREDICT_FILTER b, w, d, q
%endif

;-----------------------------------------------------------------------------
; void predict_8x8_v( pixel *src, pixel *edge )
;-----------------------------------------------------------------------------
%macro PREDICT_8x8_V 0
cglobal predict_8x8_v, 2,2
    mova        m0, [r1+16*SIZEOF_PIXEL]
    STORE8x8    m0, m0
    RET
%endmacro

%ifdef HIGH_BIT_DEPTH
INIT_XMM sse2
PREDICT_8x8_V
%else
INIT_MMX mmx2
PREDICT_8x8_V
%endif

;-----------------------------------------------------------------------------
; void predict_8x8_h( pixel *src, pixel edge[36] )
;-----------------------------------------------------------------------------
%macro PREDICT_8x8_H 2
cglobal predict_8x8_h, 2,2
    movu      m1, [r1+7*SIZEOF_PIXEL]
    add       r0, 4*FDEC_STRIDEB
    punpckl%1 m2, m1, m1
    punpckh%1 m1, m1
%assign Y 0
%rep 8
%assign i 1+Y/4
    SPLAT%2 m0, m %+ i, (3-Y)&3
    mova [r0+(Y-4)*FDEC_STRIDEB], m0
%assign Y Y+1
%endrep
    RET
%endmacro

%ifdef HIGH_BIT_DEPTH
INIT_XMM sse2
PREDICT_8x8_H wd, D
%else
INIT_MMX mmx2
PREDICT_8x8_H bw, W
%endif

;-----------------------------------------------------------------------------
; void predict_8x8_dc( pixel *src, pixel *edge );
;-----------------------------------------------------------------------------
%ifdef HIGH_BIT_DEPTH
INIT_XMM sse2
cglobal predict_8x8_dc, 2,2
    movu        m0, [r1+14]
    paddw       m0, [r1+32]
    HADDW       m0, m1
    paddw       m0, [pw_8]
    psrlw       m0, 4
    SPLATW      m0, m0
    STORE8x8    m0, m0
    REP_RET

%else ; !HIGH_BIT_DEPTH
INIT_MMX mmx2
cglobal predict_8x8_dc, 2,2
    pxor        mm0, mm0
    pxor        mm1, mm1
    psadbw      mm0, [r1+7]
    psadbw      mm1, [r1+16]
    paddw       mm0, [pw_8]
    paddw       mm0, mm1
    psrlw       mm0, 4
    pshufw      mm0, mm0, 0
    packuswb    mm0, mm0
    STORE8x8    mm0, mm0
    RET
%endif ; HIGH_BIT_DEPTH

;-----------------------------------------------------------------------------
; void predict_8x8_dc_top ( pixel *src, pixel *edge );
; void predict_8x8_dc_left( pixel *src, pixel *edge );
;-----------------------------------------------------------------------------
%ifdef HIGH_BIT_DEPTH
%macro PREDICT_8x8_DC 3
cglobal %1, 2,2
    %3          m0, [r1+%2]
    HADDW       m0, m1
    paddw       m0, [pw_4]
    psrlw       m0, 3
    SPLATW      m0, m0
    STORE8x8    m0, m0
    RET
%endmacro
INIT_XMM sse2
PREDICT_8x8_DC predict_8x8_dc_top , 32, mova
PREDICT_8x8_DC predict_8x8_dc_left, 14, movu

%else ; !HIGH_BIT_DEPTH
%macro PREDICT_8x8_DC 2
cglobal %1, 2,2
    pxor        mm0, mm0
    psadbw      mm0, [r1+%2]
    paddw       mm0, [pw_4]
    psrlw       mm0, 3
    pshufw      mm0, mm0, 0
    packuswb    mm0, mm0
    STORE8x8    mm0, mm0
    RET
%endmacro
INIT_MMX
PREDICT_8x8_DC predict_8x8_dc_top_mmx2, 16
PREDICT_8x8_DC predict_8x8_dc_left_mmx2, 7
%endif ; HIGH_BIT_DEPTH

; SSE2 is faster than MMX even on AMD for 8-bit, so there's no sense in
; spending exe size on the 8-bit MMX functions below if we know SSE2 is
; available (as it always is on x86-64).
%macro PREDICT_8x8_DDLR 0
;-----------------------------------------------------------------------------
; void predict_8x8_ddl( pixel *src, pixel *edge )
;-----------------------------------------------------------------------------
cglobal predict_8x8_ddl, 2,2,7
    mova        m0, [r1+16*SIZEOF_PIXEL]
    mova        m1, [r1+24*SIZEOF_PIXEL]
%if cpuflag(cache64)
    movd        m5, [r1+32*SIZEOF_PIXEL]
    palignr     m3, m1, m0, 1*SIZEOF_PIXEL
    palignr     m5, m5, m1, 1*SIZEOF_PIXEL
    palignr     m4, m1, m0, 7*SIZEOF_PIXEL
%else
    movu        m3, [r1+17*SIZEOF_PIXEL]
    movu        m4, [r1+23*SIZEOF_PIXEL]
    movu        m5, [r1+25*SIZEOF_PIXEL]
%endif
    PSLLPIX     m2, m0, 1
    add         r0, FDEC_STRIDEB*4
    PRED8x8_LOWPASS m0, m2, m3, m0, m6
    PRED8x8_LOWPASS m1, m4, m5, m1, m6
    mova        [r0+3*FDEC_STRIDEB], m1
%assign Y 2
%rep 6
    PALIGNR     m1, m0, 7*SIZEOF_PIXEL, m2
    PSLLPIX     m0, m0, 1
    mova        [r0+Y*FDEC_STRIDEB], m1
%assign Y (Y-1)
%endrep
    PALIGNR     m1, m0, 7*SIZEOF_PIXEL, m0
    mova        [r0+Y*FDEC_STRIDEB], m1
    RET

;-----------------------------------------------------------------------------
; void predict_8x8_ddr( pixel *src, pixel *edge )
;-----------------------------------------------------------------------------
cglobal predict_8x8_ddr, 2,2,7
    add         r0, FDEC_STRIDEB*4
    mova        m0, [r1+ 8*SIZEOF_PIXEL]
    mova        m1, [r1+16*SIZEOF_PIXEL]
    ; edge[] is 32-byte aligned, so some of the unaligned loads are known not to be cache-split
    movu        m2, [r1+ 7*SIZEOF_PIXEL]
    movu        m5, [r1+17*SIZEOF_PIXEL]
%if cpuflag(cache64)
    palignr     m3, m1, m0, 1*SIZEOF_PIXEL
    palignr     m4, m1, m0, 7*SIZEOF_PIXEL
%else
    movu        m3, [r1+ 9*SIZEOF_PIXEL]
    movu        m4, [r1+15*SIZEOF_PIXEL]
%endif
    PRED8x8_LOWPASS m0, m2, m3, m0, m6
    PRED8x8_LOWPASS m1, m4, m5, m1, m6
    mova        [r0+3*FDEC_STRIDEB], m0
%assign Y -4
%rep 6
    PALIGNR     m1, m0, 7*SIZEOF_PIXEL, m2
    PSLLPIX     m0, m0, 1
    mova        [r0+Y*FDEC_STRIDEB], m1
%assign Y (Y+1)
%endrep
    PALIGNR     m1, m0, 7*SIZEOF_PIXEL, m0
    mova        [r0+Y*FDEC_STRIDEB], m1
    RET
%endmacro ; PREDICT_8x8_DDLR

%ifdef HIGH_BIT_DEPTH
INIT_XMM sse2
PREDICT_8x8_DDLR
INIT_XMM ssse3
PREDICT_8x8_DDLR
INIT_XMM ssse3, cache64
PREDICT_8x8_DDLR
%elifndef ARCH_X86_64
INIT_MMX mmx2
PREDICT_8x8_DDLR
%endif

;-----------------------------------------------------------------------------
; void predict_8x8_hu( pixel *src, pixel *edge )
;-----------------------------------------------------------------------------
%macro PREDICT_8x8_HU 2
cglobal predict_8x8_hu, 2,2,8
    add       r0, 4*FDEC_STRIDEB
%ifdef HIGH_BIT_DEPTH
%if cpuflag(ssse3)
    movu      m5, [r1+7*SIZEOF_PIXEL]
    pshufb    m5, [pw_reverse]
%else
    movq      m6, [r1+7*SIZEOF_PIXEL]
    movq      m5, [r1+11*SIZEOF_PIXEL]
    pshuflw   m6, m6, q0123
    pshuflw   m5, m5, q0123
    movlhps   m5, m6
%endif ; cpuflag
    psrldq    m2, m5, 2
    pshufd    m3, m5, q0321
    pshufhw   m2, m2, q2210
    pshufhw   m3, m3, q1110
    pavgw     m4, m5, m2
%else ; !HIGH_BIT_DEPTH
    movu      m1, [r1+7*SIZEOF_PIXEL] ; l0 l1 l2 l3 l4 l5 l6 l7
    pshufw    m0, m1, q0123           ; l6 l7 l4 l5 l2 l3 l0 l1
    psllq     m1, 56                  ; l7 .. .. .. .. .. .. ..
    mova      m2, m0
    psllw     m0, 8
    psrlw     m2, 8
    por       m2, m0
    mova      m3, m2
    mova      m4, m2
    mova      m5, m2                  ; l7 l6 l5 l4 l3 l2 l1 l0
    psrlq     m3, 16
    psrlq     m2, 8
    por       m2, m1                  ; l7 l7 l6 l5 l4 l3 l2 l1
    punpckhbw m1, m1
    por       m3, m1                  ; l7 l7 l7 l6 l5 l4 l3 l2
    pavgb     m4, m2
%endif ; !HIGH_BIT_DEPTH
    PRED8x8_LOWPASS m2, m3, m5, m2, m6
    punpckh%2 m0, m4, m2              ; p8 p7 p6 p5
    punpckl%2 m4, m2                  ; p4 p3 p2 p1
    PALIGNR   m5, m0, m4, 2*SIZEOF_PIXEL, m3
    pshuf%1   m1, m0, q3321
    PALIGNR   m6, m0, m4, 4*SIZEOF_PIXEL, m3
    pshuf%1   m2, m0, q3332
    PALIGNR   m7, m0, m4, 6*SIZEOF_PIXEL, m3
    pshuf%1   m3, m0, q3333
    mova      [r0-4*FDEC_STRIDEB], m4
    mova      [r0-3*FDEC_STRIDEB], m5
    mova      [r0-2*FDEC_STRIDEB], m6
    mova      [r0-1*FDEC_STRIDEB], m7
    mova      [r0+0*FDEC_STRIDEB], m0
    mova      [r0+1*FDEC_STRIDEB], m1
    mova      [r0+2*FDEC_STRIDEB], m2
    mova      [r0+3*FDEC_STRIDEB], m3
    RET
%endmacro

%ifdef HIGH_BIT_DEPTH
INIT_XMM sse2
PREDICT_8x8_HU d, wd
INIT_XMM ssse3
PREDICT_8x8_HU d, wd
INIT_XMM avx
PREDICT_8x8_HU d, wd
%elifndef ARCH_X86_64
INIT_MMX mmx2
PREDICT_8x8_HU w, bw
%endif

;-----------------------------------------------------------------------------
; void predict_8x8_vr( pixel *src, pixel *edge )
;-----------------------------------------------------------------------------
%macro PREDICT_8x8_VR 1
cglobal predict_8x8_vr, 2,3
    mova        m2, [r1+16*SIZEOF_PIXEL]
%ifidn cpuname, ssse3
    mova        m0, [r1+8*SIZEOF_PIXEL]
    palignr     m3, m2, m0, 7*SIZEOF_PIXEL
    palignr     m1, m2, m0, 6*SIZEOF_PIXEL
%else
    movu        m3, [r1+15*SIZEOF_PIXEL]
    movu        m1, [r1+14*SIZEOF_PIXEL]
%endif
    pavg%1      m4, m3, m2
    add         r0, FDEC_STRIDEB*4
    PRED8x8_LOWPASS m3, m1, m2, m3, m5
    mova        [r0-4*FDEC_STRIDEB], m4
    mova        [r0-3*FDEC_STRIDEB], m3
    mova        m1, [r1+8*SIZEOF_PIXEL]
    PSLLPIX     m0, m1, 1
    PSLLPIX     m2, m1, 2
    PRED8x8_LOWPASS m0, m1, m2, m0, m6

%assign Y -2
%rep 5
    PALIGNR     m4, m0, 7*SIZEOF_PIXEL, m5
    mova        [r0+Y*FDEC_STRIDEB], m4
    PSLLPIX     m0, m0, 1
    SWAP 3, 4
%assign Y (Y+1)
%endrep
    PALIGNR     m4, m0, 7*SIZEOF_PIXEL, m0
    mova        [r0+Y*FDEC_STRIDEB], m4
    RET
%endmacro

%ifdef HIGH_BIT_DEPTH
INIT_XMM sse2
PREDICT_8x8_VR w
INIT_XMM ssse3
PREDICT_8x8_VR w
INIT_XMM avx
PREDICT_8x8_VR w
%elifndef ARCH_X86_64
INIT_MMX mmx2
PREDICT_8x8_VR b
%endif

%macro LOAD_PLANE_ARGS 0
%ifdef ARCH_X86_64
    movd        mm0, r1d
    movd        mm2, r2d
    movd        mm4, r3d
    pshufw      mm0, mm0, 0
    pshufw      mm2, mm2, 0
    pshufw      mm4, mm4, 0
%else
    pshufw      mm0, r1m, 0
    pshufw      mm2, r2m, 0
    pshufw      mm4, r3m, 0
%endif
%endmacro

;-----------------------------------------------------------------------------
; void predict_8x8c_p_core( uint8_t *src, int i00, int b, int c )
;-----------------------------------------------------------------------------
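; Rough C equivalent (a sketch, not from this file; i00, b and c are the
; plane parameters precomputed by the caller):
;   for( int y = 0; y < 8; y++ )
;       for( int x = 0; x < 8; x++ )
;           src[x + y*FDEC_STRIDE] = x264_clip_pixel( (i00 + b*x + c*y) >> 5 );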
%ifndef ARCH_X86_64
INIT_MMX
cglobal predict_8x8c_p_core_mmx2, 1,2
    LOAD_PLANE_ARGS
    movq        mm1, mm2
    pmullw      mm2, [pw_3210]
    psllw       mm1, 2
    paddsw      mm0, mm2        ; mm0 = {i+0*b, i+1*b, i+2*b, i+3*b}
    paddsw      mm1, mm0        ; mm1 = {i+4*b, i+5*b, i+6*b, i+7*b}

    mov         r1d, 8
ALIGN 4
.loop:
    movq        mm5, mm0
    movq        mm6, mm1
    psraw       mm5, 5
    psraw       mm6, 5
    packuswb    mm5, mm6
    movq        [r0], mm5

    paddsw      mm0, mm4
    paddsw      mm1, mm4
    add         r0, FDEC_STRIDE
    dec         r1d
    jg          .loop
    REP_RET
%endif ; !ARCH_X86_64

%macro PREDICT_8x8C 0
%ifdef HIGH_BIT_DEPTH
cglobal predict_8x8c_p_core, 1,1,7
    movd        m0, r1m
    movd        m2, r2m
    movd        m4, r3m
    mova        m3, [pw_pixel_max]
    pxor        m1, m1
    SPLATW      m0, m0, 0
    SPLATW      m2, m2, 0
    SPLATW      m4, m4, 0
    pmullw      m2, [pw_43210123] ; b
    pmullw      m5, m4, [pw_m3]   ; c
    paddw       m5, [pw_16]
    mov        r1d, 8
.loop:
    paddsw      m6, m2, m5
    paddsw      m6, m0
    psraw       m6, 5
    CLIPW       m6, m1, m3
    mova      [r0], m6
    paddw       m5, m4
    add         r0, FDEC_STRIDEB
    dec r1d
    jg .loop
    REP_RET
%else ; !HIGH_BIT_DEPTH
cglobal predict_8x8c_p_core, 1,1
    movd        m0, r1m
    movd        m2, r2m
    movd        m4, r3m
    SPLATW      m0, m0, 0
    SPLATW      m2, m2, 0
    SPLATW      m4, m4, 0
    pmullw      m2, [pw_76543210]
    paddsw      m0, m2            ; m0 = {i+0*b, i+1*b, i+2*b, i+3*b, i+4*b, i+5*b, i+6*b, i+7*b}
    paddsw      m3, m0, m4
    paddsw      m4, m4
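    ; call/fall-through trick: the 4-row body at .loop runs twice; the first
    ; pass returns here via its RET, then after advancing r0 by 4 rows we
    ; fall into .loop again and its RET ends the function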
    call .loop
    add         r0, FDEC_STRIDE*4
.loop:
    paddsw      m1, m3, m4
    paddsw      m5, m0, m4
    psraw       m3, 5
    psraw       m0, 5
    packuswb    m0, m3
    movq        [r0+FDEC_STRIDE*0], m0
    movhps      [r0+FDEC_STRIDE*1], m0
    paddsw      m0, m5, m4
    paddsw      m3, m1, m4
    psraw       m5, 5
    psraw       m1, 5
    packuswb    m5, m1
    movq        [r0+FDEC_STRIDE*2], m5
    movhps      [r0+FDEC_STRIDE*3], m5
    RET
%endif ; HIGH_BIT_DEPTH
%endmacro

INIT_XMM sse2
PREDICT_8x8C
INIT_XMM avx
PREDICT_8x8C

;-----------------------------------------------------------------------------
; void predict_16x16_p_core( uint8_t *src, int i00, int b, int c )
;-----------------------------------------------------------------------------
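; Same plane formula as predict_8x8c_p_core above (see the C sketch there),
; evaluated over a 16x16 block.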
%ifndef ARCH_X86_64
INIT_MMX mmx2
cglobal predict_16x16_p_core, 1,2
    LOAD_PLANE_ARGS
    movq        mm5, mm2
    movq        mm1, mm2
    pmullw      mm5, [pw_3210]
    psllw       mm2, 3
    psllw       mm1, 2
    movq        mm3, mm2
    paddsw      mm0, mm5        ; mm0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b}
    paddsw      mm1, mm0        ; mm1 = {i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
    paddsw      mm2, mm0        ; mm2 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b}
    paddsw      mm3, mm1        ; mm3 = {i+12*b, i+13*b, i+14*b, i+15*b}

    mov         r1d, 16
ALIGN 4
.loop:
    movq        mm5, mm0
    movq        mm6, mm1
    psraw       mm5, 5
    psraw       mm6, 5
    packuswb    mm5, mm6
    movq        [r0], mm5

    movq        mm5, mm2
    movq        mm6, mm3
    psraw       mm5, 5
    psraw       mm6, 5
    packuswb    mm5, mm6
    movq        [r0+8], mm5

    paddsw      mm0, mm4
    paddsw      mm1, mm4
    paddsw      mm2, mm4
    paddsw      mm3, mm4
    add         r0, FDEC_STRIDE
    dec         r1d
    jg          .loop
    REP_RET
%endif ; !ARCH_X86_64

%macro PREDICT_16x16_P 0
cglobal predict_16x16_p_core, 1,2,8
    movd     m0, r1m
    movd     m1, r2m
    movd     m2, r3m
    SPLATW   m0, m0, 0
    SPLATW   m1, m1, 0
    SPLATW   m2, m2, 0
    pmullw   m3, m1, [pw_76543210]
    psllw    m1, 3
%ifdef HIGH_BIT_DEPTH
    pxor     m6, m6
    mov     r1d, 16
.loop:
    mova     m4, m0
    mova     m5, m0
    mova     m7, m3
    paddsw   m7, m6
    paddsw   m4, m7
    paddsw   m7, m1
    paddsw   m5, m7
    psraw    m4, 5
    psraw    m5, 5
    CLIPW    m4, [pb_0], [pw_pixel_max]
    CLIPW    m5, [pb_0], [pw_pixel_max]
    mova   [r0], m4
    mova [r0+16], m5
    add      r0, FDEC_STRIDEB
    paddw    m6, m2
    dec      r1d
    jg       .loop
%else ; !HIGH_BIT_DEPTH
    paddsw   m0, m3  ; m0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b, i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
    paddsw   m1, m0  ; m1 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b, i+12*b, i+13*b, i+14*b, i+15*b}
    paddsw   m7, m2, m2
    mov     r1d, 8
ALIGN 4
.loop:
    psraw    m3, m0, 5
    psraw    m4, m1, 5
    paddsw   m5, m0, m2
    paddsw   m6, m1, m2
    psraw    m5, 5
    psraw    m6, 5
    packuswb m3, m4
    packuswb m5, m6
    mova [r0+FDEC_STRIDE*0], m3
    mova [r0+FDEC_STRIDE*1], m5
    paddsw   m0, m7
    paddsw   m1, m7
    add      r0, FDEC_STRIDE*2
    dec      r1d
    jg       .loop
%endif ; !HIGH_BIT_DEPTH
    REP_RET
%endmacro ; PREDICT_16x16_P

INIT_XMM sse2
PREDICT_16x16_P
%ifndef HIGH_BIT_DEPTH
INIT_XMM avx
PREDICT_16x16_P
%endif

%ifndef HIGH_BIT_DEPTH
%macro PREDICT_8x8 0
;-----------------------------------------------------------------------------
; void predict_8x8_ddl( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
cglobal predict_8x8_ddl, 2,2
    mova        m0, [r1+16]
%ifidn cpuname, ssse3
    movd        m2, [r1+32]
    palignr     m2, m0, 1
%else
    movu        m2, [r1+17]
%endif
    pslldq      m1, m0, 1
    add        r0, FDEC_STRIDE*4
    PRED8x8_LOWPASS m0, m1, m2, m0, m3

%assign Y -4
%rep 8
    psrldq      m0, 1
    movq        [r0+Y*FDEC_STRIDE], m0
%assign Y (Y+1)
%endrep
    RET

%ifnidn cpuname, ssse3
;-----------------------------------------------------------------------------
; void predict_8x8_ddr( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
cglobal predict_8x8_ddr, 2,2
    movu        m0, [r1+8]
    movu        m1, [r1+7]
    psrldq      m2, m0, 1
    add         r0, FDEC_STRIDE*4
    PRED8x8_LOWPASS m0, m1, m2, m0, m3

    psrldq      m1, m0, 1
%assign Y 3
%rep 3
    movq        [r0+Y*FDEC_STRIDE], m0
    movq        [r0+(Y-1)*FDEC_STRIDE], m1
    psrldq      m0, 2
    psrldq      m1, 2
%assign Y (Y-2)
%endrep
    movq        [r0-3*FDEC_STRIDE], m0
    movq        [r0-4*FDEC_STRIDE], m1
    RET

;-----------------------------------------------------------------------------
; void predict_8x8_vl( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
cglobal predict_8x8_vl, 2,2
    mova        m0, [r1+16]
    pslldq      m1, m0, 1
    psrldq      m2, m0, 1
    pavgb       m3, m0, m2
    add         r0, FDEC_STRIDE*4
    PRED8x8_LOWPASS m0, m1, m2, m0, m5
; m0: (t0 + 2*t1 + t2 + 2) >> 2
; m3: (t0 + t1 + 1) >> 1

%assign Y -4
%rep 3
    psrldq      m0, 1
    movq        [r0+ Y   *FDEC_STRIDE], m3
    movq        [r0+(Y+1)*FDEC_STRIDE], m0
    psrldq      m3, 1
%assign Y (Y+2)
%endrep
    psrldq      m0, 1
    movq        [r0+ Y   *FDEC_STRIDE], m3
    movq        [r0+(Y+1)*FDEC_STRIDE], m0
    RET
%endif ; !ssse3

;-----------------------------------------------------------------------------
; void predict_8x8_vr( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
cglobal predict_8x8_vr, 2,2
    movu        m2, [r1+8]
    add         r0, 4*FDEC_STRIDE
    pslldq      m1, m2, 2
    pslldq      m0, m2, 1
    pavgb       m3, m2, m0
    PRED8x8_LOWPASS m0, m2, m1, m0, m4
    movhps      [r0-4*FDEC_STRIDE], m3
    movhps      [r0-3*FDEC_STRIDE], m0
%if cpuflag(ssse3)
    movhlps     m3, m3
    pshufb      m0, [shuf_vr]
    palignr     m3, m0, 13
%else
    mova        m2, m0
    mova        m1, [pw_00ff]
    pand        m1, m0
    psrlw       m0, 8
    packuswb    m1, m0
    pslldq      m1, 4
    movhlps     m3, m1
    shufps      m1, m2, q3210
    psrldq      m3, 5
    psrldq      m1, 5
    SWAP         0, 1
%endif
    movq        [r0+3*FDEC_STRIDE], m0
    movq        [r0+2*FDEC_STRIDE], m3
    psrldq      m0, 1
    psrldq      m3, 1
    movq        [r0+1*FDEC_STRIDE], m0
    movq        [r0+0*FDEC_STRIDE], m3
    psrldq      m0, 1
    psrldq      m3, 1
    movq        [r0-1*FDEC_STRIDE], m0
    movq        [r0-2*FDEC_STRIDE], m3
    RET
%endmacro ; PREDICT_8x8

INIT_XMM sse2
PREDICT_8x8
INIT_XMM ssse3
PREDICT_8x8
INIT_XMM avx
PREDICT_8x8

%endif ; !HIGH_BIT_DEPTH

;-----------------------------------------------------------------------------
; void predict_8x8_hd( pixel *src, pixel *edge )
;-----------------------------------------------------------------------------
%macro PREDICT_8x8_HD 2
cglobal predict_8x8_hd, 2,2
    add       r0, 4*FDEC_STRIDEB
    mova      m0, [r1+ 8*SIZEOF_PIXEL]     ; lt l0 l1 l2 l3 l4 l5 l6
    movu      m1, [r1+ 7*SIZEOF_PIXEL]     ; l0 l1 l2 l3 l4 l5 l6 l7
%ifidn cpuname, ssse3
    mova      m2, [r1+16*SIZEOF_PIXEL]     ; t7 t6 t5 t4 t3 t2 t1 t0
    mova      m4, m2                       ; t7 t6 t5 t4 t3 t2 t1 t0
    palignr   m2, m0, 7*SIZEOF_PIXEL       ; t6 t5 t4 t3 t2 t1 t0 lt
    palignr   m4, m0, 1*SIZEOF_PIXEL       ; t0 lt l0 l1 l2 l3 l4 l5
%else
    movu      m2, [r1+15*SIZEOF_PIXEL]
    movu      m4, [r1+ 9*SIZEOF_PIXEL]
%endif ; cpuflag
    pavg%1    m3, m0, m1
    PRED8x8_LOWPASS m0, m4, m1, m0, m5
    PSRLPIX   m4, m2, 2                    ; .. .. t6 t5 t4 t3 t2 t1
    PSRLPIX   m1, m2, 1                    ; .. t6 t5 t4 t3 t2 t1 t0
    PRED8x8_LOWPASS m1, m4, m2, m1, m5
                                           ; .. p11 p10 p9
    punpckh%2 m2, m3, m0                   ; p8 p7 p6 p5
    punpckl%2 m3, m0                       ; p4 p3 p2 p1
    mova      [r0+3*FDEC_STRIDEB], m3
    PALIGNR   m0, m2, m3, 2*SIZEOF_PIXEL, m5
    mova      [r0+2*FDEC_STRIDEB], m0
    PALIGNR   m0, m2, m3, 4*SIZEOF_PIXEL, m5
    mova      [r0+1*FDEC_STRIDEB], m0
    PALIGNR   m0, m2, m3, 6*SIZEOF_PIXEL, m3
    mova      [r0+0*FDEC_STRIDEB], m0
    mova      [r0-1*FDEC_STRIDEB], m2
    PALIGNR   m0, m1, m2, 2*SIZEOF_PIXEL, m5
    mova      [r0-2*FDEC_STRIDEB], m0
    PALIGNR   m0, m1, m2, 4*SIZEOF_PIXEL, m5
    mova      [r0-3*FDEC_STRIDEB], m0
    PALIGNR   m1, m1, m2, 6*SIZEOF_PIXEL, m2
    mova      [r0-4*FDEC_STRIDEB], m1
    RET
%endmacro

%ifdef HIGH_BIT_DEPTH
INIT_XMM sse2
PREDICT_8x8_HD w, wd
INIT_XMM ssse3
PREDICT_8x8_HD w, wd
INIT_XMM avx
PREDICT_8x8_HD w, wd
%else
INIT_MMX mmx2
PREDICT_8x8_HD b, bw

;-----------------------------------------------------------------------------
; void predict_8x8_hd( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
%macro PREDICT_8x8_HD 0
cglobal predict_8x8_hd, 2,2
    add     r0, 4*FDEC_STRIDE
    movu    m1, [r1+7]
    movu    m3, [r1+8]
    movu    m2, [r1+9]
    pavgb   m4, m1, m3
    PRED8x8_LOWPASS m0, m1, m2, m3, m5
    punpcklbw m4, m0
    movhlps m0, m4

%assign Y 3
%rep 3
    movq   [r0+(Y)*FDEC_STRIDE], m4
    movq   [r0+(Y-4)*FDEC_STRIDE], m0
    psrldq m4, 2
    psrldq m0, 2
%assign Y (Y-1)
%endrep
    movq   [r0+(Y)*FDEC_STRIDE], m4
    movq   [r0+(Y-4)*FDEC_STRIDE], m0
    RET
%endmacro

INIT_XMM sse2
PREDICT_8x8_HD
INIT_XMM avx
PREDICT_8x8_HD
%endif ; HIGH_BIT_DEPTH

%ifndef HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void predict_8x8_hu( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
INIT_MMX
cglobal predict_8x8_hu_sse2, 2,2
    add        r0, 4*FDEC_STRIDE
    movq      mm1, [r1+7]           ; l0 l1 l2 l3 l4 l5 l6 l7
    pshufw    mm0, mm1, q0123       ; l6 l7 l4 l5 l2 l3 l0 l1
    movq      mm2, mm0
    psllw     mm0, 8
    psrlw     mm2, 8
    por       mm2, mm0              ; l7 l6 l5 l4 l3 l2 l1 l0
    psllq     mm1, 56               ; l7 .. .. .. .. .. .. ..
    movq      mm3, mm2
    movq      mm4, mm2
    movq      mm5, mm2
    psrlq     mm2, 8
    psrlq     mm3, 16
    por       mm2, mm1              ; l7 l7 l6 l5 l4 l3 l2 l1
    punpckhbw mm1, mm1
    por       mm3, mm1              ; l7 l7 l7 l6 l5 l4 l3 l2
    pavgb     mm4, mm2
    PRED8x8_LOWPASS mm1, mm3, mm5, mm2, mm6

    movq2dq   xmm0, mm4
    movq2dq   xmm1, mm1
    punpcklbw xmm0, xmm1
    punpckhbw  mm4, mm1
%assign Y -4
%rep 3
    movq     [r0+Y*FDEC_STRIDE], xmm0
    psrldq    xmm0, 2
%assign Y (Y+1)
%endrep
    pshufw     mm5, mm4, q3321
    pshufw     mm6, mm4, q3332
    pshufw     mm7, mm4, q3333
    movq     [r0+Y*FDEC_STRIDE], xmm0
    movq     [r0+0*FDEC_STRIDE], mm4
    movq     [r0+1*FDEC_STRIDE], mm5
    movq     [r0+2*FDEC_STRIDE], mm6
    movq     [r0+3*FDEC_STRIDE], mm7
    RET

INIT_XMM
cglobal predict_8x8_hu_ssse3, 2,2
    add       r0, 4*FDEC_STRIDE
    movq      m3, [r1+7]
    pshufb    m3, [shuf_hu]
    psrldq    m1, m3, 1
    psrldq    m2, m3, 2
    pavgb     m0, m1, m3
    PRED8x8_LOWPASS m1, m3, m2, m1, m4
    punpcklbw m0, m1
%assign Y -4
%rep 3
    movq   [r0+ Y   *FDEC_STRIDE], m0
    movhps [r0+(Y+4)*FDEC_STRIDE], m0
    psrldq    m0, 2
    pshufhw   m0, m0, q2210
%assign Y (Y+1)
%endrep
    movq   [r0+ Y   *FDEC_STRIDE], m0
    movhps [r0+(Y+4)*FDEC_STRIDE], m0
    RET
%endif ; !HIGH_BIT_DEPTH

;-----------------------------------------------------------------------------
; void predict_8x8c_v( uint8_t *src )
;-----------------------------------------------------------------------------

%macro PREDICT_8x8C_V 0
cglobal predict_8x8c_v, 1,1
    mova        m0, [r0 - FDEC_STRIDEB]
    STORE8x8    m0, m0
    RET
%endmacro

%ifdef HIGH_BIT_DEPTH
INIT_XMM sse2
PREDICT_8x8C_V
%else
INIT_MMX mmx
PREDICT_8x8C_V
%endif

%ifdef HIGH_BIT_DEPTH

INIT_MMX
cglobal predict_8x8c_v_mmx, 1,1
    mova        m0, [r0 - FDEC_STRIDEB]
    mova        m1, [r0 - FDEC_STRIDEB + 8]
%assign Y 0
%rep 8
    mova        [r0 + (Y&1)*FDEC_STRIDEB], m0
    mova        [r0 + (Y&1)*FDEC_STRIDEB + 8], m1
%if (Y&1) && (Y!=7)
    add         r0, FDEC_STRIDEB*2
%endif
%assign Y Y+1
%endrep
    RET

%endif

%macro PREDICT_8x16C_V 0
cglobal predict_8x16c_v, 1,1
    mova        m0, [r0 - FDEC_STRIDEB]
    STORE8x16    m0, m0, m0, m0
    RET
%endmacro

%ifdef HIGH_BIT_DEPTH
INIT_XMM sse2
PREDICT_8x16C_V
%else
INIT_MMX mmx
PREDICT_8x16C_V
%endif

;-----------------------------------------------------------------------------
; void predict_8x8c_h( uint8_t *src )
;-----------------------------------------------------------------------------
%ifdef HIGH_BIT_DEPTH

INIT_XMM sse2
%macro PREDICT_C_H 1
cglobal predict_8x%1c_h, 1,1
    add        r0, FDEC_STRIDEB*4
%assign Y -4
%rep %1
    movd       m0, [r0+FDEC_STRIDEB*Y-SIZEOF_PIXEL*2]
    SPLATW     m0, m0, 1
    mova [r0+FDEC_STRIDEB*Y], m0
%assign Y Y+1
%endrep
    RET
%endmacro

PREDICT_C_H 8
PREDICT_C_H 16

%else ; !HIGH_BIT_DEPTH

%macro PREDICT_C_H_CORE 1
%assign Y %1
%rep 4
    SPLATB_LOAD m0, r0+FDEC_STRIDE*Y-1, m1
    mova [r0+FDEC_STRIDE*Y], m0
%assign Y Y+1
%endrep
%endmacro

%macro PREDICT_C_H 1
cglobal predict_8x%1c_h, 1,1
%if cpuflag(ssse3)
    mova   m1, [pb_3]
%endif
%if %1==16
    add    r0, FDEC_STRIDE*4
    PREDICT_C_H_CORE -4
    add    r0, FDEC_STRIDE*4
    PREDICT_C_H_CORE -4
%endif
    add    r0, FDEC_STRIDE*4
    PREDICT_C_H_CORE -4
    PREDICT_C_H_CORE 0
    RET
%endmacro

INIT_MMX mmx2
PREDICT_C_H 8
PREDICT_C_H 16
INIT_MMX ssse3
PREDICT_C_H 8
PREDICT_C_H 16

%endif
;-----------------------------------------------------------------------------
; void predict_8x8c_dc( pixel *src )
;-----------------------------------------------------------------------------
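; Rough sketch of the rule implemented below (not from this file): each 4x4
; quadrant gets its own DC. With s0/s1 = sums of the left/right 4 top pixels
; and s2/s3 = sums of the upper/lower 4 left pixels:
;   dc_topleft = (s0+s2+4)>>3    dc_topright = (s1+2)>>2
;   dc_botleft = (s3+2)>>2       dc_botright = (s1+s3+4)>>3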

%macro LOAD_LEFT 1
    movzx    r1d, pixel [r0+FDEC_STRIDEB*(%1-4)-SIZEOF_PIXEL]
    movzx    r2d, pixel [r0+FDEC_STRIDEB*(%1-3)-SIZEOF_PIXEL]
    add      r1d, r2d
    movzx    r2d, pixel [r0+FDEC_STRIDEB*(%1-2)-SIZEOF_PIXEL]
    add      r1d, r2d
    movzx    r2d, pixel [r0+FDEC_STRIDEB*(%1-1)-SIZEOF_PIXEL]
    add      r1d, r2d
%endmacro

%macro PREDICT_8x8C_DC 0
cglobal predict_8x8c_dc, 1,3
    pxor      m7, m7
%ifdef HIGH_BIT_DEPTH
    movq      m0, [r0-FDEC_STRIDEB+0]
    movq      m1, [r0-FDEC_STRIDEB+8]
    HADDW     m0, m2
    HADDW     m1, m2
%else ; !HIGH_BIT_DEPTH
    movd      m0, [r0-FDEC_STRIDEB+0]
    movd      m1, [r0-FDEC_STRIDEB+4]
    psadbw    m0, m7            ; s0
    psadbw    m1, m7            ; s1
%endif
    add       r0, FDEC_STRIDEB*4

    LOAD_LEFT 0                 ; s2
    movd      m2, r1d
    LOAD_LEFT 4                 ; s3
    movd      m3, r1d

    punpcklwd m0, m1
    punpcklwd m2, m3
    punpckldq m0, m2            ; s0, s1, s2, s3
    pshufw    m3, m0, q3312     ; s2, s1, s3, s3
    pshufw    m0, m0, q1310     ; s0, s1, s3, s1
    paddw     m0, m3
    psrlw     m0, 2
    pavgw     m0, m7            ; s0+s2, s1, s3, s1+s3
%ifdef HIGH_BIT_DEPTH
%if cpuflag(sse2)
    movq2dq   xmm0, m0
    punpcklwd xmm0, xmm0
    pshufd    xmm1, xmm0, q3322
    punpckldq xmm0, xmm0
%assign Y 0
%rep 8
%assign i (0 + (Y/4))
    movdqa [r0+FDEC_STRIDEB*(Y-4)+0], xmm %+ i
%assign Y Y+1
%endrep
%else ; !sse2
    pshufw    m1, m0, q0000
    pshufw    m2, m0, q1111
    pshufw    m3, m0, q2222
    pshufw    m4, m0, q3333
%assign Y 0
%rep 8
%assign i (1 + (Y/4)*2)
%assign j (2 + (Y/4)*2)
    movq [r0+FDEC_STRIDEB*(Y-4)+0], m %+ i
    movq [r0+FDEC_STRIDEB*(Y-4)+8], m %+ j
%assign Y Y+1
%endrep
%endif
%else ; !HIGH_BIT_DEPTH
    packuswb  m0, m0
    punpcklbw m0, m0
    movq      m1, m0
    punpcklbw m0, m0
    punpckhbw m1, m1
%assign Y 0
%rep 8
%assign i (0 + (Y/4))
    movq [r0+FDEC_STRIDEB*(Y-4)], m %+ i
%assign Y Y+1
%endrep
%endif
    RET
%endmacro

INIT_MMX mmx2
PREDICT_8x8C_DC
%ifdef HIGH_BIT_DEPTH
INIT_MMX sse2
PREDICT_8x8C_DC
%endif

%ifdef HIGH_BIT_DEPTH
%macro STORE_4LINES 3
%if cpuflag(sse2)
    movdqa [r0+FDEC_STRIDEB*(%3-4)], %1
    movdqa [r0+FDEC_STRIDEB*(%3-3)], %1
    movdqa [r0+FDEC_STRIDEB*(%3-2)], %1
    movdqa [r0+FDEC_STRIDEB*(%3-1)], %1
%else
    movq [r0+FDEC_STRIDEB*(%3-4)+0], %1
    movq [r0+FDEC_STRIDEB*(%3-4)+8], %2
    movq [r0+FDEC_STRIDEB*(%3-3)+0], %1
    movq [r0+FDEC_STRIDEB*(%3-3)+8], %2
    movq [r0+FDEC_STRIDEB*(%3-2)+0], %1
    movq [r0+FDEC_STRIDEB*(%3-2)+8], %2
    movq [r0+FDEC_STRIDEB*(%3-1)+0], %1
    movq [r0+FDEC_STRIDEB*(%3-1)+8], %2
%endif
%endmacro
%else
%macro STORE_4LINES 2
    movq [r0+FDEC_STRIDEB*(%2-4)], %1
    movq [r0+FDEC_STRIDEB*(%2-3)], %1
    movq [r0+FDEC_STRIDEB*(%2-2)], %1
    movq [r0+FDEC_STRIDEB*(%2-1)], %1
%endmacro
%endif

%macro PREDICT_8x16C_DC 0
cglobal predict_8x16c_dc, 1,3
    pxor      m7, m7
%ifdef HIGH_BIT_DEPTH
    movq      m0, [r0-FDEC_STRIDEB+0]
    movq      m1, [r0-FDEC_STRIDEB+8]
    HADDW     m0, m2
    HADDW     m1, m2
%else
    movd      m0, [r0-FDEC_STRIDEB+0]
    movd      m1, [r0-FDEC_STRIDEB+4]
    psadbw    m0, m7            ; s0
    psadbw    m1, m7            ; s1
%endif
    punpcklwd m0, m1            ; s0, s1

    add       r0, FDEC_STRIDEB*4
    LOAD_LEFT 0                 ; s2
    pinsrw    m0, r1d, 2
    LOAD_LEFT 4                 ; s3
    pinsrw    m0, r1d, 3        ; s0, s1, s2, s3
    add       r0, FDEC_STRIDEB*8
    LOAD_LEFT 0                 ; s4
    pinsrw    m1, r1d, 2
    LOAD_LEFT 4                 ; s5
    pinsrw    m1, r1d, 3        ; s1, __, s4, s5
    sub       r0, FDEC_STRIDEB*8

    pshufw    m2, m0, q1310     ; s0, s1, s3, s1
    pshufw    m0, m0, q3312     ; s2, s1, s3, s3
    pshufw    m3, m1, q0302     ; s4, s1, s5, s1
    pshufw    m1, m1, q3322     ; s4, s4, s5, s5
    paddw     m0, m2
    paddw     m1, m3
    psrlw     m0, 2
    psrlw     m1, 2
    pavgw     m0, m7
    pavgw     m1, m7
%ifdef HIGH_BIT_DEPTH
%if cpuflag(sse2)
    movq2dq xmm0, m0
    movq2dq xmm1, m1
    punpcklwd xmm0, xmm0
    punpcklwd xmm1, xmm1
    pshufd    xmm2, xmm0, q3322
    pshufd    xmm3, xmm1, q3322
    punpckldq xmm0, xmm0
    punpckldq xmm1, xmm1
    STORE_4LINES xmm0, xmm0, 0
    STORE_4LINES xmm2, xmm2, 4
    STORE_4LINES xmm1, xmm1, 8
    STORE_4LINES xmm3, xmm3, 12
%else
    pshufw    m2, m0, q0000
    pshufw    m3, m0, q1111
    pshufw    m4, m0, q2222
    pshufw    m5, m0, q3333
    STORE_4LINES m2, m3, 0
    STORE_4LINES m4, m5, 4
    pshufw    m2, m1, q0000
    pshufw    m3, m1, q1111
    pshufw    m4, m1, q2222
    pshufw    m5, m1, q3333
    STORE_4LINES m2, m3, 8
    STORE_4LINES m4, m5, 12
%endif
%else
    packuswb  m0, m0            ; dc0, dc1, dc2, dc3
    packuswb  m1, m1            ; dc4, dc5, dc6, dc7
    punpcklbw m0, m0
    punpcklbw m1, m1
    pshufw    m2, m0, q1100
    pshufw    m3, m0, q3322
    pshufw    m4, m1, q1100
    pshufw    m5, m1, q3322
    STORE_4LINES m2, 0
    STORE_4LINES m3, 4
    add       r0, FDEC_STRIDEB*8
    STORE_4LINES m4, 0
    STORE_4LINES m5, 4
%endif
    RET
%endmacro

INIT_MMX mmx2
PREDICT_8x16C_DC
%ifdef HIGH_BIT_DEPTH
INIT_MMX sse2
PREDICT_8x16C_DC
%endif

%macro PREDICT_C_DC_TOP 1
%ifdef HIGH_BIT_DEPTH
INIT_XMM
cglobal predict_8x%1c_dc_top_sse2, 1,1
    pxor        m2, m2
    mova        m0, [r0 - FDEC_STRIDEB]
    pshufd      m1, m0, q2301
    paddw       m0, m1
    pshuflw     m1, m0, q2301
    pshufhw     m1, m1, q2301
    paddw       m0, m1
    psrlw       m0, 1
    pavgw       m0, m2
    STORE8x%1   m0, m0, m0, m0
    RET
%else ; !HIGH_BIT_DEPTH
INIT_MMX
cglobal predict_8x%1c_dc_top_mmx2, 1,1
    movq        mm0, [r0 - FDEC_STRIDE]
    pxor        mm1, mm1
    pxor        mm2, mm2
    punpckhbw   mm1, mm0
    punpcklbw   mm0, mm2
    psadbw      mm1, mm2        ; s1
    psadbw      mm0, mm2        ; s0
    psrlw       mm1, 1
    psrlw       mm0, 1
    pavgw       mm1, mm2
    pavgw       mm0, mm2
    pshufw      mm1, mm1, 0
    pshufw      mm0, mm0, 0     ; dc0 (w)
    packuswb    mm0, mm1        ; dc0,dc1 (b)
    STORE8x%1   mm0, mm0, mm0, mm0
    RET
%endif
%endmacro

PREDICT_C_DC_TOP 8
PREDICT_C_DC_TOP 16

;-----------------------------------------------------------------------------
; void predict_16x16_v( pixel *src )
;-----------------------------------------------------------------------------
%ifdef HIGH_BIT_DEPTH
INIT_MMX
cglobal predict_16x16_v_mmx2, 1,2
    mova        m0, [r0 - FDEC_STRIDEB+ 0]
    mova        m1, [r0 - FDEC_STRIDEB+ 8]
    mova        m2, [r0 - FDEC_STRIDEB+16]
    mova        m3, [r0 - FDEC_STRIDEB+24]
    STORE16x16  m0, m1, m2, m3
    REP_RET
INIT_XMM
cglobal predict_16x16_v_sse2, 1,2
    mova      m0, [r0 - FDEC_STRIDEB+ 0]
    mova      m1, [r0 - FDEC_STRIDEB+16]
    STORE16x16_SSE2 m0, m1
    REP_RET
%else ; !HIGH_BIT_DEPTH
INIT_MMX
cglobal predict_16x16_v_mmx2, 1,2
    movq        m0, [r0 - FDEC_STRIDE + 0]
    movq        m1, [r0 - FDEC_STRIDE + 8]
    STORE16x16  m0, m1
    REP_RET
INIT_XMM
cglobal predict_16x16_v_sse2, 1,1
    movdqa      xmm0, [r0 - FDEC_STRIDE]
    STORE16x16_SSE2 xmm0
    RET
%endif

;-----------------------------------------------------------------------------
; void predict_16x16_h( pixel *src )
;-----------------------------------------------------------------------------
%macro PREDICT_16x16_H 0
cglobal predict_16x16_h, 1,2
    mov r1, 12*FDEC_STRIDEB
%ifdef HIGH_BIT_DEPTH
.vloop:
%assign Y 0
%rep 4
    movd        m0, [r0+r1+Y*FDEC_STRIDEB-2*SIZEOF_PIXEL]
    SPLATW      m0, m0, 1
    mova [r0+r1+Y*FDEC_STRIDEB+ 0], m0
    mova [r0+r1+Y*FDEC_STRIDEB+16], m0
%if mmsize==8
    mova [r0+r1+Y*FDEC_STRIDEB+ 8], m0
    mova [r0+r1+Y*FDEC_STRIDEB+24], m0
%endif
%assign Y Y+1
%endrep

%else ; !HIGH_BIT_DEPTH
%if cpuflag(ssse3)
    mova   m1, [pb_3]
%endif
.vloop:
%assign Y 0
%rep 4
    SPLATB_LOAD m0, r0+r1+FDEC_STRIDE*Y-1, m1
    mova [r0+r1+FDEC_STRIDE*Y], m0
%if mmsize==8
    mova [r0+r1+FDEC_STRIDE*Y+8], m0
%endif
%assign Y Y+1
%endrep
%endif ; HIGH_BIT_DEPTH
    sub r1, 4*FDEC_STRIDEB
    jge .vloop
    REP_RET
%endmacro

INIT_MMX mmx2
PREDICT_16x16_H
INIT_XMM sse2
%ifdef HIGH_BIT_DEPTH
PREDICT_16x16_H
%else
; no SSE2 for 8-bit: it's slower than MMX on all systems that don't support SSSE3
INIT_XMM ssse3
PREDICT_16x16_H
%endif

;-----------------------------------------------------------------------------
; void predict_16x16_dc_core( pixel *src, int i_dc_left )
;-----------------------------------------------------------------------------
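; PRED16x16_DC sums the 16 top pixels, adds %1 and shifts right by %2 before
; broadcasting; for the full-DC case the caller passes the left-pixel sum
; (plus rounding) as i_dc_left and a shift of 5.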

%macro PRED16x16_DC 2
%ifdef HIGH_BIT_DEPTH
    mova       m0, [r0 - FDEC_STRIDEB+ 0]
    paddw      m0, [r0 - FDEC_STRIDEB+ 8]
    paddw      m0, [r0 - FDEC_STRIDEB+16]
    paddw      m0, [r0 - FDEC_STRIDEB+24]
    HADDW      m0, m1
    paddw      m0, %1
    psrlw      m0, %2
    SPLATW     m0, m0
    STORE16x16 m0, m0, m0, m0
%else ; !HIGH_BIT_DEPTH
    pxor        m0, m0
    pxor        m1, m1
    psadbw      m0, [r0 - FDEC_STRIDE]
    psadbw      m1, [r0 - FDEC_STRIDE + 8]
    paddusw     m0, m1
    paddusw     m0, %1
    psrlw       m0, %2                      ; dc
    pshufw      m0, m0, 0
    packuswb    m0, m0                      ; dc in bytes
    STORE16x16  m0, m0
%endif
%endmacro

INIT_MMX mmx2
cglobal predict_16x16_dc_core, 1,2
%ifdef ARCH_X86_64
    movd         m6, r1d
    PRED16x16_DC m6, 5
%else
    PRED16x16_DC r1m, 5
%endif
    REP_RET

INIT_MMX mmx2
cglobal predict_16x16_dc_top, 1,2
    PRED16x16_DC [pw_8], 4
    REP_RET

INIT_MMX mmx2
%ifdef HIGH_BIT_DEPTH
cglobal predict_16x16_dc_left_core, 1,2
    movd       m0, r1m
    SPLATW     m0, m0
    STORE16x16 m0, m0, m0, m0
    REP_RET
%else ; !HIGH_BIT_DEPTH
cglobal predict_16x16_dc_left_core, 1,1
    movd       m0, r1m
    pshufw     m0, m0, 0
    packuswb   m0, m0
    STORE16x16 m0, m0
    REP_RET
%endif

;-----------------------------------------------------------------------------
; void predict_16x16_dc_core( pixel *src, int i_dc_left )
;-----------------------------------------------------------------------------

%macro PRED16x16_DC_SSE2 2
%ifdef HIGH_BIT_DEPTH
    mova       m0, [r0 - FDEC_STRIDEB+ 0]
    paddw      m0, [r0 - FDEC_STRIDEB+16]
    HADDW      m0, m2
    paddw      m0, %1
    psrlw      m0, %2
    SPLATW     m0, m0
    STORE16x16_SSE2 m0, m0
%else ; !HIGH_BIT_DEPTH
    pxor        m0, m0
    psadbw      m0, [r0 - FDEC_STRIDE]
    movhlps     m1, m0
    paddw       m0, m1
    paddusw     m0, %1
    psrlw       m0, %2              ; dc
    SPLATW      m0, m0
    packuswb    m0, m0              ; dc in bytes
    STORE16x16_SSE2 m0
%endif
%endmacro

INIT_XMM sse2
cglobal predict_16x16_dc_core, 2,2,4
    movd       m3, r1m
    PRED16x16_DC_SSE2 m3, 5
    REP_RET

cglobal predict_16x16_dc_top, 1,2
    PRED16x16_DC_SSE2 [pw_8], 4
    REP_RET

INIT_XMM sse2
%ifdef HIGH_BIT_DEPTH
cglobal predict_16x16_dc_left_core, 1,2
    movd       m0, r1m
    SPLATW     m0, m0
    STORE16x16_SSE2 m0, m0
    REP_RET
%else ; !HIGH_BIT_DEPTH
cglobal predict_16x16_dc_left_core, 1,1
    movd       m0, r1m
    SPLATW     m0, m0
    packuswb   m0, m0
    STORE16x16_SSE2 m0
    RET
%endif
x264-snapshot-20120103-2245-stable/common/x86/pixel.h
/*****************************************************************************
 * pixel.h: x86 pixel metrics
 *****************************************************************************
 * Copyright (C) 2003-2011 x264 project
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
 *          Jason Garrett-Glaser <darkshikari@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#ifndef X264_I386_PIXEL_H
#define X264_I386_PIXEL_H

#define DECL_PIXELS( ret, name, suffix, args ) \
    ret x264_pixel_##name##_16x16_##suffix args;\
    ret x264_pixel_##name##_16x8_##suffix args;\
    ret x264_pixel_##name##_8x16_##suffix args;\
    ret x264_pixel_##name##_8x8_##suffix args;\
    ret x264_pixel_##name##_8x4_##suffix args;\
    ret x264_pixel_##name##_4x16_##suffix args;\
    ret x264_pixel_##name##_4x8_##suffix args;\
    ret x264_pixel_##name##_4x4_##suffix args;\

#define DECL_X1( name, suffix ) \
    DECL_PIXELS( int, name, suffix, ( pixel *, int, pixel *, int ) )

#define DECL_X4( name, suffix ) \
    DECL_PIXELS( void, name##_x3, suffix, ( pixel *, pixel *, pixel *, pixel *, int, int * ) )\
    DECL_PIXELS( void, name##_x4, suffix, ( pixel *, pixel *, pixel *, pixel *, pixel *, int, int * ) )
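
/* Example expansion (illustrative): DECL_X1( sad, mmx2 ) declares
 *   int x264_pixel_sad_16x16_mmx2( pixel *, int, pixel *, int );
 * and so on down to 4x4, while DECL_X4( sad, mmx2 ) declares the matching
 * _x3/_x4 variants that score several candidate blocks in one call. */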

DECL_X1( sad, mmx2 )
DECL_X1( sad, sse2 )
DECL_X4( sad, sse2_misalign )
DECL_X1( sad, sse3 )
DECL_X1( sad, sse2_aligned )
DECL_X1( sad, ssse3 )
DECL_X1( sad, ssse3_aligned )
DECL_X4( sad, mmx2 )
DECL_X4( sad, sse2 )
DECL_X4( sad, sse3 )
DECL_X4( sad, ssse3 )
DECL_X1( ssd, mmx )
DECL_X1( ssd, mmx2 )
DECL_X1( ssd, sse2slow )
DECL_X1( ssd, sse2 )
DECL_X1( ssd, ssse3 )
DECL_X1( ssd, avx )
DECL_X1( ssd, xop )
DECL_X1( satd, mmx2 )
DECL_X1( satd, sse2 )
DECL_X1( satd, ssse3 )
DECL_X1( satd, sse4 )
DECL_X1( satd, avx )
DECL_X1( satd, xop )
DECL_X1( sa8d, mmx2 )
DECL_X1( sa8d, sse2 )
DECL_X1( sa8d, ssse3 )
DECL_X1( sa8d, sse4 )
DECL_X1( sa8d, avx )
DECL_X1( sa8d, xop )
DECL_X1( sad, cache32_mmx2 );
DECL_X1( sad, cache64_mmx2 );
DECL_X1( sad, cache64_sse2 );
DECL_X1( sad, cache64_ssse3 );
DECL_X4( sad, cache32_mmx2 );
DECL_X4( sad, cache64_mmx2 );
DECL_X4( sad, cache64_sse2 );
DECL_X4( sad, cache64_ssse3 );

DECL_PIXELS( uint64_t, var, mmx2, ( pixel *pix, int i_stride ))
DECL_PIXELS( uint64_t, var, sse2, ( pixel *pix, int i_stride ))
DECL_PIXELS( uint64_t, var, avx,  ( pixel *pix, int i_stride ))
DECL_PIXELS( uint64_t, var, xop,  ( pixel *pix, int i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, mmx2,  ( pixel *pix, int i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, sse2,  ( pixel *pix, int i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, ssse3, ( pixel *pix, int i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, sse4,  ( pixel *pix, int i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, avx,   ( pixel *pix, int i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, xop,   ( pixel *pix, int i_stride ))


void x264_intra_satd_x3_4x4_mmx2   ( pixel   *, pixel   *, int * );
void x264_intra_sad_x3_4x4_mmx2    ( pixel   *, pixel   *, int * );
void x264_intra_satd_x3_8x8c_mmx2  ( pixel   *, pixel   *, int * );
void x264_intra_satd_x3_8x8c_ssse3 ( uint8_t *, uint8_t *, int * );
void x264_intra_sad_x3_8x8c_mmx2   ( pixel   *, pixel   *, int * );
void x264_intra_sad_x3_8x8c_sse2   ( pixel   *, pixel   *, int * );
void x264_intra_sad_x3_8x8c_ssse3  ( pixel   *, pixel   *, int * );
void x264_intra_satd_x3_16x16_mmx2 ( pixel   *, pixel   *, int * );
void x264_intra_satd_x3_16x16_ssse3( uint8_t *, uint8_t *, int * );
void x264_intra_sad_x3_16x16_mmx2  ( pixel   *, pixel   *, int * );
void x264_intra_sad_x3_16x16_sse2  ( pixel   *, pixel   *, int * );
void x264_intra_sad_x3_16x16_ssse3 ( pixel   *, pixel   *, int * );
void x264_intra_sa8d_x3_8x8_mmx2   ( uint8_t *, uint8_t *, int * );
void x264_intra_sa8d_x3_8x8_sse2   ( pixel   *, pixel   *, int * );
void x264_intra_sad_x3_8x8_mmx2    ( pixel   *, pixel   *, int * );
void x264_intra_sad_x3_8x8_sse2    ( pixel   *, pixel   *, int * );
int x264_intra_satd_x9_4x4_ssse3( uint8_t *, uint8_t *, uint16_t * );
int x264_intra_satd_x9_4x4_sse4 ( uint8_t *, uint8_t *, uint16_t * );
int x264_intra_satd_x9_4x4_avx  ( uint8_t *, uint8_t *, uint16_t * );
int x264_intra_satd_x9_4x4_xop  ( uint8_t *, uint8_t *, uint16_t * );
int x264_intra_sad_x9_4x4_ssse3 ( uint8_t *, uint8_t *, uint16_t * );
int x264_intra_sad_x9_4x4_sse4  ( uint8_t *, uint8_t *, uint16_t * );
int x264_intra_sad_x9_4x4_avx   ( uint8_t *, uint8_t *, uint16_t * );
int x264_intra_sa8d_x9_8x8_ssse3( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * );
int x264_intra_sa8d_x9_8x8_sse4 ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * );
int x264_intra_sa8d_x9_8x8_avx  ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * );
int x264_intra_sad_x9_8x8_ssse3 ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * );
int x264_intra_sad_x9_8x8_sse4  ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * );
int x264_intra_sad_x9_8x8_avx   ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * );

void x264_pixel_ssd_nv12_core_mmx2( pixel *pixuv1, int stride1,
                                    pixel *pixuv2, int stride2, int width,
                                    int height, uint64_t *ssd_u, uint64_t *ssd_v );
void x264_pixel_ssd_nv12_core_sse2( pixel *pixuv1, int stride1,
                                    pixel *pixuv2, int stride2, int width,
                                    int height, uint64_t *ssd_u, uint64_t *ssd_v );
void x264_pixel_ssd_nv12_core_avx ( pixel *pixuv1, int stride1,
                                    pixel *pixuv2, int stride2, int width,
                                    int height, uint64_t *ssd_u, uint64_t *ssd_v );
void x264_pixel_ssim_4x4x2_core_mmx2( const uint8_t *pix1, int stride1,
                                      const uint8_t *pix2, int stride2, int sums[2][4] );
void x264_pixel_ssim_4x4x2_core_sse2( const pixel *pix1, int stride1,
                                      const pixel *pix2, int stride2, int sums[2][4] );
void x264_pixel_ssim_4x4x2_core_avx ( const pixel *pix1, int stride1,
                                      const pixel *pix2, int stride2, int sums[2][4] );
float x264_pixel_ssim_end4_sse2( int sum0[5][4], int sum1[5][4], int width );
float x264_pixel_ssim_end4_avx( int sum0[5][4], int sum1[5][4], int width );
int  x264_pixel_var2_8x8_mmx2( pixel *, int, pixel *, int, int * );
int  x264_pixel_var2_8x8_sse2( pixel *, int, pixel *, int, int * );
int  x264_pixel_var2_8x8_ssse3( uint8_t *, int, uint8_t *, int, int * );
int  x264_pixel_var2_8x8_xop( uint8_t *, int, uint8_t *, int, int * );
int  x264_pixel_var2_8x16_mmx2( pixel *, int, pixel *, int, int * );
int  x264_pixel_var2_8x16_sse2( pixel *, int, pixel *, int, int * );
int  x264_pixel_var2_8x16_ssse3( uint8_t *, int, uint8_t *, int, int * );
int  x264_pixel_var2_8x16_xop( uint8_t *, int, uint8_t *, int, int * );
int  x264_pixel_vsad_mmx2( pixel *src, int stride, int height );
int  x264_pixel_vsad_sse2( pixel *src, int stride, int height );

#define DECL_ADS( size, suffix ) \
int x264_pixel_ads##size##_##suffix( int enc_dc[size], uint16_t *sums, int delta,\
                                     uint16_t *cost_mvx, int16_t *mvs, int width, int thresh );
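/* Example expansion (illustrative): DECL_ADS( 4, mmx2 ) declares
 * int x264_pixel_ads4_mmx2( int enc_dc[4], uint16_t *sums, int delta,
 *                           uint16_t *cost_mvx, int16_t *mvs, int width, int thresh );
 */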
DECL_ADS( 4, mmx2 )
DECL_ADS( 2, mmx2 )
DECL_ADS( 1, mmx2 )
DECL_ADS( 4, sse2 )
DECL_ADS( 2, sse2 )
DECL_ADS( 1, sse2 )
DECL_ADS( 4, ssse3 )
DECL_ADS( 2, ssse3 )
DECL_ADS( 1, ssse3 )
DECL_ADS( 4, avx )
DECL_ADS( 2, avx )
DECL_ADS( 1, avx )

#undef DECL_PIXELS
#undef DECL_X1
#undef DECL_X4
#undef DECL_ADS

#endif
x264-snapshot-20120103-2245-stable/common/x86/pixel-a.asm
;*****************************************************************************
;* pixel.asm: x86 pixel metrics
;*****************************************************************************
;* Copyright (C) 2003-2011 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Holger Lubitz <holger@lubitz.org>
;*          Laurent Aimar <fenrir@via.ecp.fr>
;*          Alex Izvorski <aizvorksi@gmail.com>
;*          Jason Garrett-Glaser <darkshikari@gmail.com>
;*          Oskar Arvidsson <oskar@irock.se>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA 32
mask_ff:   times 16 db 0xff
           times 16 db 0
%if BIT_DEPTH == 10
ssim_c1:   times 4 dd 6697.7856    ; .01*.01*1023*1023*64
ssim_c2:   times 4 dd 3797644.4352 ; .03*.03*1023*1023*64*63
pf_64:     times 4 dd 64.0
pf_128:    times 4 dd 128.0
%elif BIT_DEPTH == 9
ssim_c1:   times 4 dd 1671         ; .01*.01*511*511*64
ssim_c2:   times 4 dd 947556       ; .03*.03*511*511*64*63
%else ; 8-bit
ssim_c1:   times 4 dd 416          ; .01*.01*255*255*64
ssim_c2:   times 4 dd 235963       ; .03*.03*255*255*64*63
%endif
mask_ac4:  dw 0, -1, -1, -1, 0, -1, -1, -1
mask_ac4b: dw 0, -1, 0, -1, -1, -1, -1, -1
mask_ac8:  dw 0, -1, -1, -1, -1, -1, -1, -1
hmul_4p:   times 2 db 1, 1, 1, 1, 1, -1, 1, -1
hmul_8p:   times 8 db 1
           times 4 db 1, -1
mask_10:   times 4 dw 0, -1
mask_1100: times 2 dd 0, -1
pb_pppm:   times 4 db 1,1,1,-1
deinterleave_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
intrax3_shuf: db 7,6,7,6,5,4,5,4,3,2,3,2,1,0,1,0

intrax9a_ddlr1: db  6, 7, 8, 9, 7, 8, 9,10, 4, 5, 6, 7, 3, 4, 5, 6
intrax9a_ddlr2: db  8, 9,10,11, 9,10,11,12, 2, 3, 4, 5, 1, 2, 3, 4
intrax9a_hdu1:  db 15, 4, 5, 6,14, 3,15, 4,14, 2,13, 1,13, 1,12, 0
intrax9a_hdu2:  db 13, 2,14, 3,12, 1,13, 2,12, 0,11,11,11,11,11,11
intrax9a_vrl1:  db 10,11,12,13, 3, 4, 5, 6,11,12,13,14, 5, 6, 7, 8
intrax9a_vrl2:  db  2,10,11,12, 1, 3, 4, 5,12,13,14,15, 6, 7, 8, 9
intrax9a_vh1:   db  6, 7, 8, 9, 6, 7, 8, 9, 4, 4, 4, 4, 3, 3, 3, 3
intrax9a_vh2:   db  6, 7, 8, 9, 6, 7, 8, 9, 2, 2, 2, 2, 1, 1, 1, 1
intrax9a_dc:    db  1, 2, 3, 4, 6, 7, 8, 9,-1,-1,-1,-1,-1,-1,-1,-1
intrax9a_lut:   db 0x60,0x68,0x80,0x00,0x08,0x20,0x40,0x28,0x48,0,0,0,0,0,0,0
pw_s01234567:   dw 0x8000,0x8001,0x8002,0x8003,0x8004,0x8005,0x8006,0x8007
pw_s01234657:   dw 0x8000,0x8001,0x8002,0x8003,0x8004,0x8006,0x8005,0x8007
intrax9_edge:   db  0, 0, 1, 2, 3, 7, 8, 9,10,11,12,13,14,15,15,15

intrax9b_ddlr1: db  6, 7, 8, 9, 4, 5, 6, 7, 7, 8, 9,10, 3, 4, 5, 6
intrax9b_ddlr2: db  8, 9,10,11, 2, 3, 4, 5, 9,10,11,12, 1, 2, 3, 4
intrax9b_hdu1:  db 15, 4, 5, 6,14, 2,13, 1,14, 3,15, 4,13, 1,12, 0
intrax9b_hdu2:  db 13, 2,14, 3,12, 0,11,11,12, 1,13, 2,11,11,11,11
intrax9b_vrl1:  db 10,11,12,13,11,12,13,14, 3, 4, 5, 6, 5, 6, 7, 8
intrax9b_vrl2:  db  2,10,11,12,12,13,14,15, 1, 3, 4, 5, 6, 7, 8, 9
intrax9b_vh1:   db  6, 7, 8, 9, 4, 4, 4, 4, 6, 7, 8, 9, 3, 3, 3, 3
intrax9b_vh2:   db  6, 7, 8, 9, 2, 2, 2, 2, 6, 7, 8, 9, 1, 1, 1, 1
intrax9b_edge2: db  6, 7, 8, 9, 6, 7, 8, 9, 4, 3, 2, 1, 4, 3, 2, 1
intrax9b_v1:    db  0, 1,-1,-1,-1,-1,-1,-1, 4, 5,-1,-1,-1,-1,-1,-1
intrax9b_v2:    db  2, 3,-1,-1,-1,-1,-1,-1, 6, 7,-1,-1,-1,-1,-1,-1
intrax9b_lut:   db 0x60,0x64,0x80,0x00,0x04,0x20,0x40,0x24,0x44,0,0,0,0,0,0,0

intra8x9_h1:   db  7, 7, 7, 7, 7, 7, 7, 7, 5, 5, 5, 5, 5, 5, 5, 5
intra8x9_h2:   db  6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4
intra8x9_h3:   db  3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1
intra8x9_h4:   db  2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0
intra8x9_ddl1: db  1, 2, 3, 4, 5, 6, 7, 8, 3, 4, 5, 6, 7, 8, 9,10
intra8x9_ddl2: db  2, 3, 4, 5, 6, 7, 8, 9, 4, 5, 6, 7, 8, 9,10,11
intra8x9_ddl3: db  5, 6, 7, 8, 9,10,11,12, 7, 8, 9,10,11,12,13,14
intra8x9_ddl4: db  6, 7, 8, 9,10,11,12,13, 8, 9,10,11,12,13,14,15
intra8x9_vl1:  db  0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8
intra8x9_vl2:  db  1, 2, 3, 4, 5, 6, 7, 8, 2, 3, 4, 5, 6, 7, 8, 9
intra8x9_vl3:  db  2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9,10
intra8x9_vl4:  db  3, 4, 5, 6, 7, 8, 9,10, 4, 5, 6, 7, 8, 9,10,11
intra8x9_ddr1: db  8, 9,10,11,12,13,14,15, 6, 7, 8, 9,10,11,12,13
intra8x9_ddr2: db  7, 8, 9,10,11,12,13,14, 5, 6, 7, 8, 9,10,11,12
intra8x9_ddr3: db  4, 5, 6, 7, 8, 9,10,11, 2, 3, 4, 5, 6, 7, 8, 9
intra8x9_ddr4: db  3, 4, 5, 6, 7, 8, 9,10, 1, 2, 3, 4, 5, 6, 7, 8
intra8x9_vr1:  db  8, 9,10,11,12,13,14,15, 7, 8, 9,10,11,12,13,14
intra8x9_vr2:  db  8, 9,10,11,12,13,14,15, 6, 8, 9,10,11,12,13,14
intra8x9_vr3:  db  5, 7, 8, 9,10,11,12,13, 3, 5, 7, 8, 9,10,11,12
intra8x9_vr4:  db  4, 6, 8, 9,10,11,12,13, 2, 4, 6, 8, 9,10,11,12
intra8x9_hd1:  db  3, 8, 9,10,11,12,13,14, 1, 6, 2, 7, 3, 8, 9,10
intra8x9_hd2:  db  2, 7, 3, 8, 9,10,11,12, 0, 5, 1, 6, 2, 7, 3, 8
intra8x9_hd3:  db  7, 8, 9,10,11,12,13,14, 3, 4, 5, 6, 7, 8, 9,10
intra8x9_hd4:  db  5, 6, 7, 8, 9,10,11,12, 1, 2, 3, 4, 5, 6, 7, 8
intra8x9_hu1:  db 13,12,11,10, 9, 8, 7, 6, 9, 8, 7, 6, 5, 4, 3, 2
intra8x9_hu2:  db 11,10, 9, 8, 7, 6, 5, 4, 7, 6, 5, 4, 3, 2, 1, 0
intra8x9_hu3:  db  5, 4, 3, 2, 1, 0,15,15, 1, 0,15,15,15,15,15,15
intra8x9_hu4:  db  3, 2, 1, 0,15,15,15,15,15,15,15,15,15,15,15,15
pw_s00112233:  dw 0x8000,0x8000,0x8001,0x8001,0x8002,0x8002,0x8003,0x8003
pw_s00001111:  dw 0x8000,0x8000,0x8000,0x8000,0x8001,0x8001,0x8001,0x8001

transd_shuf1: SHUFFLE_MASK_W 0, 8, 2, 10, 4, 12, 6, 14
transd_shuf2: SHUFFLE_MASK_W 1, 9, 3, 11, 5, 13, 7, 15

sw_f0:     dq 0xfff0, 0
sq_0f:     dq 0xffffffff, 0
pd_f0:     times 4 dd 0xffff0000

SECTION .text

cextern pb_0
cextern pb_1
cextern pw_1
cextern pw_8
cextern pw_16
cextern pw_64
cextern pw_00ff
cextern pw_ppppmmmm
cextern pw_ppmmppmm
cextern pw_pmpmpmpm
cextern pw_pmmpzzzz
cextern hsub_mul

;=============================================================================
; SSD
;=============================================================================

%ifdef HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; int pixel_ssd_MxN( uint16_t *, int, uint16_t *, int )
;-----------------------------------------------------------------------------
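; Rough C sketch of the function below (pix1/i_stride1/pix2/i_stride2 are
; illustrative names; strides are in pixels):
;   int ssd = 0;
;   for( int y = 0; y < N; y++ )
;       for( int x = 0; x < M; x++ )
;       {
;           int d = pix1[y*i_stride1+x] - pix2[y*i_stride2+x];
;           ssd += d*d;
;       }
;   return ssd;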
%macro SSD_ONE 2
cglobal pixel_ssd_%1x%2, 4,5,6
    mov     r4, %1*%2/mmsize
    pxor    m0, m0
.loop:
    mova    m1, [r0]
%if %1 <= mmsize/2
    mova    m3, [r0+r1*2]
    %define offset r3*2
    %define num_rows 2
%else
    mova    m3, [r0+mmsize]
    %define offset mmsize
    %define num_rows 1
%endif
    lea     r0, [r0+r1*2*num_rows]
    psubw   m1, [r2]
    psubw   m3, [r2+offset]
    lea     r2, [r2+r3*2*num_rows]
    pmaddwd m1, m1
    pmaddwd m3, m3
    paddd   m0, m1
    paddd   m0, m3
    dec     r4
    jg .loop
    HADDD   m0, m5
    movd   eax, m0
    RET
%endmacro

%macro SSD_16_MMX 2
cglobal pixel_ssd_%1x%2, 4,5
    mov     r4, %1*%2/mmsize/2
    pxor    m0, m0
.loop:
    mova    m1, [r0]
    mova    m2, [r2]
    mova    m3, [r0+mmsize]
    mova    m4, [r2+mmsize]
    mova    m5, [r0+mmsize*2]
    mova    m6, [r2+mmsize*2]
    mova    m7, [r0+mmsize*3]
    psubw   m1, m2
    psubw   m3, m4
    mova    m2, [r2+mmsize*3]
    psubw   m5, m6
    pmaddwd m1, m1
    psubw   m7, m2
    pmaddwd m3, m3
    pmaddwd m5, m5
    lea     r0, [r0+r1*2]
    lea     r2, [r2+r3*2]
    pmaddwd m7, m7
    paddd   m1, m3
    paddd   m5, m7
    paddd   m0, m1
    paddd   m0, m5
    dec     r4
    jg .loop
    HADDD   m0, m7
    movd   eax, m0
    RET
%endmacro

INIT_MMX mmx2
SSD_ONE     4,  4
SSD_ONE     4,  8
SSD_ONE     4, 16
SSD_ONE     8,  4
SSD_ONE     8,  8
SSD_ONE     8, 16
SSD_16_MMX 16,  8
SSD_16_MMX 16, 16
INIT_XMM sse2
SSD_ONE     8,  4
SSD_ONE     8,  8
SSD_ONE     8, 16
SSD_ONE    16,  8
SSD_ONE    16, 16
%endif ; HIGH_BIT_DEPTH

%ifndef HIGH_BIT_DEPTH
%macro SSD_LOAD_FULL 5
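; %1-%4: load offsets ([t0+%1],[t2+%2],[t0+%3],[t2+%4]); %5: rows to advance
; the pointers by afterwards (1 or 2; anything else leaves them untouched)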
    mova      m1, [t0+%1]
    mova      m2, [t2+%2]
    mova      m3, [t0+%3]
    mova      m4, [t2+%4]
%if %5==1
    add       t0, t1
    add       t2, t3
%elif %5==2
    lea       t0, [t0+2*t1]
    lea       t2, [t2+2*t3]
%endif
%endmacro

%macro LOAD 5
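; %1-%2: dst regs for two pix1 loads; %3-%4: their addresses; %5: nonzero to
; advance t0 by two rows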
    movh      m%1, %3
    movh      m%2, %4
%if %5
    lea       t0, [t0+2*t1]
%endif
%endmacro

%macro JOIN 7
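; %1-%2: pix1 regs from LOAD, replaced by pix1-pix2 word differences; %3-%4:
; tmps for pix2; %5-%6: pix2 addresses; %7: nonzero to advance t2 by two rows.
; m7 needn't be zero here: both operands get the same m7 bytes interleaved
; into their high halves, so they cancel in the psubw.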
    movh      m%3, %5
    movh      m%4, %6
%if %7
    lea       t2, [t2+2*t3]
%endif
    punpcklbw m%1, m7
    punpcklbw m%3, m7
    psubw     m%1, m%3
    punpcklbw m%2, m7
    punpcklbw m%4, m7
    psubw     m%2, m%4
%endmacro

%macro JOIN_SSE2 7
    movh      m%3, %5
    movh      m%4, %6
%if %7
    lea       t2, [t2+2*t3]
%endif
    punpcklqdq m%1, m%2
    punpcklqdq m%3, m%4
    DEINTB %2, %1, %4, %3, 7
    psubw m%2, m%4
    psubw m%1, m%3
%endmacro

%macro JOIN_SSSE3 7
    movh      m%3, %5
    movh      m%4, %6
%if %7
    lea       t2, [t2+2*t3]
%endif
    punpcklbw m%1, m%3
    punpcklbw m%2, m%4
%endmacro

%macro SSD_LOAD_HALF 5
    LOAD      1, 2, [t0+%1], [t0+%3], 1
    JOIN      1, 2, 3, 4, [t2+%2], [t2+%4], 1
    LOAD      3, 4, [t0+%1], [t0+%3], %5
    JOIN      3, 4, 5, 6, [t2+%2], [t2+%4], %5
%endmacro

%macro SSD_CORE 7-8
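; %1-%4: pix1a/pix2a/pix1b/pix2b as raw bytes in FULL mode, or ready-made word
; diffs otherwise; %5: zero reg (FULL mode only); %6-%7: tmps; %8: FULL selects
; the byte path, whose psubusb/psubusb/por sequence builds per-byte |a-b|
; without needing a signed byte subtract.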
%ifidn %8, FULL
    mova      m%6, m%2
    mova      m%7, m%4
    psubusb   m%2, m%1
    psubusb   m%4, m%3
    psubusb   m%1, m%6
    psubusb   m%3, m%7
    por       m%1, m%2
    por       m%3, m%4
    punpcklbw m%2, m%1, m%5
    punpckhbw m%1, m%5
    punpcklbw m%4, m%3, m%5
    punpckhbw m%3, m%5
%endif
    pmaddwd   m%1, m%1
    pmaddwd   m%2, m%2
    pmaddwd   m%3, m%3
    pmaddwd   m%4, m%4
%endmacro

%macro SSD_CORE_SSE2 7-8
%ifidn %8, FULL
    DEINTB %6, %1, %7, %2, %5
    psubw m%6, m%7
    psubw m%1, m%2
    SWAP %6, %2, %1
    DEINTB %6, %3, %7, %4, %5
    psubw m%6, m%7
    psubw m%3, m%4
    SWAP %6, %4, %3
%endif
    pmaddwd   m%1, m%1
    pmaddwd   m%2, m%2
    pmaddwd   m%3, m%3
    pmaddwd   m%4, m%4
%endmacro

%macro SSD_CORE_SSSE3 7-8
%ifidn %8, FULL
    punpckhbw m%6, m%1, m%2
    punpckhbw m%7, m%3, m%4
    punpcklbw m%1, m%2
    punpcklbw m%3, m%4
    SWAP %6, %2, %3
    SWAP %7, %4
%endif
    pmaddubsw m%1, m%5
    pmaddubsw m%2, m%5
    pmaddubsw m%3, m%5
    pmaddubsw m%4, m%5
    pmaddwd   m%1, m%1
    pmaddwd   m%2, m%2
    pmaddwd   m%3, m%3
    pmaddwd   m%4, m%4
%endmacro

%macro SSD_ITER 6
    SSD_LOAD_%1 %2,%3,%4,%5,%6
    SSD_CORE  1, 2, 3, 4, 7, 5, 6, %1
    paddd     m1, m2
    paddd     m3, m4
    paddd     m0, m1
    paddd     m0, m3
%endmacro

;-----------------------------------------------------------------------------
; int pixel_ssd_16x16( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
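; Note on the control flow below: only the square sizes (%1 == %2) contain an
; actual loop.  The non-square versions just load the iteration count into al
; and jump into the square version past its own count load (.startloop), so
; all WxH variants share one loop body per instruction set.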
%macro SSD 2
%if %1 != %2
    %assign function_align 8
%else
    %assign function_align 16
%endif
cglobal pixel_ssd_%1x%2, 0,0,0
    mov     al, %1*%2/mmsize/2

%if %1 != %2
    jmp mangle(x264_pixel_ssd_%1x%1 %+ SUFFIX %+ .startloop)
%else

.startloop:
%ifdef ARCH_X86_64
    DECLARE_REG_TMP 0,1,2,3
    PROLOGUE 0,0,8
%else
    PROLOGUE 0,5
    DECLARE_REG_TMP 1,2,3,4
    mov t0, r0m
    mov t1, r1m
    mov t2, r2m
    mov t3, r3m
%endif

%if cpuflag(ssse3)
    mova    m7, [hsub_mul]
%elifidn cpuname, sse2
    mova    m7, [pw_00ff]
%elif %1 >= mmsize
    pxor    m7, m7
%endif
    pxor    m0, m0

ALIGN 16
.loop:
%if %1 > mmsize
    SSD_ITER FULL, 0, 0, mmsize, mmsize, 1
%elif %1 == mmsize
    SSD_ITER FULL, 0, 0, t1, t3, 2
%else
    SSD_ITER HALF, 0, 0, t1, t3, 2
%endif
    dec     al
    jg .loop
    HADDD   m0, m1
    movd   eax, m0
    RET
%endif
%endmacro

INIT_MMX mmx
SSD 16, 16
SSD 16,  8
SSD  8,  8
SSD  8, 16
SSD  4,  4
SSD  8,  4
SSD  4,  8
SSD  4, 16
INIT_XMM sse2slow
SSD 16, 16
SSD  8,  8
SSD 16,  8
SSD  8, 16
SSD  8,  4
INIT_XMM sse2
%define SSD_CORE SSD_CORE_SSE2
%define JOIN JOIN_SSE2
SSD 16, 16
SSD  8,  8
SSD 16,  8
SSD  8, 16
SSD  8,  4
INIT_XMM ssse3
%define SSD_CORE SSD_CORE_SSSE3
%define JOIN JOIN_SSSE3
SSD 16, 16
SSD  8,  8
SSD 16,  8
SSD  8, 16
SSD  8,  4
INIT_XMM avx
SSD 16, 16
SSD  8,  8
SSD 16,  8
SSD  8, 16
SSD  8,  4
INIT_MMX ssse3
SSD  4,  4
SSD  4,  8
SSD  4, 16
INIT_XMM xop
SSD 16, 16
SSD  8,  8
SSD 16,  8
SSD  8, 16
SSD  8,  4
%assign function_align 16
%endif ; !HIGH_BIT_DEPTH

;-----------------------------------------------------------------------------
; void pixel_ssd_nv12_core( uint16_t *pixuv1, int stride1, uint16_t *pixuv2, int stride2,
;                           int width, int height, uint64_t *ssd_u, uint64_t *ssd_v )
;
; The maximum width this function can handle without risk of overflow is given
; in the following equation: (mmsize in bits)
;
;   2 * mmsize/32 * (2^32 - 1) / (2^BIT_DEPTH - 1)^2
;
; For 10-bit this puts the limit at 16416 with MMX and at 32832 with XMM. At sane
; distortion levels it will take much more than that though.
;-----------------------------------------------------------------------------
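; A worked instance of the bound above for BIT_DEPTH=10 (plain arithmetic,
; nothing new): with mmx, mmsize is 64 bits, so the limit is
;   2 * 64/32 * (2^32 - 1) / (2^10 - 1)^2 = 4 * 4294967295 / 1046529 ~= 16416
; and doubling the register width to 128 bits (xmm) doubles it to ~32832.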
%ifdef HIGH_BIT_DEPTH
%macro SSD_NV12 0
cglobal pixel_ssd_nv12_core, 6,7,7
    shl        r4d, 2
    FIX_STRIDES r1, r3
    add         r0, r4
    add         r2, r4
    xor         r6, r6
    pxor        m4, m4
    pxor        m5, m5
    pxor        m6, m6
.loopy:
    mov         r6, r4
    neg         r6
    pxor        m2, m2
    pxor        m3, m3
.loopx:
    mova        m0, [r0+r6]
    mova        m1, [r0+r6+mmsize]
    psubw       m0, [r2+r6]
    psubw       m1, [r2+r6+mmsize]
    PSHUFLW     m0, m0, q3120
    PSHUFLW     m1, m1, q3120
%if mmsize==16
    pshufhw     m0, m0, q3120
    pshufhw     m1, m1, q3120
%endif
    pmaddwd     m0, m0
    pmaddwd     m1, m1
    paddd       m2, m0
    paddd       m3, m1
    add         r6, 2*mmsize
    jl .loopx
%if mmsize==16 ; using HADDD would remove the mmsize/32 part from the
               ; equation above, putting the width limit at 8208
    punpckhdq   m0, m2, m6
    punpckhdq   m1, m3, m6
    punpckldq   m2, m6
    punpckldq   m3, m6
    paddq       m3, m2
    paddq       m1, m0
    paddq       m4, m3
    paddq       m4, m1
%else ; unfortunately paddq is sse2
      ; emulate 48 bit precision for mmx2 instead
    mova        m0, m2
    mova        m1, m3
    punpcklwd   m2, m6
    punpcklwd   m3, m6
    punpckhwd   m0, m6
    punpckhwd   m1, m6
    paddd       m3, m2
    paddd       m1, m0
    paddd       m4, m3
    paddd       m5, m1
%endif
    add         r0, r1
    add         r2, r3
    dec        r5d
    jg .loopy
    mov         r3, r6m
    mov         r4, r7m
%if mmsize==16
    movq      [r3], m4
    movhps    [r4], m4
%else ; fixup for mmx2
    SBUTTERFLY dq, 4, 5, 0
    mova        m0, m4
    psrld       m4, 16
    paddd       m5, m4
    pslld       m0, 16
    SBUTTERFLY dq, 0, 5, 4
    psrlq       m0, 16
    psrlq       m5, 16
    movq      [r3], m0
    movq      [r4], m5
%endif
    RET
%endmacro ; SSD_NV12
%endif ; HIGH_BIT_DEPTH

%ifndef HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void pixel_ssd_nv12_core( uint8_t *pixuv1, int stride1, uint8_t *pixuv2, int stride2,
;                           int width, int height, uint64_t *ssd_u, uint64_t *ssd_v )
;
; This implementation can potentially overflow on image widths >= 11008 (or
; 6604 if interlaced), since it is called on blocks of height up to 12 (resp.
; 20). At sane distortion levels it will take much more than that though.
;-----------------------------------------------------------------------------
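; A rough sanity check of the 11008 figure (mmx worst case; a sketch, not a
; proof): a row of `width' chroma pixels is 2*width bytes of interleaved UV,
; i.e. width/4 8-byte chunks, and every chunk adds two squared 8-bit diffs
; (<= 2*255^2) to each dword lane of the accumulator.  Overflow therefore
; needs 12 * (width/4) * 2 * 255^2 > 2^32-1, i.e. width > ~11008; with
; height 20 the same bound drops to ~6604.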
%macro SSD_NV12 0
cglobal pixel_ssd_nv12_core, 6,7
    shl    r4d, 1
    add     r0, r4
    add     r2, r4
    pxor    m3, m3
    pxor    m4, m4
    mova    m5, [pw_00ff]
.loopy:
    mov     r6, r4
    neg     r6
.loopx:
    mova    m0, [r0+r6]
    mova    m1, [r2+r6]
    psubusb m0, m1
    psubusb m1, [r0+r6]
    por     m0, m1
    psrlw   m2, m0, 8
    pand    m0, m5
    pmaddwd m2, m2
    pmaddwd m0, m0
    paddd   m3, m0
    paddd   m4, m2
    add     r6, mmsize
    jl .loopx
    add     r0, r1
    add     r2, r3
    dec    r5d
    jg .loopy
    mov     r3, r6m
    mov     r4, r7m
    mova    m5, [sq_0f]
    HADDD   m3, m0
    HADDD   m4, m0
    pand    m3, m5
    pand    m4, m5
    movq  [r3], m3
    movq  [r4], m4
    RET
%endmacro ; SSD_NV12
%endif ; !HIGH_BIT_DEPTH

INIT_MMX mmx2
SSD_NV12
INIT_XMM sse2
SSD_NV12
INIT_XMM avx
SSD_NV12

;=============================================================================
; variance
;=============================================================================

%macro VAR_START 1
    pxor  m5, m5    ; sum
    pxor  m6, m6    ; sum squared
%ifndef HIGH_BIT_DEPTH
%if %1
    mova  m7, [pw_00ff]
%else
    pxor  m7, m7    ; zero
%endif
%endif ; !HIGH_BIT_DEPTH
%endmacro

%macro VAR_END 2
%ifdef HIGH_BIT_DEPTH
%if mmsize == 8 && %1*%2 == 256
    HADDUW  m5, m2
%else
    HADDW   m5, m2
%endif
%else ; !HIGH_BIT_DEPTH
    HADDW   m5, m2
%endif ; HIGH_BIT_DEPTH
    movd   eax, m5
    HADDD   m6, m1
    movd   edx, m6
%ifdef ARCH_X86_64
    shl    rdx, 32
    add    rax, rdx
%endif
    RET
%endmacro

%macro VAR_CORE 0
    paddw     m5, m0
    paddw     m5, m3
    paddw     m5, m1
    paddw     m5, m4
    pmaddwd   m0, m0
    pmaddwd   m3, m3
    pmaddwd   m1, m1
    pmaddwd   m4, m4
    paddd     m6, m0
    paddd     m6, m3
    paddd     m6, m1
    paddd     m6, m4
%endmacro

%macro VAR_2ROW 2
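; %1: offset of the second load (8*SIZEOF_PIXEL for the other half of a
; 16-wide row, or r1 for the next row); %2: iteration count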
    mov      r2d, %2
.loop:
%ifdef HIGH_BIT_DEPTH
    mova      m0, [r0]
    mova      m1, [r0+mmsize]
    mova      m3, [r0+%1]
    mova      m4, [r0+%1+mmsize]
%else ; !HIGH_BIT_DEPTH
    mova      m0, [r0]
    punpckhbw m1, m0, m7
    mova      m3, [r0+%1]
    mova      m4, m3
    punpcklbw m0, m7
%endif ; HIGH_BIT_DEPTH
%ifidn %1, r1
    lea       r0, [r0+%1*2]
%else
    add       r0, r1
%endif
%ifndef HIGH_BIT_DEPTH
    punpcklbw m3, m7
    punpckhbw m4, m7
%endif ; !HIGH_BIT_DEPTH
    VAR_CORE
    dec r2d
    jg .loop
%endmacro

;-----------------------------------------------------------------------------
; int pixel_var_wxh( uint8_t *, int )
;-----------------------------------------------------------------------------
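; A hedged C sketch of the contract (the scalar loop is illustrative, but the
; packing matches VAR_END, which returns the pixel sum in eax and the sum of
; squares in edx, combined into rax on x86_64):
;
;   #include <stdint.h>
;   static uint64_t var_wxh( const uint8_t *pix, int stride, int w, int h )
;   {
;       uint32_t sum = 0, sqr = 0;
;       for( int y = 0; y < h; y++, pix += stride )
;           for( int x = 0; x < w; x++ )
;           {
;               sum += pix[x];
;               sqr += pix[x] * pix[x];
;           }
;       return sum + ((uint64_t)sqr << 32);
;   }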
INIT_MMX mmx2
cglobal pixel_var_16x16, 2,3
    FIX_STRIDES r1
    VAR_START 0
    VAR_2ROW 8*SIZEOF_PIXEL, 16
    VAR_END 16, 16

cglobal pixel_var_8x16, 2,3
    FIX_STRIDES r1
    VAR_START 0
    VAR_2ROW r1, 8
    VAR_END 8, 16

cglobal pixel_var_8x8, 2,3
    FIX_STRIDES r1
    VAR_START 0
    VAR_2ROW r1, 4
    VAR_END 8, 8

%ifdef HIGH_BIT_DEPTH
%macro VAR 0
cglobal pixel_var_16x16, 2,3,8
    FIX_STRIDES r1
    VAR_START 0
    VAR_2ROW r1, 8
    VAR_END 16, 16

cglobal pixel_var_8x8, 2,3,8
    lea       r2, [r1*3]
    VAR_START 0
    mova      m0, [r0]
    mova      m1, [r0+r1*2]
    mova      m3, [r0+r1*4]
    mova      m4, [r0+r2*2]
    lea       r0, [r0+r1*8]
    VAR_CORE
    mova      m0, [r0]
    mova      m1, [r0+r1*2]
    mova      m3, [r0+r1*4]
    mova      m4, [r0+r2*2]
    VAR_CORE
    VAR_END 8, 8
%endmacro ; VAR

INIT_XMM sse2
VAR
INIT_XMM avx
VAR
INIT_XMM xop
VAR
%endif ; HIGH_BIT_DEPTH

%ifndef HIGH_BIT_DEPTH
%macro VAR 0
cglobal pixel_var_16x16, 2,3,8
    VAR_START 1
    mov      r2d, 8
.loop:
    mova      m0, [r0]
    mova      m3, [r0+r1]
    DEINTB    1, 0, 4, 3, 7
    lea       r0, [r0+r1*2]
    VAR_CORE
    dec r2d
    jg .loop
    VAR_END 16, 16

cglobal pixel_var_8x8, 2,4,8
    VAR_START 1
    mov      r2d, 2
    lea       r3, [r1*3]
.loop:
    movh      m0, [r0]
    movh      m3, [r0+r1]
    movhps    m0, [r0+r1*2]
    movhps    m3, [r0+r3]
    DEINTB    1, 0, 4, 3, 7
    lea       r0, [r0+r1*4]
    VAR_CORE
    dec r2d
    jg .loop
    VAR_END 8, 8

cglobal pixel_var_8x16, 2,4,8
    VAR_START 1
    mov      r2d, 4
    lea       r3, [r1*3]
.loop:
    movh      m0, [r0]
    movh      m3, [r0+r1]
    movhps    m0, [r0+r1*2]
    movhps    m3, [r0+r3]
    DEINTB    1, 0, 4, 3, 7
    lea       r0, [r0+r1*4]
    VAR_CORE
    dec r2d
    jg .loop
    VAR_END 8, 16
%endmacro ; VAR

INIT_XMM sse2
VAR
INIT_XMM avx
VAR
INIT_XMM xop
VAR
%endif ; !HIGH_BIT_DEPTH

%macro VAR2_END 1
    HADDW   m5, m7
    movd   r1d, m5
    imul   r1d, r1d
    HADDD   m6, m1
    shr    r1d, %1
    movd   eax, m6
    mov   [r4], eax
    sub    eax, r1d  ; sqr - (sum * sum >> shift)
    RET
%endmacro

;-----------------------------------------------------------------------------
; int pixel_var2_8x8( pixel *, int, pixel *, int, int * )
;-----------------------------------------------------------------------------
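; A hedged C sketch of the contract (names illustrative; the final subtraction
; mirrors VAR2_END, and *ssd receives the raw sum of squares):
;
;   #include <stdint.h>
;   static int var2_8xh( const uint8_t *p1, int s1, const uint8_t *p2, int s2,
;                        int *ssd, int h, int shift ) /* shift: 6 if h==8, 7 if h==16 */
;   {
;       int sum = 0;
;       unsigned sqr = 0;
;       for( int y = 0; y < h; y++, p1 += s1, p2 += s2 )
;           for( int x = 0; x < 8; x++ )
;           {
;               int d = p1[x] - p2[x];
;               sum += d;
;               sqr += d*d;
;           }
;       *ssd = sqr;
;       return sqr - (sum * sum >> shift);
;   }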
%macro VAR2_8x8_MMX 2
cglobal pixel_var2_8x%1, 5,6
    FIX_STRIDES r1, r3
    VAR_START 0
    mov      r5d, %1
.loop:
%ifdef HIGH_BIT_DEPTH
    mova      m0, [r0]
    mova      m1, [r0+mmsize]
    psubw     m0, [r2]
    psubw     m1, [r2+mmsize]
%else ; !HIGH_BIT_DEPTH
    movq      m0, [r0]
    movq      m1, m0
    movq      m2, [r2]
    movq      m3, m2
    punpcklbw m0, m7
    punpckhbw m1, m7
    punpcklbw m2, m7
    punpckhbw m3, m7
    psubw     m0, m2
    psubw     m1, m3
%endif ; HIGH_BIT_DEPTH
    paddw     m5, m0
    paddw     m5, m1
    pmaddwd   m0, m0
    pmaddwd   m1, m1
    paddd     m6, m0
    paddd     m6, m1
    add       r0, r1
    add       r2, r3
    dec       r5d
    jg .loop
    VAR2_END %2
%endmacro

%ifndef ARCH_X86_64
INIT_MMX mmx2
VAR2_8x8_MMX  8, 6
VAR2_8x8_MMX 16, 7
%endif

%macro VAR2_8x8_SSE2 2
cglobal pixel_var2_8x%1, 5,6,8
    VAR_START 1
    mov      r5d, %1/2
.loop:
%ifdef HIGH_BIT_DEPTH
    mova      m0, [r0]
    mova      m1, [r0+r1*2]
    mova      m2, [r2]
    mova      m3, [r2+r3*2]
%else ; !HIGH_BIT_DEPTH
    movq      m1, [r0]
    movhps    m1, [r0+r1]
    movq      m3, [r2]
    movhps    m3, [r2+r3]
    DEINTB    0, 1, 2, 3, 7
%endif ; HIGH_BIT_DEPTH
    psubw     m0, m2
    psubw     m1, m3
    paddw     m5, m0
    paddw     m5, m1
    pmaddwd   m0, m0
    pmaddwd   m1, m1
    paddd     m6, m0
    paddd     m6, m1
    lea       r0, [r0+r1*2*SIZEOF_PIXEL]
    lea       r2, [r2+r3*2*SIZEOF_PIXEL]
    dec      r5d
    jg .loop
    VAR2_END %2
%endmacro

INIT_XMM sse2
VAR2_8x8_SSE2  8, 6
VAR2_8x8_SSE2 16, 7

%ifndef HIGH_BIT_DEPTH
%macro VAR2_8x8_SSSE3 2
cglobal pixel_var2_8x%1, 5,6,8
    pxor      m5, m5    ; sum
    pxor      m6, m6    ; sum squared
    mova      m7, [hsub_mul]
    mov      r5d, %1/4
.loop:
    movq      m0, [r0]
    movq      m2, [r2]
    movq      m1, [r0+r1]
    movq      m3, [r2+r3]
    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    punpcklbw m0, m2
    punpcklbw m1, m3
    movq      m2, [r0]
    movq      m3, [r2]
    punpcklbw m2, m3
    movq      m3, [r0+r1]
    movq      m4, [r2+r3]
    punpcklbw m3, m4
    pmaddubsw m0, m7
    pmaddubsw m1, m7
    pmaddubsw m2, m7
    pmaddubsw m3, m7
    paddw     m5, m0
    paddw     m5, m1
    paddw     m5, m2
    paddw     m5, m3
    pmaddwd   m0, m0
    pmaddwd   m1, m1
    pmaddwd   m2, m2
    pmaddwd   m3, m3
    paddd     m6, m0
    paddd     m6, m1
    paddd     m6, m2
    paddd     m6, m3
    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    dec      r5d
    jg .loop
    VAR2_END %2
%endmacro

INIT_XMM ssse3
VAR2_8x8_SSSE3  8, 6
VAR2_8x8_SSSE3 16, 7
INIT_XMM xop
VAR2_8x8_SSSE3  8, 6
VAR2_8x8_SSSE3 16, 7

%endif ; !HIGH_BIT_DEPTH

;=============================================================================
; SATD
;=============================================================================

%macro JDUP 2
%if cpuflag(sse4)
    ; just use shufps on anything post conroe
    shufps %1, %2, 0
%elif cpuflag(ssse3)
    ; join 2x 32 bit and duplicate them
    ; emulating shufps is faster on conroe
    punpcklqdq %1, %2
    movsldup %1, %1
%else
    ; no dup needed: the sse2 path zero-extends to words and does a full-width h_2d
    punpckldq %1, %2
%endif
%endmacro

%macro HSUMSUB 5
    pmaddubsw m%2, m%5
    pmaddubsw m%1, m%5
    pmaddubsw m%4, m%5
    pmaddubsw m%3, m%5
%endmacro

%macro DIFF_UNPACK_SSE2 5
    punpcklbw m%1, m%5
    punpcklbw m%2, m%5
    punpcklbw m%3, m%5
    punpcklbw m%4, m%5
    psubw m%1, m%2
    psubw m%3, m%4
%endmacro

%macro DIFF_SUMSUB_SSSE3 5
    HSUMSUB %1, %2, %3, %4, %5
    psubw m%1, m%2
    psubw m%3, m%4
%endmacro

%macro LOAD_DUP_2x4P 4 ; dst, tmp, 2* pointer
    movd %1, %3
    movd %2, %4
    JDUP %1, %2
%endmacro

%macro LOAD_DUP_4x8P_CONROE 8 ; 4*dst, 4*pointer
    movddup m%3, %6
    movddup m%4, %8
    movddup m%1, %5
    movddup m%2, %7
%endmacro

%macro LOAD_DUP_4x8P_PENRYN 8
    ; penryn and nehalem run punpcklqdq and movddup in different units
    movh m%3, %6
    movh m%4, %8
    punpcklqdq m%3, m%3
    movddup m%1, %5
    punpcklqdq m%4, m%4
    movddup m%2, %7
%endmacro

%macro LOAD_SUMSUB_8x2P 9
    LOAD_DUP_4x8P %1, %2, %3, %4, %6, %7, %8, %9
    DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
%endmacro

%macro LOAD_SUMSUB_8x4P_SSSE3 7-10 r0, r2, 0
; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?]
    LOAD_SUMSUB_8x2P %1, %2, %5, %6, %7, [%8], [%9], [%8+r1], [%9+r3]
    LOAD_SUMSUB_8x2P %3, %4, %5, %6, %7, [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5]
%if %10
    lea %8, [%8+4*r1]
    lea %9, [%9+4*r3]
%endif
%endmacro

%macro LOAD_SUMSUB_16P_SSSE3 7 ; 2*dst, 2*tmp, mul, 2*ptr
    movddup m%1, [%7]
    movddup m%2, [%7+8]
    mova m%4, [%6]
    movddup m%3, m%4
    punpckhqdq m%4, m%4
    DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
%endmacro

%macro LOAD_SUMSUB_16P_SSE2 7 ; 2*dst, 2*tmp, mask, 2*ptr
    movu  m%4, [%7]
    mova  m%2, [%6]
    DEINTB %1, %2, %3, %4, %5
    psubw m%1, m%3
    psubw m%2, m%4
    SUMSUB_BA w, %1, %2, %3
%endmacro

%macro LOAD_SUMSUB_16x4P 10-13 r0, r2, none
; 8x dest, 1x tmp, 1x mul, [2* ptr] [2nd tmp]
    LOAD_SUMSUB_16P %1, %5, %2, %3, %10, %11, %12
    LOAD_SUMSUB_16P %2, %6, %3, %4, %10, %11+r1, %12+r3
    LOAD_SUMSUB_16P %3, %7, %4, %9, %10, %11+2*r1, %12+2*r3
    LOAD_SUMSUB_16P %4, %8, %13, %9, %10, %11+r4, %12+r5
%endmacro

; in: r4=3*stride1, r5=3*stride2
; in: %2 = horizontal offset
; in: %3 = whether we need to increment pix1 and pix2
; clobber: m3..m7
; out: %1 = satd
%macro SATD_4x4_MMX 3
    %xdefine %%n n%1
    %assign offset %2*SIZEOF_PIXEL
    LOAD_DIFF m4, m3, none, [r0+     offset], [r2+     offset]
    LOAD_DIFF m5, m3, none, [r0+  r1+offset], [r2+  r3+offset]
    LOAD_DIFF m6, m3, none, [r0+2*r1+offset], [r2+2*r3+offset]
    LOAD_DIFF m7, m3, none, [r0+  r4+offset], [r2+  r5+offset]
%if %3
    lea  r0, [r0+4*r1]
    lea  r2, [r2+4*r3]
%endif
    HADAMARD4_2D 4, 5, 6, 7, 3, %%n
    paddw m4, m6
    SWAP %%n, 4
%endmacro

%macro SATD_8x4_SSE 8-9
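; %1: sse2/ssse3 behavior selector; %2-%5: regs holding diffs; %6-%7: tmps;
; %8: accumulator; optional %9 = swap to start a fresh sum in m%8 instead of
; adding to it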
%ifidn %1, sse2
    HADAMARD4_2D_SSE %2, %3, %4, %5, %6, amax
%else
    HADAMARD4_V %2, %3, %4, %5, %6
    ; doing the abs first is a slight advantage
    ABSW2 m%2, m%4, m%2, m%4, m%6, m%7
    ABSW2 m%3, m%5, m%3, m%5, m%6, m%7
    HADAMARD 1, max, %2, %4, %6, %7
%endif
%ifnidn %9, swap
    paddw m%8, m%2
%else
    SWAP %8, %2
%endif
%ifidn %1, sse2
    paddw m%8, m%4
%else
    HADAMARD 1, max, %3, %5, %6, %7
    paddw m%8, m%3
%endif
%endmacro

%macro SATD_START_MMX 0
    FIX_STRIDES r1, r3
    lea  r4, [3*r1] ; 3*stride1
    lea  r5, [3*r3] ; 3*stride2
%endmacro

%macro SATD_END_MMX 0
%ifdef HIGH_BIT_DEPTH
    HADDUW      m0, m1
    movd       eax, m0
%else ; !HIGH_BIT_DEPTH
    pshufw      m1, m0, q1032
    paddw       m0, m1
    pshufw      m1, m0, q2301
    paddw       m0, m1
    movd       eax, m0
    and        eax, 0xffff
%endif ; HIGH_BIT_DEPTH
    RET
%endmacro

; FIXME avoid the spilling of regs to hold 3*stride.
; for small blocks on x86_32, modify pixel pointer instead.

;-----------------------------------------------------------------------------
; int pixel_satd_16x16( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
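; For orientation, a plain scalar C sketch of the 4x4 satd these kernels batch
; up (illustrative; x264's actual C reference packs two values per accumulator,
; this version trades speed for clarity):
;
;   #include <stdint.h>
;   #include <stdlib.h>
;   static int satd_4x4( const uint8_t *p1, int s1, const uint8_t *p2, int s2 )
;   {
;       int d[4][4], sum = 0;
;       for( int y = 0; y < 4; y++ )      /* differences */
;           for( int x = 0; x < 4; x++ )
;               d[y][x] = p1[y*s1+x] - p2[y*s2+x];
;       for( int y = 0; y < 4; y++ )      /* horizontal 4-point hadamard */
;       {
;           int a0 = d[y][0]+d[y][1], a1 = d[y][0]-d[y][1];
;           int a2 = d[y][2]+d[y][3], a3 = d[y][2]-d[y][3];
;           d[y][0] = a0+a2; d[y][1] = a1+a3; d[y][2] = a0-a2; d[y][3] = a1-a3;
;       }
;       for( int x = 0; x < 4; x++ )      /* vertical pass + sum of abs */
;       {
;           int a0 = d[0][x]+d[1][x], a1 = d[0][x]-d[1][x];
;           int a2 = d[2][x]+d[3][x], a3 = d[2][x]-d[3][x];
;           sum += abs(a0+a2) + abs(a1+a3) + abs(a0-a2) + abs(a1-a3);
;       }
;       return sum >> 1;
;   }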
INIT_MMX mmx2
cglobal pixel_satd_16x4_internal
    SATD_4x4_MMX m2,  0, 0
    SATD_4x4_MMX m1,  4, 0
    paddw        m0, m2
    SATD_4x4_MMX m2,  8, 0
    paddw        m0, m1
    SATD_4x4_MMX m1, 12, 0
    paddw        m0, m2
    paddw        m0, m1
    ret

cglobal pixel_satd_8x8_internal
    SATD_4x4_MMX m2,  0, 0
    SATD_4x4_MMX m1,  4, 1
    paddw        m0, m2
    paddw        m0, m1
pixel_satd_8x4_internal_mmx2:
    SATD_4x4_MMX m2,  0, 0
    SATD_4x4_MMX m1,  4, 0
    paddw        m0, m2
    paddw        m0, m1
    ret

%ifdef HIGH_BIT_DEPTH
%macro SATD_MxN_MMX 3
cglobal pixel_satd_%1x%2, 4,7
    SATD_START_MMX
    pxor   m0, m0
    call pixel_satd_%1x%3_internal_mmx2
    HADDUW m0, m1
    movd  r6d, m0
%rep %2/%3-1
    pxor   m0, m0
    lea    r0, [r0+4*r1]
    lea    r2, [r2+4*r3]
    call pixel_satd_%1x%3_internal_mmx2
    movd   m2, r4
    HADDUW m0, m1
    movd   r4, m0
    add    r6, r4
    movd   r4, m2
%endrep
    movifnidn eax, r6d
    RET
%endmacro

SATD_MxN_MMX 16, 16, 4
SATD_MxN_MMX 16,  8, 4
SATD_MxN_MMX  8, 16, 8
%endif ; HIGH_BIT_DEPTH

%ifndef HIGH_BIT_DEPTH
cglobal pixel_satd_16x16, 4,6
    SATD_START_MMX
    pxor   m0, m0
%rep 3
    call pixel_satd_16x4_internal_mmx2
    lea  r0, [r0+4*r1]
    lea  r2, [r2+4*r3]
%endrep
    call pixel_satd_16x4_internal_mmx2
    HADDUW m0, m1
    movd  eax, m0
    RET

cglobal pixel_satd_16x8, 4,6
    SATD_START_MMX
    pxor   m0, m0
    call pixel_satd_16x4_internal_mmx2
    lea  r0, [r0+4*r1]
    lea  r2, [r2+4*r3]
    call pixel_satd_16x4_internal_mmx2
    SATD_END_MMX

cglobal pixel_satd_8x16, 4,6
    SATD_START_MMX
    pxor   m0, m0
    call pixel_satd_8x8_internal_mmx2
    lea  r0, [r0+4*r1]
    lea  r2, [r2+4*r3]
    call pixel_satd_8x8_internal_mmx2
    SATD_END_MMX
%endif ; !HIGH_BIT_DEPTH

cglobal pixel_satd_8x8, 4,6
    SATD_START_MMX
    pxor   m0, m0
    call pixel_satd_8x8_internal_mmx2
    SATD_END_MMX

cglobal pixel_satd_8x4, 4,6
    SATD_START_MMX
    pxor   m0, m0
    call pixel_satd_8x4_internal_mmx2
    SATD_END_MMX

cglobal pixel_satd_4x16, 4,6
    SATD_START_MMX
    SATD_4x4_MMX m0, 0, 1
    SATD_4x4_MMX m1, 0, 1
    paddw  m0, m1
    SATD_4x4_MMX m1, 0, 1
    paddw  m0, m1
    SATD_4x4_MMX m1, 0, 0
    paddw  m0, m1
    SATD_END_MMX

cglobal pixel_satd_4x8, 4,6
    SATD_START_MMX
    SATD_4x4_MMX m0, 0, 1
    SATD_4x4_MMX m1, 0, 0
    paddw  m0, m1
    SATD_END_MMX

cglobal pixel_satd_4x4, 4,6
    SATD_START_MMX
    SATD_4x4_MMX m0, 0, 0
    SATD_END_MMX

%macro SATD_START_SSE2 2
%if cpuflag(ssse3)
    mova    %2, [hmul_8p]
%endif
    lea     r4, [3*r1]
    lea     r5, [3*r3]
    pxor    %1, %1
%endmacro

%macro SATD_END_SSE2 1
    HADDW   %1, m7
    movd   eax, %1
    RET
%endmacro

%macro BACKUP_POINTERS 0
%ifdef ARCH_X86_64
    mov    r10, r0
    mov    r11, r2
%endif
%endmacro

%macro RESTORE_AND_INC_POINTERS 0
%ifdef ARCH_X86_64
    lea     r0, [r10+8]
    lea     r2, [r11+8]
%else
    mov     r0, r0mp
    mov     r2, r2mp
    add     r0, 8
    add     r2, 8
%endif
%endmacro

%macro SATD_4x8_SSE 2
    movd m4, [r2]
    movd m5, [r2+r3]
    movd m6, [r2+2*r3]
    add r2, r5
    movd m0, [r0]
    movd m1, [r0+r1]
    movd m2, [r0+2*r1]
    add r0, r4
    movd m3, [r2+r3]
    JDUP m4, m3
    movd m3, [r0+r1]
    JDUP m0, m3
    movd m3, [r2+2*r3]
    JDUP m5, m3
    movd m3, [r0+2*r1]
    JDUP m1, m3
%if cpuflag(ssse3) && %1==1
    mova m3, [hmul_4p]
    DIFFOP 0, 4, 1, 5, 3
%else
    DIFFOP 0, 4, 1, 5, 7
%endif
    movd m5, [r2]
    add r2, r5
    movd m3, [r0]
    add r0, r4
    movd m4, [r2]
    JDUP m6, m4
    movd m4, [r0]
    JDUP m2, m4
    movd m4, [r2+r3]
    JDUP m5, m4
    movd m4, [r0+r1]
    JDUP m3, m4
%if cpuflag(ssse3) && %1==1
    mova m4, [hmul_4p]
    DIFFOP 2, 6, 3, 5, 4
%else
    DIFFOP 2, 6, 3, 5, 7
%endif
    SATD_8x4_SSE cpuname, 0, 1, 2, 3, 4, 5, 7, %2
%endmacro

;-----------------------------------------------------------------------------
; int pixel_satd_8x4( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
%macro SATDS_SSE2 0
%if cpuflag(ssse3)
cglobal pixel_satd_4x4, 4, 6, 6
    SATD_START_MMX
    mova m4, [hmul_4p]
    LOAD_DUP_2x4P m2, m5, [r2], [r2+r3]
    LOAD_DUP_2x4P m3, m5, [r2+2*r3], [r2+r5]
    LOAD_DUP_2x4P m0, m5, [r0], [r0+r1]
    LOAD_DUP_2x4P m1, m5, [r0+2*r1], [r0+r4]
    DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4
    HADAMARD 0, sumsub, 0, 1, 2, 3
    HADAMARD 4, sumsub, 0, 1, 2, 3
    HADAMARD 1, amax, 0, 1, 2, 3
    HADDW m0, m1
    movd eax, m0
    RET
%endif

cglobal pixel_satd_4x8, 4, 6, 8
    SATD_START_MMX
%if cpuflag(ssse3)
    mova m7, [hmul_4p]
%endif
    SATD_4x8_SSE 0, swap
    HADDW m7, m1
    movd eax, m7
    RET

cglobal pixel_satd_4x16, 4, 6, 8
    SATD_START_MMX
%if cpuflag(ssse3)
    mova m7, [hmul_4p]
%endif
    SATD_4x8_SSE 0, swap
    lea r0, [r0+r1*2]
    lea r2, [r2+r3*2]
    SATD_4x8_SSE 1, add
    HADDW m7, m1
    movd eax, m7
    RET

cglobal pixel_satd_8x8_internal
    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1
    SATD_8x4_SSE cpuname, 0, 1, 2, 3, 4, 5, 6
%%pixel_satd_8x4_internal:
    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1
    SATD_8x4_SSE cpuname, 0, 1, 2, 3, 4, 5, 6
    ret

%ifdef UNIX64 ; 16x8 regresses on phenom win64, 16x16 is almost the same
cglobal pixel_satd_16x4_internal
    LOAD_SUMSUB_16x4P 0, 1, 2, 3, 4, 8, 5, 9, 6, 7, r0, r2, 11
    lea  r2, [r2+4*r3]
    lea  r0, [r0+4*r1]
    ; FIXME: the ssse3 argument doesn't really mean ssse3 here; it just selects
    ; between two different behaviors, both of which are implemented with sse2
    SATD_8x4_SSE ssse3, 0, 1, 2, 3, 6, 11, 10
    SATD_8x4_SSE ssse3, 4, 8, 5, 9, 6, 3, 10
    ret

cglobal pixel_satd_16x8, 4,6,12
    SATD_START_SSE2 m10, m7
%if notcpuflag(ssse3)
    mova m7, [pw_00ff]
%endif
    jmp %%pixel_satd_16x8_internal

cglobal pixel_satd_16x16, 4,6,12
    SATD_START_SSE2 m10, m7
%if notcpuflag(ssse3)
    mova m7, [pw_00ff]
%endif
    call pixel_satd_16x4_internal
    call pixel_satd_16x4_internal
%%pixel_satd_16x8_internal:
    call pixel_satd_16x4_internal
    call pixel_satd_16x4_internal
    SATD_END_SSE2 m10
%else
cglobal pixel_satd_16x8, 4,6,8
    SATD_START_SSE2 m6, m7
    BACKUP_POINTERS
    call pixel_satd_8x8_internal
    RESTORE_AND_INC_POINTERS
    call pixel_satd_8x8_internal
    SATD_END_SSE2 m6

cglobal pixel_satd_16x16, 4,6,8
    SATD_START_SSE2 m6, m7
    BACKUP_POINTERS
    call pixel_satd_8x8_internal
    call pixel_satd_8x8_internal
    RESTORE_AND_INC_POINTERS
    call pixel_satd_8x8_internal
    call pixel_satd_8x8_internal
    SATD_END_SSE2 m6
%endif

cglobal pixel_satd_8x16, 4,6,8
    SATD_START_SSE2 m6, m7
    call pixel_satd_8x8_internal
    call pixel_satd_8x8_internal
    SATD_END_SSE2 m6

cglobal pixel_satd_8x8, 4,6,8
    SATD_START_SSE2 m6, m7
    call pixel_satd_8x8_internal
    SATD_END_SSE2 m6

cglobal pixel_satd_8x4, 4,6,8
    SATD_START_SSE2 m6, m7
    call %%pixel_satd_8x4_internal
    SATD_END_SSE2 m6
%endmacro ; SATDS_SSE2

%macro SA8D_INTER 0
%ifdef ARCH_X86_64
    %define lh m10
    %define rh m0
%else
    %define lh m0
    %define rh [esp+48]
%endif
%ifdef HIGH_BIT_DEPTH
    HADDUW  m0, m1
    paddd   lh, rh
%else
    paddusw lh, rh
%endif ; HIGH_BIT_DEPTH
%endmacro

%macro SA8D 0
%ifdef HIGH_BIT_DEPTH
    %define vertical 1
%else ; sse2 doesn't seem to like the horizontal way of doing things
    %define vertical (cpuflags == cpuflags_sse2)
%endif

%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
; int pixel_sa8d_8x8( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
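; note: each public sa8d entry point below ends with add eax, 1 / shr eax, 1,
; i.e. the accumulated transform sum is halved with rounding as the final
; normalization step.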
cglobal pixel_sa8d_8x8_internal
    lea  r10, [r0+4*r1]
    lea  r11, [r2+4*r3]
    LOAD_SUMSUB_8x4P 0, 1, 2, 8, 5, 6, 7, r0, r2
    LOAD_SUMSUB_8x4P 4, 5, 3, 9, 11, 6, 7, r10, r11
%if vertical
    HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
%else ; non-sse2
    HADAMARD8_2D_HMUL 0, 1, 2, 8, 4, 5, 3, 9, 6, 11
%endif
    paddw m0, m1
    paddw m0, m2
    paddw m0, m8
    SAVE_MM_PERMUTATION
    ret

cglobal pixel_sa8d_8x8, 4,6,12
    FIX_STRIDES r1, r3
    lea  r4, [3*r1]
    lea  r5, [3*r3]
%if vertical == 0
    mova m7, [hmul_8p]
%endif
    call pixel_sa8d_8x8_internal
%ifdef HIGH_BIT_DEPTH
    HADDUW m0, m1
%else
    HADDW m0, m1
%endif ; HIGH_BIT_DEPTH
    movd eax, m0
    add eax, 1
    shr eax, 1
    RET

cglobal pixel_sa8d_16x16, 4,6,12
    FIX_STRIDES r1, r3
    lea  r4, [3*r1]
    lea  r5, [3*r3]
%if vertical == 0
    mova m7, [hmul_8p]
%endif
    call pixel_sa8d_8x8_internal ; pix[0]
    add  r2, 8*SIZEOF_PIXEL
    add  r0, 8*SIZEOF_PIXEL
%ifdef HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova m10, m0
    call pixel_sa8d_8x8_internal ; pix[8]
    lea  r2, [r2+8*r3]
    lea  r0, [r0+8*r1]
    SA8D_INTER
    call pixel_sa8d_8x8_internal ; pix[8*stride+8]
    sub  r2, 8*SIZEOF_PIXEL
    sub  r0, 8*SIZEOF_PIXEL
    SA8D_INTER
    call pixel_sa8d_8x8_internal ; pix[8*stride]
    SA8D_INTER
    SWAP 0, 10
%ifndef HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    movd eax, m0
    add  eax, 1
    shr  eax, 1
    RET

%else ; ARCH_X86_32
%if mmsize == 16
cglobal pixel_sa8d_8x8_internal
    %define spill0 [esp+4]
    %define spill1 [esp+20]
    %define spill2 [esp+36]
%if vertical
    LOAD_DIFF_8x4P 0, 1, 2, 3, 4, 5, 6, r0, r2, 1
    HADAMARD4_2D 0, 1, 2, 3, 4
    movdqa spill0, m3
    LOAD_DIFF_8x4P 4, 5, 6, 7, 3, 3, 2, r0, r2, 1
    HADAMARD4_2D 4, 5, 6, 7, 3
    HADAMARD2_2D 0, 4, 1, 5, 3, qdq, amax
    movdqa m3, spill0
    paddw m0, m1
    HADAMARD2_2D 2, 6, 3, 7, 5, qdq, amax
%else ; non-sse2
    mova m7, [hmul_8p]
    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 5, 6, 7, r0, r2, 1
    ; could do first HADAMARD4_V here to save spilling later
    ; surprisingly, not a win on conroe or even p4
    mova spill0, m2
    mova spill1, m3
    mova spill2, m1
    SWAP 1, 7
    LOAD_SUMSUB_8x4P 4, 5, 6, 7, 2, 3, 1, r0, r2, 1
    HADAMARD4_V 4, 5, 6, 7, 3
    mova m1, spill2
    mova m2, spill0
    mova m3, spill1
    mova spill0, m6
    mova spill1, m7
    HADAMARD4_V 0, 1, 2, 3, 7
    SUMSUB_BADC w, 0, 4, 1, 5, 7
    HADAMARD 2, sumsub, 0, 4, 7, 6
    HADAMARD 2, sumsub, 1, 5, 7, 6
    HADAMARD 1, amax, 0, 4, 7, 6
    HADAMARD 1, amax, 1, 5, 7, 6
    mova m6, spill0
    mova m7, spill1
    paddw m0, m1
    SUMSUB_BADC w, 2, 6, 3, 7, 4
    HADAMARD 2, sumsub, 2, 6, 4, 5
    HADAMARD 2, sumsub, 3, 7, 4, 5
    HADAMARD 1, amax, 2, 6, 4, 5
    HADAMARD 1, amax, 3, 7, 4, 5
%endif ; sse2/non-sse2
    paddw m0, m2
    paddw m0, m3
    SAVE_MM_PERMUTATION
    ret
%endif ; mmsize == 16

cglobal pixel_sa8d_8x8, 4,7
    FIX_STRIDES r1, r3
    mov    r6, esp
    and   esp, ~15
    sub   esp, 48
    lea    r4, [3*r1]
    lea    r5, [3*r3]
    call pixel_sa8d_8x8_internal
%ifdef HIGH_BIT_DEPTH
    HADDUW m0, m1
%else
    HADDW  m0, m1
%endif ; HIGH_BIT_DEPTH
    movd  eax, m0
    add   eax, 1
    shr   eax, 1
    mov   esp, r6
    RET

cglobal pixel_sa8d_16x16, 4,7
    FIX_STRIDES r1, r3
    mov  r6, esp
    and  esp, ~15
    sub  esp, 64
    lea  r4, [3*r1]
    lea  r5, [3*r3]
    call pixel_sa8d_8x8_internal
%if mmsize == 8
    lea  r0, [r0+4*r1]
    lea  r2, [r2+4*r3]
%endif
%ifdef HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal
    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 8*SIZEOF_PIXEL
    add  r2, 8*SIZEOF_PIXEL
    SA8D_INTER
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal
%if mmsize == 8
    lea  r0, [r0+4*r1]
    lea  r2, [r2+4*r3]
%else
    SA8D_INTER
%endif
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal
%ifdef HIGH_BIT_DEPTH
    SA8D_INTER
%else ; !HIGH_BIT_DEPTH
    paddusw m0, [esp+64-mmsize]
%if mmsize == 16
    HADDUW m0, m1
%else
    mova m2, [esp+48]
    pxor m7, m7
    mova m1, m0
    mova m3, m2
    punpcklwd m0, m7
    punpckhwd m1, m7
    punpcklwd m2, m7
    punpckhwd m3, m7
    paddd m0, m1
    paddd m2, m3
    paddd m0, m2
    HADDD m0, m1
%endif
%endif ; HIGH_BIT_DEPTH
    movd eax, m0
    add  eax, 1
    shr  eax, 1
    mov  esp, r6
    RET
%endif ; !ARCH_X86_64
%endmacro ; SA8D

;=============================================================================
; INTRA SATD
;=============================================================================

%macro HSUMSUB2 8
    pshufd %4, %2, %7
    pshufd %5, %3, %7
    %1     %2, %8
    %1     %6, %8
    paddw  %2, %4
    paddw  %3, %5
%endmacro

; intra_sa8d_x3_8x8 and intra_satd_x3_4x4 are obsoleted by x9 on ssse3+,
; and are only retained for old cpus.
%macro INTRA_SA8D_SSE2 0
%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
; void intra_sa8d_x3_8x8( uint8_t *fenc, uint8_t edge[36], int *res )
;-----------------------------------------------------------------------------
cglobal intra_sa8d_x3_8x8, 3,3,14
    ; 8x8 hadamard
    pxor        m8, m8
    movq        m0, [r0+0*FENC_STRIDE]
    movq        m1, [r0+1*FENC_STRIDE]
    movq        m2, [r0+2*FENC_STRIDE]
    movq        m3, [r0+3*FENC_STRIDE]
    movq        m4, [r0+4*FENC_STRIDE]
    movq        m5, [r0+5*FENC_STRIDE]
    movq        m6, [r0+6*FENC_STRIDE]
    movq        m7, [r0+7*FENC_STRIDE]
    punpcklbw   m0, m8
    punpcklbw   m1, m8
    punpcklbw   m2, m8
    punpcklbw   m3, m8
    punpcklbw   m4, m8
    punpcklbw   m5, m8
    punpcklbw   m6, m8
    punpcklbw   m7, m8

    HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 8

    ABSW2       m8,  m9,  m2, m3, m2, m3
    ABSW2       m10, m11, m4, m5, m4, m5
    paddusw     m8,  m10
    paddusw     m9,  m11
    ABSW2       m10, m11, m6, m7, m6, m7
    ABSW        m13, m1,  m1
    paddusw     m10, m11
    paddusw     m8,  m9
    paddusw     m13, m10
    paddusw     m13, m8

    ; 1D hadamard of edges
    movq        m8,  [r1+7]
    movq        m9,  [r1+16]
    pxor        m10, m10
    punpcklbw   m8,  m10
    punpcklbw   m9,  m10
    HSUMSUB2 pmullw, m8, m9, m10, m11, m11, q1032, [pw_ppppmmmm]
    HSUMSUB2 pmullw, m8, m9, m10, m11, m11, q2301, [pw_ppmmppmm]
    pshuflw     m10, m8,  q2301
    pshuflw     m11, m9,  q2301
    pshufhw     m10, m10, q2301
    pshufhw     m11, m11, q2301
    pmullw      m8,  [pw_pmpmpmpm]
    pmullw      m11, [pw_pmpmpmpm]
    paddw       m8,  m10
    paddw       m9,  m11

    ; differences
    paddw       m10, m8, m9
    paddw       m10, [pw_8]
    pand        m10, [sw_f0]
    psllw       m10, 2 ; dc

    psllw       m8,  3 ; left edge
    psubw       m8,  m0
    psubw       m10, m0
    ABSW2       m8, m10, m8, m10, m11, m12 ; 1x8 sum
    paddusw     m8,  m13
    paddusw     m13, m10
    punpcklwd   m0,  m1
    punpcklwd   m2,  m3
    punpcklwd   m4,  m5
    punpcklwd   m6,  m7
    punpckldq   m0,  m2
    punpckldq   m4,  m6
    punpcklqdq  m0,  m4 ; transpose
    psllw       m9,  3 ; top edge
    psrldq      m2,  m13, 2 ; 8x7 sum
    psubw       m0,  m9  ; 8x1 sum
    ABSW        m0,  m0,  m9
    paddusw     m2,  m0

    ; 3x HADDW
    movdqa      m7,  [pw_1]
    pmaddwd     m2,  m7
    pmaddwd     m8,  m7
    pmaddwd     m13, m7
    punpckhdq   m3,  m2, m8
    punpckldq   m2,  m8
    pshufd      m5,  m13, q3311
    paddd       m2,  m3
    paddd       m5,  m13
    punpckhqdq  m0,  m2, m5
    punpcklqdq  m2,  m5
    pavgw       m0,  m2
    pxor        m1,  m1
    pavgw       m0,  m1
    movq      [r2], m0 ; i8x8_v, i8x8_h
    psrldq      m0, 8
    movd    [r2+8], m0 ; i8x8_dc
    RET
%endif ; ARCH_X86_64
%endmacro ; INTRA_SA8D_SSE2

; in: r0 = fenc
; out: m0..m3 = hadamard coefs
INIT_MMX
cglobal hadamard_load
; not really a global, but otherwise cycles get attributed to the wrong function in profiling
    pxor        m7, m7
    movd        m0, [r0+0*FENC_STRIDE]
    movd        m1, [r0+1*FENC_STRIDE]
    movd        m2, [r0+2*FENC_STRIDE]
    movd        m3, [r0+3*FENC_STRIDE]
    punpcklbw   m0, m7
    punpcklbw   m1, m7
    punpcklbw   m2, m7
    punpcklbw   m3, m7
    HADAMARD4_2D 0, 1, 2, 3, 4
    SAVE_MM_PERMUTATION
    ret

%macro SCALAR_HADAMARD 4-5 ; direction, offset, 3x tmp
%ifidn %1, top
    movd        %3, [r1+%2-FDEC_STRIDE]
    pxor        %5, %5
    punpcklbw   %3, %5
%else ; left
%ifnidn %2, 0
    shl         %2d, 5 ; log2(FDEC_STRIDE)
%endif
    movd        %3, [r1+%2-4+1*FDEC_STRIDE]
    pinsrw      %3, [r1+%2-2+0*FDEC_STRIDE], 0
    pinsrw      %3, [r1+%2-2+2*FDEC_STRIDE], 2
    pinsrw      %3, [r1+%2-2+3*FDEC_STRIDE], 3
    psrlw       %3, 8
%ifnidn %2, 0
    shr         %2d, 5
%endif
%endif ; direction
%if cpuflag(ssse3)
    %define %%sign psignw
%else
    %define %%sign pmullw
%endif
    pshufw      %4, %3, q1032
    %%sign      %4, [pw_ppmmppmm]
    paddw       %3, %4
    pshufw      %4, %3, q2301
    %%sign      %4, [pw_pmpmpmpm]
    paddw       %3, %4
    psllw       %3, 2
    mova        [%1_1d+2*%2], %3
%endmacro

%macro SUM_MM_X3 8 ; 3x sum, 4x tmp, op
    pxor        %7, %7
    pshufw      %4, %1, q1032
    pshufw      %5, %2, q1032
    pshufw      %6, %3, q1032
    paddw       %1, %4
    paddw       %2, %5
    paddw       %3, %6
    punpcklwd   %1, %7
    punpcklwd   %2, %7
    punpcklwd   %3, %7
    pshufw      %4, %1, q1032
    pshufw      %5, %2, q1032
    pshufw      %6, %3, q1032
    %8          %1, %4
    %8          %2, %5
    %8          %3, %6
%endmacro

%macro CLEAR_SUMS 0
%ifdef ARCH_X86_64
    mov   qword [sums+0], 0
    mov   qword [sums+8], 0
    mov   qword [sums+16], 0
%else
    pxor  m7, m7
    movq  [sums+0], m7
    movq  [sums+8], m7
    movq  [sums+16], m7
%endif
%endmacro

; in: m1..m3
; out: m7
; clobber: m4..m6
%macro SUM3x4 0
    ABSW2       m4, m5, m1, m2, m1, m2
    ABSW        m7, m3, m3
    paddw       m4, m5
    paddw       m7, m4
%endmacro

; in: m0..m3 (4x4)
; out: m0 v, m4 h, m5 dc
; clobber: m1..m3
%macro SUM4x3 3 ; dc, left, top
    movq        m4, %2
%ifid %1
    movq        m5, %1
%else
    movd        m5, %1
%endif
    psubw       m4, m0
    psubw       m5, m0
    punpcklwd   m0, m1
    punpcklwd   m2, m3
    punpckldq   m0, m2 ; transpose
    psubw       m0, %3
    ABSW2       m4, m5, m4, m5, m2, m3 ; 1x4 sum
    ABSW        m0, m0, m1 ; 4x1 sum
%endmacro

%macro INTRA_X3_MMX 0
;-----------------------------------------------------------------------------
; void intra_satd_x3_4x4( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
cglobal intra_satd_x3_4x4, 3,3
%ifdef ARCH_X86_64
    ; stack is 16 byte aligned because abi says so
    %define  top_1d  rsp-8  ; size 8
    %define  left_1d rsp-16 ; size 8
%else
    ; stack is 16 byte aligned at least in gcc, and we've pushed 3 regs + return address, so it's still aligned
    SUB         esp, 16
    %define  top_1d  esp+8
    %define  left_1d esp
%endif

    call hadamard_load
    SCALAR_HADAMARD left, 0, m4, m5
    SCALAR_HADAMARD top,  0, m6, m5, m7
    paddw       m6, m4
    pavgw       m6, [pw_16]
    pand        m6, [sw_f0] ; dc

    SUM3x4
    SUM4x3 m6, [left_1d], [top_1d]
    paddw       m4, m7
    paddw       m5, m7
    movq        m1, m5
    psrlq       m1, 16  ; 4x3 sum
    paddw       m0, m1

    SUM_MM_X3   m0, m4, m5, m1, m2, m3, m6, pavgw
    movd        [r2+0], m0 ; i4x4_v satd
    movd        [r2+4], m4 ; i4x4_h satd
    movd        [r2+8], m5 ; i4x4_dc satd
%ifndef ARCH_X86_64
    ADD         esp, 16
%endif
    RET

%ifdef ARCH_X86_64
    %define  t0 r10
    %define  t2 r11
%else
    %define  t0 r0
    %define  t2 r2
%endif

;-----------------------------------------------------------------------------
; void intra_satd_x3_16x16( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
cglobal intra_satd_x3_16x16, 0,5
    %assign  stack_pad  88 + ((stack_offset+88+gprsize)&15)
    ; not really needed on x86_64, just shuts up valgrind about storing data below the stack across a function call
    SUB         rsp, stack_pad
%define sums    rsp+64 ; size 24
%define top_1d  rsp+32 ; size 32
%define left_1d rsp    ; size 32
    movifnidn   r1,  r1mp
    CLEAR_SUMS

    ; 1D hadamards
    mov         t0d, 12
    movd        m6,  [pw_64]
.loop_edge:
    SCALAR_HADAMARD left, t0, m0, m1
    SCALAR_HADAMARD top,  t0, m1, m2, m3
    paddw       m6,  m0
    paddw       m6,  m1
    sub         t0d, 4
    jge .loop_edge
    psrlw       m6,  3
    pand        m6,  [sw_f0] ; dc

    ; 2D hadamards
    movifnidn   r0,  r0mp
    mov         r3,  -4
.loop_y:
    mov         r4,  -4
.loop_x:
    call hadamard_load

    SUM3x4
    SUM4x3 m6, [left_1d+8*(r3+4)], [top_1d+8*(r4+4)]
    pavgw       m4, m7
    pavgw       m5, m7
    paddw       m0, [sums+0]  ; i16x16_v satd
    paddw       m4, [sums+8]  ; i16x16_h satd
    paddw       m5, [sums+16] ; i16x16_dc satd
    movq        [sums+0], m0
    movq        [sums+8], m4
    movq        [sums+16], m5

    add         r0, 4
    inc         r4
    jl  .loop_x
    add         r0, 4*FENC_STRIDE-16
    inc         r3
    jl  .loop_y

; horizontal sum
    movifnidn   r2, r2mp
    movq        m2, [sums+16]
    movq        m1, [sums+8]
    movq        m0, [sums+0]
    movq        m7, m2
    SUM_MM_X3   m0, m1, m2, m3, m4, m5, m6, paddd
    psrld       m0, 1
    pslld       m7, 16
    psrld       m7, 16
    paddd       m0, m2
    psubd       m0, m7
    movd        [r2+8], m2 ; i16x16_dc satd
    movd        [r2+4], m1 ; i16x16_h satd
    movd        [r2+0], m0 ; i16x16_v satd
    ADD         rsp, stack_pad
    RET

;-----------------------------------------------------------------------------
; void intra_satd_x3_8x8c( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
cglobal intra_satd_x3_8x8c, 0,6
    ; not really needed on x86_64, just shuts up valgrind about storing data below the stack across a function call
    SUB          rsp, 72
%define  sums    rsp+48 ; size 24
%define  dc_1d   rsp+32 ; size 16
%define  top_1d  rsp+16 ; size 16
%define  left_1d rsp    ; size 16
    movifnidn   r1,  r1mp
    CLEAR_SUMS

    ; 1D hadamards
    mov         t0d, 4
.loop_edge:
    SCALAR_HADAMARD left, t0, m0, m1
    SCALAR_HADAMARD top,  t0, m0, m1, m2
    sub         t0d, 4
    jge .loop_edge

    ; dc
    movzx       t2d, word [left_1d+0]
    movzx       r3d, word [top_1d+0]
    movzx       r4d, word [left_1d+8]
    movzx       r5d, word [top_1d+8]
    lea         t2d, [t2 + r3 + 16]
    lea         r3d, [r4 + r5 + 16]
    shr         t2d, 1
    shr         r3d, 1
    add         r4d, 8
    add         r5d, 8
    and         t2d, -16 ; tl
    and         r3d, -16 ; br
    and         r4d, -16 ; bl
    and         r5d, -16 ; tr
    mov         [dc_1d+ 0], t2d ; tl
    mov         [dc_1d+ 4], r5d ; tr
    mov         [dc_1d+ 8], r4d ; bl
    mov         [dc_1d+12], r3d ; br
    lea         r5, [dc_1d]

    ; 2D hadamards
    movifnidn   r0,  r0mp
    movifnidn   r2,  r2mp
    mov         r3,  -2
.loop_y:
    mov         r4,  -2
.loop_x:
    call hadamard_load

    SUM3x4
    SUM4x3 [r5+4*(r4+2)], [left_1d+8*(r3+2)], [top_1d+8*(r4+2)]
    pavgw       m4, m7
    pavgw       m5, m7
    paddw       m0, [sums+16] ; i4x4_v satd
    paddw       m4, [sums+8]  ; i4x4_h satd
    paddw       m5, [sums+0]  ; i4x4_dc satd
    movq        [sums+16], m0
    movq        [sums+8], m4
    movq        [sums+0], m5

    add         r0, 4
    inc         r4
    jl  .loop_x
    add         r0, 4*FENC_STRIDE-8
    add         r5, 8
    inc         r3
    jl  .loop_y

; horizontal sum
    movq        m0, [sums+0]
    movq        m1, [sums+8]
    movq        m2, [sums+16]
    movq        m7, m0
    psrlq       m7, 15
    paddw       m2, m7
    SUM_MM_X3   m0, m1, m2, m3, m4, m5, m6, paddd
    psrld       m2, 1
    movd        [r2+0], m0 ; i8x8c_dc satd
    movd        [r2+4], m1 ; i8x8c_h satd
    movd        [r2+8], m2 ; i8x8c_v satd
    ADD         rsp, 72
    RET
%endmacro ; INTRA_X3_MMX



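; PRED4x4_LOWPASS: %1 = ( %2 + 2*%4 + %3 + 2 ) >> 2, i.e. the H.264 three-tap
; low-pass filter with %4 as the center tap.  The pxor/pand(pb_1)/psubusb
; sequence rounds the inner pavgb down, which makes the two-average
; formulation exact.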
%macro PRED4x4_LOWPASS 5
%ifid %5
    pavgb       %5, %2, %3
    pxor        %3, %2
    pand        %3, [pb_1]
    psubusb     %5, %3
    pavgb       %1, %4, %5
%else
    mova        %5, %2
    pavgb       %2, %3
    pxor        %3, %5
    pand        %3, [pb_1]
    psubusb     %2, %3
    pavgb       %1, %4, %2
%endif
%endmacro

%macro INTRA_X9_PRED 2
%if cpuflag(sse4)
    movu       m1, [r1-1*FDEC_STRIDE-8]
    pinsrb     m1, [r1+3*FDEC_STRIDE-1], 0
    pinsrb     m1, [r1+2*FDEC_STRIDE-1], 1
    pinsrb     m1, [r1+1*FDEC_STRIDE-1], 2
    pinsrb     m1, [r1+0*FDEC_STRIDE-1], 3
%else
    movd      mm0, [r1+3*FDEC_STRIDE-4]
    punpcklbw mm0, [r1+2*FDEC_STRIDE-4]
    movd      mm1, [r1+1*FDEC_STRIDE-4]
    punpcklbw mm1, [r1+0*FDEC_STRIDE-4]
    punpckhwd mm0, mm1
    psrlq     mm0, 32
    movq2dq    m0, mm0
    movu       m1, [r1-1*FDEC_STRIDE-8]
    movss      m1, m0                  ; l3 l2 l1 l0 __ __ __ lt t0 t1 t2 t3 t4 t5 t6 t7
%endif ; cpuflag
    pshufb     m1, [intrax9_edge]      ; l3 l3 l2 l1 l0 lt t0 t1 t2 t3 t4 t5 t6 t7 t7 __
    psrldq     m0, m1, 1               ; l3 l2 l1 l0 lt t0 t1 t2 t3 t4 t5 t6 t7 t7 __ __
    psrldq     m2, m1, 2               ; l2 l1 l0 lt t0 t1 t2 t3 t4 t5 t6 t7 t7 __ __ __
    pavgb      m5, m0, m1              ; Gl3 Gl2 Gl1 Gl0 Glt Gt0 Gt1 Gt2 Gt3 Gt4 Gt5  __  __ __ __ __
    mova       %2, m1
    PRED4x4_LOWPASS m0, m1, m2, m0, m4 ; Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 __ __ __
    ; ddl               ddr
    ; Ft1 Ft2 Ft3 Ft4   Flt Ft0 Ft1 Ft2
    ; Ft2 Ft3 Ft4 Ft5   Fl0 Flt Ft0 Ft1
    ; Ft3 Ft4 Ft5 Ft6   Fl1 Fl0 Flt Ft0
    ; Ft4 Ft5 Ft6 Ft7   Fl2 Fl1 Fl0 Flt
    pshufb     m2, m0, [%1_ddlr1] ; a: ddl row0, ddl row1, ddr row0, ddr row1 / b: ddl row0, ddr row0, ddl row1, ddr row1
    pshufb     m3, m0, [%1_ddlr2] ; rows 2,3
    ; hd                hu
    ; Glt Flt Ft0 Ft1   Gl0 Fl1 Gl1 Fl2
    ; Gl0 Fl0 Glt Flt   Gl1 Fl2 Gl2 Fl3
    ; Gl1 Fl1 Gl0 Fl0   Gl2 Fl3 Gl3 Gl3
    ; Gl2 Fl2 Gl1 Fl1   Gl3 Gl3 Gl3 Gl3
    pslldq     m0, 5                   ; ___ ___ ___ ___ ___ Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5
    palignr    m7, m5, m0, 5           ; Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Gl3 Gl2 Gl1 Gl0 Glt
    pshufb     m6, m7, [%1_hdu1]
    pshufb     m7, m7, [%1_hdu2]
    ; vr                vl
    ; Gt0 Gt1 Gt2 Gt3   Gt1 Gt2 Gt3 Gt4
    ; Flt Ft0 Ft1 Ft2   Ft1 Ft2 Ft3 Ft4
    ; Fl0 Gt0 Gt1 Gt2   Gt2 Gt3 Gt4 Gt5
    ; Fl1 Flt Ft0 Ft1   Ft2 Ft3 Ft4 Ft5
    psrldq     m5, 5                   ; Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 ...
    palignr    m5, m0, 6               ; ___ Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Gt0 Gt1 Gt2 Gt3 Gt4 Gt5
    pshufb     m4, m5, [%1_vrl1]
    pshufb     m5, m5, [%1_vrl2]
%endmacro ; INTRA_X9_PRED

%macro INTRA_X9_VHDC 5 ; edge, fenc01, fenc23, tmp, tmp
    pshufb     m2, m%1, [intrax9b_vh1]
    pshufb     m3, m%1, [intrax9b_vh2]
    mova      [pred_buf+0x60], m2
    mova      [pred_buf+0x70], m3
    pshufb    m%1, [intrax9b_edge2] ; t0 t1 t2 t3 t0 t1 t2 t3 l0 l1 l2 l3 l0 l1 l2 l3
    pmaddubsw m%1, [hmul_4p]
    pshufhw    m0, m%1, q2301
    pshuflw    m0, m0,  q2301
    psignw    m%1, [pw_pmpmpmpm]
    paddw      m0, m%1
    psllw      m0, 2 ; hadamard(top), hadamard(left)
    movhlps    m3, m0
    pshufb     m1, m0, [intrax9b_v1]
    pshufb     m2, m0, [intrax9b_v2]
    paddw      m0, m3
    psignw     m3, [pw_pmmpzzzz] ; FIXME could this be eliminated?
    pavgw      m0, [pw_16]
    pand       m0, [sw_f0] ; dc
    ; This (as well as one of the steps in intra_satd_x9_4x4.satd_8x4) could be
    ; changed from a wd transpose to a qdq, with appropriate rearrangement of the
    ; inputs, which would be faster on conroe but slower on penryn and
    ; sandybridge, and is too invasive to ifdef.
    HADAMARD 0, sumsub, %2, %3, %4, %5
    HADAMARD 1, sumsub, %2, %3, %4, %5
    movd      r3d, m0
    shr       r3d, 4
    imul      r3d, 0x01010101
    mov       [pred_buf+0x80], r3d
    mov       [pred_buf+0x88], r3d
    mov       [pred_buf+0x90], r3d
    mov       [pred_buf+0x98], r3d
    psubw      m3, m%2
    psubw      m0, m%2
    psubw      m1, m%2
    psubw      m2, m%3
    pabsw     m%3, m%3
    pabsw      m3, m3
    pabsw      m0, m0
    pabsw      m1, m1
    pabsw      m2, m2
    pavgw      m3, m%3
    pavgw      m0, m%3
    pavgw      m1, m2
%if cpuflag(sse4)
    phaddw     m3, m0
%else
    SBUTTERFLY qdq, 3, 0, 2
    paddw      m3, m0
%endif
    movhlps    m2, m1
    paddw      m1, m2
%if cpuflag(xop)
    vphaddwq   m3, m3
    vphaddwq   m1, m1
    packssdw   m1, m3
%else
    phaddw     m1, m3
    pmaddwd    m1, [pw_1] ; v, _, h, dc
%endif
%endmacro ; INTRA_X9_VHDC

%macro INTRA_X9_END 2
%if cpuflag(sse4)
    phminposuw m0, m0 ; h,dc,ddl,ddr,vr,hd,vl,hu
    movd      eax, m0
    add       eax, 1<<16
    cmp        ax, r3w
    cmovge    eax, r3d
%else
%if %1
    ; 4x4 sad is up to 12 bits; +bitcosts -> 13 bits; pack with 3 bit index
    psllw      m0, 3
    paddw      m0, [pw_s01234567] ; h,dc,ddl,ddr,vr,hd,vl,hu
%else
    ; 4x4 satd is up to 13 bits; +bitcosts and saturate -> 13 bits; pack with 3 bit index
    psllw      m0, 2
    paddusw    m0, m0
    paddw      m0, [pw_s01234657] ; h,dc,ddl,ddr,vr,vl,hd,hu
%endif
    movhlps    m1, m0
    pminsw     m0, m1
    pshuflw    m1, m0, q0032
    pminsw     m0, m1
    pshuflw    m1, m0, q0001
    pminsw     m0, m1
    movd      eax, m0
    movsx     r2d, ax
    and       eax, 7
    sar       r2d, 3
    shl       eax, 16
    ; 1<<16: increment index to match intra4x4_pred_e. couldn't do this before because it had to fit in 3 bits
    ; 1<<12: undo sign manipulation
    lea       eax, [rax+r2+(1<<16)+(1<<12)]
    cmp        ax, r3w
    cmovge    eax, r3d
%endif ; cpuflag

    ; output the predicted samples
    mov       r3d, eax
    shr       r3d, 16
%ifdef PIC
    lea        r2, [%2_lut]
    movzx     r2d, byte [r2+r3]
%else
    movzx     r2d, byte [%2_lut+r3]
%endif
%if %1 ; sad
    movq      mm0, [pred_buf+r2]
    movq      mm1, [pred_buf+r2+16]
    movd     [r1+0*FDEC_STRIDE], mm0
    movd     [r1+2*FDEC_STRIDE], mm1
    psrlq     mm0, 32
    psrlq     mm1, 32
    movd     [r1+1*FDEC_STRIDE], mm0
    movd     [r1+3*FDEC_STRIDE], mm1
%else ; satd
%assign i 0
%rep 4
    mov       r3d, [pred_buf+r2+8*i]
    mov      [r1+i*FDEC_STRIDE], r3d
%assign i i+1
%endrep
%endif
%endmacro ; INTRA_X9_END

%macro INTRA_X9 0
;-----------------------------------------------------------------------------
; int intra_sad_x9_4x4( uint8_t *fenc, uint8_t *fdec, uint16_t *bitcosts )
;-----------------------------------------------------------------------------
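; The return value packs the winner: the low 16 bits hold sad + bitcost, the
; upper bits hold the winning intra4x4_pred_e mode (v is handled separately as
; mode 0; the other eight modes get index+1, see INTRA_X9_END).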
%if notcpuflag(xop)
cglobal intra_sad_x9_4x4, 3,4,9
    %assign pad 0xc0-gprsize-(stack_offset&15)
    %define pred_buf rsp
    sub       rsp, pad
%ifdef ARCH_X86_64
    INTRA_X9_PRED intrax9a, m8
%else
    INTRA_X9_PRED intrax9a, [rsp+0xa0]
%endif
    mova [rsp+0x00], m2
    mova [rsp+0x10], m3
    mova [rsp+0x20], m4
    mova [rsp+0x30], m5
    mova [rsp+0x40], m6
    mova [rsp+0x50], m7
%if cpuflag(sse4)
    movd       m0, [r0+0*FENC_STRIDE]
    pinsrd     m0, [r0+1*FENC_STRIDE], 1
    movd       m1, [r0+2*FENC_STRIDE]
    pinsrd     m1, [r0+3*FENC_STRIDE], 1
%else
    movd      mm0, [r0+0*FENC_STRIDE]
    punpckldq mm0, [r0+1*FENC_STRIDE]
    movd      mm1, [r0+2*FENC_STRIDE]
    punpckldq mm1, [r0+3*FENC_STRIDE]
    movq2dq    m0, mm0
    movq2dq    m1, mm1
%endif
    punpcklqdq m0, m0
    punpcklqdq m1, m1
    psadbw     m2, m0
    psadbw     m3, m1
    psadbw     m4, m0
    psadbw     m5, m1
    psadbw     m6, m0
    psadbw     m7, m1
    paddd      m2, m3
    paddd      m4, m5
    paddd      m6, m7
%ifdef ARCH_X86_64
    SWAP        7, 8
    pxor       m8, m8
    %define %%zero m8
%else
    mova       m7, [rsp+0xa0]
    %define %%zero [pb_0]
%endif
    pshufb     m3, m7, [intrax9a_vh1]
    pshufb     m5, m7, [intrax9a_vh2]
    pshufb     m7, [intrax9a_dc]
    psadbw     m7, %%zero
    psrlw      m7, 2
    mova [rsp+0x60], m3
    mova [rsp+0x70], m5
    psadbw     m3, m0
    pavgw      m7, %%zero
    pshufb     m7, %%zero
    psadbw     m5, m1
    movq [rsp+0x80], m7
    movq [rsp+0x90], m7
    psadbw     m0, m7
    paddd      m3, m5
    psadbw     m1, m7
    paddd      m0, m1
    movzx     r3d, word [r2]
    movd      r0d, m3 ; v
    add       r3d, r0d
    punpckhqdq m3, m0 ; h, dc
    shufps     m3, m2, q2020
    psllq      m6, 32
    por        m4, m6
    movu       m0, [r2+2]
    packssdw   m3, m4
    paddw      m0, m3
    INTRA_X9_END 1, intrax9a
    add       rsp, pad
    RET
%endif ; cpuflag

%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
; int intra_satd_x9_4x4( uint8_t *fenc, uint8_t *fdec, uint16_t *bitcosts )
;-----------------------------------------------------------------------------
cglobal intra_satd_x9_4x4, 3,4,16
    %assign pad 0xb0-gprsize-(stack_offset&15)
    %define pred_buf rsp
    sub       rsp, pad
    INTRA_X9_PRED intrax9b, m15
    mova [rsp+0x00], m2
    mova [rsp+0x10], m3
    mova [rsp+0x20], m4
    mova [rsp+0x30], m5
    mova [rsp+0x40], m6
    mova [rsp+0x50], m7
    movd       m8, [r0+0*FENC_STRIDE]
    movd       m9, [r0+1*FENC_STRIDE]
    movd      m10, [r0+2*FENC_STRIDE]
    movd      m11, [r0+3*FENC_STRIDE]
    mova      m12, [hmul_8p]
    pshufd     m8, m8, 0
    pshufd     m9, m9, 0
    pshufd    m10, m10, 0
    pshufd    m11, m11, 0
    pmaddubsw  m8, m12
    pmaddubsw  m9, m12
    pmaddubsw m10, m12
    pmaddubsw m11, m12
    movddup    m0, m2
    pshufd     m1, m2, q3232
    movddup    m2, m3
    movhlps    m3, m3
    call .satd_8x4 ; ddr, ddl
    movddup    m2, m5
    pshufd     m3, m5, q3232
    mova       m5, m0
    movddup    m0, m4
    pshufd     m1, m4, q3232
    call .satd_8x4 ; vr, vl
    movddup    m2, m7
    pshufd     m3, m7, q3232
    mova       m4, m0
    movddup    m0, m6
    pshufd     m1, m6, q3232
    call .satd_8x4 ; hd, hu
%if cpuflag(sse4)
    punpckldq  m4, m0
%else
    punpcklqdq m4, m0 ; conroe dislikes punpckldq, and ssse3 INTRA_X9_END can handle arbitrary orders whereas phminposuw can't
%endif
    mova       m1, [pw_ppmmppmm]
    psignw     m8, m1
    psignw    m10, m1
    paddw      m8, m9
    paddw     m10, m11
    INTRA_X9_VHDC 15, 8, 10, 6, 7
    ; find minimum
    movu       m0, [r2+2]
    movd      r3d, m1
    palignr    m5, m1, 8
%if notcpuflag(sse4)
    pshufhw    m0, m0, q3120 ; compensate for different order in unpack
%endif
    packssdw   m5, m4
    paddw      m0, m5
    movzx     r0d, word [r2]
    add       r3d, r0d
    INTRA_X9_END 0, intrax9b
    add       rsp, pad
    RET
RESET_MM_PERMUTATION
ALIGN 16
.satd_8x4:
    pmaddubsw  m0, m12
    pmaddubsw  m1, m12
    pmaddubsw  m2, m12
    pmaddubsw  m3, m12
    psubw      m0, m8
    psubw      m1, m9
    psubw      m2, m10
    psubw      m3, m11
    SATD_8x4_SSE cpuname, 0, 1, 2, 3, 13, 14, 0, swap
    pmaddwd    m0, [pw_1]
%if cpuflag(sse4)
    pshufd     m1, m0, q0032
%else
    movhlps    m1, m0
%endif
    paddd    xmm0, m0, m1 ; consistent location of return value. only the avx version of hadamard permutes m0, so 3arg is free
    ret

%else ; !ARCH_X86_64
cglobal intra_satd_x9_4x4, 3,4,8
    %assign pad 0x120-gprsize-(stack_offset&15)
    %define fenc_buf rsp
    %define pred_buf rsp+0x40
    %define spill    rsp+0xe0
    sub       rsp, pad
    INTRA_X9_PRED intrax9b, [spill+0x20]
    mova [pred_buf+0x00], m2
    mova [pred_buf+0x10], m3
    mova [pred_buf+0x20], m4
    mova [pred_buf+0x30], m5
    mova [pred_buf+0x40], m6
    mova [pred_buf+0x50], m7
    movd       m4, [r0+0*FENC_STRIDE]
    movd       m5, [r0+1*FENC_STRIDE]
    movd       m6, [r0+2*FENC_STRIDE]
    movd       m0, [r0+3*FENC_STRIDE]
    mova       m7, [hmul_8p]
    pshufd     m4, m4, 0
    pshufd     m5, m5, 0
    pshufd     m6, m6, 0
    pshufd     m0, m0, 0
    pmaddubsw  m4, m7
    pmaddubsw  m5, m7
    pmaddubsw  m6, m7
    pmaddubsw  m0, m7
    mova [fenc_buf+0x00], m4
    mova [fenc_buf+0x10], m5
    mova [fenc_buf+0x20], m6
    mova [fenc_buf+0x30], m0
    movddup    m0, m2
    pshufd     m1, m2, q3232
    movddup    m2, m3
    movhlps    m3, m3
    pmaddubsw  m0, m7
    pmaddubsw  m1, m7
    pmaddubsw  m2, m7
    pmaddubsw  m3, m7
    psubw      m0, m4
    psubw      m1, m5
    psubw      m2, m6
    call .satd_8x4b ; ddr, ddl
    mova       m3, [pred_buf+0x30]
    mova       m1, [pred_buf+0x20]
    movddup    m2, m3
    movhlps    m3, m3
    movq [spill+0x08], m0
    movddup    m0, m1
    movhlps    m1, m1
    call .satd_8x4 ; vr, vl
    mova       m3, [pred_buf+0x50]
    mova       m1, [pred_buf+0x40]
    movddup    m2, m3
    movhlps    m3, m3
    movq [spill+0x10], m0
    movddup    m0, m1
    movhlps    m1, m1
    call .satd_8x4 ; hd, hu
    movq [spill+0x18], m0
    mova       m1, [spill+0x20]
    mova       m4, [fenc_buf+0x00]
    mova       m5, [fenc_buf+0x20]
    mova       m2, [pw_ppmmppmm]
    psignw     m4, m2
    psignw     m5, m2
    paddw      m4, [fenc_buf+0x10]
    paddw      m5, [fenc_buf+0x30]
    INTRA_X9_VHDC 1, 4, 5, 6, 7
    ; find minimum
    movu       m0, [r2+2]
    movd      r3d, m1
    punpckhqdq m1, [spill+0x00]
    packssdw   m1, [spill+0x10]
%if cpuflag(sse4)
    pshufhw    m1, m1, q3120
%else
    pshufhw    m0, m0, q3120
%endif
    paddw      m0, m1
    movzx     r0d, word [r2]
    add       r3d, r0d
    INTRA_X9_END 0, intrax9b
    add       rsp, pad
    RET
RESET_MM_PERMUTATION
ALIGN 16
.satd_8x4:
    pmaddubsw  m0, m7
    pmaddubsw  m1, m7
    pmaddubsw  m2, m7
    pmaddubsw  m3, m7
    %xdefine fenc_buf fenc_buf+gprsize
    psubw      m0, [fenc_buf+0x00]
    psubw      m1, [fenc_buf+0x10]
    psubw      m2, [fenc_buf+0x20]
.satd_8x4b:
    psubw      m3, [fenc_buf+0x30]
    SATD_8x4_SSE cpuname, 0, 1, 2, 3, 4, 5, 0, swap
    pmaddwd    m0, [pw_1]
%if cpuflag(sse4)
    pshufd     m1, m0, q0032
%else
    movhlps    m1, m0
%endif
    paddd    xmm0, m0, m1
    ret
%endif ; ARCH
%endmacro ; INTRA_X9



%macro INTRA8_X9 0
;-----------------------------------------------------------------------------
; int intra_sad_x9_8x8( uint8_t *fenc, uint8_t *fdec, uint8_t edge[36], uint16_t *bitcosts, uint16_t *satds )
;-----------------------------------------------------------------------------
cglobal intra_sad_x9_8x8, 5,6,9
    %define fenc02 m4
    %define fenc13 m5
    %define fenc46 m6
    %define fenc57 m7
%ifdef ARCH_X86_64
    %define tmp m8
    %assign padbase 0x0
%else
    %define tmp [rsp]
    %assign padbase 0x10
%endif
    %assign pad 0x240+0x10+padbase-gprsize-(stack_offset&15)
    %define pred(i,j) [rsp+i*0x40+j*0x10+padbase]

    SUB        rsp, pad
    movq    fenc02, [r0+FENC_STRIDE* 0]
    movq    fenc13, [r0+FENC_STRIDE* 1]
    movq    fenc46, [r0+FENC_STRIDE* 4]
    movq    fenc57, [r0+FENC_STRIDE* 5]
    movhps  fenc02, [r0+FENC_STRIDE* 2]
    movhps  fenc13, [r0+FENC_STRIDE* 3]
    movhps  fenc46, [r0+FENC_STRIDE* 6]
    movhps  fenc57, [r0+FENC_STRIDE* 7]

    ; save instruction size: avoid 4-byte memory offsets
    lea         r0, [intra8x9_h1+128]
    %define off(m) (r0+m-(intra8x9_h1+128))

; v
    movddup     m0, [r2+16]
    mova pred(0,0), m0
    psadbw      m1, m0, fenc02
    mova pred(0,1), m0
    psadbw      m2, m0, fenc13
    mova pred(0,2), m0
    psadbw      m3, m0, fenc46
    mova pred(0,3), m0
    psadbw      m0, m0, fenc57
    paddw       m1, m2
    paddw       m0, m3
    paddw       m0, m1
    movhlps     m1, m0
    paddw       m0, m1
    movd    [r4+0], m0

; h
    movq        m0, [r2+7]
    pshufb      m1, m0, [off(intra8x9_h1)]
    pshufb      m2, m0, [off(intra8x9_h2)]
    mova pred(1,0), m1
    psadbw      m1, fenc02
    mova pred(1,1), m2
    psadbw      m2, fenc13
    paddw       m1, m2
    pshufb      m3, m0, [off(intra8x9_h3)]
    pshufb      m2, m0, [off(intra8x9_h4)]
    mova pred(1,2), m3
    psadbw      m3, fenc46
    mova pred(1,3), m2
    psadbw      m2, fenc57
    paddw       m1, m3
    paddw       m1, m2
    movhlps     m2, m1
    paddw       m1, m2
    movd    [r4+2], m1

    lea         r5, [rsp+padbase+0x100]
    %define pred(i,j) [r5+i*0x40+j*0x10-0x100]

; dc
    movhps      m0, [r2+16]
    pxor        m2, m2
    psadbw      m0, m2
    movhlps     m1, m0
    paddw       m0, m1
    psrlw       m0, 3
    pavgw       m0, m2
    pshufb      m0, m2
    mova pred(2,0), m0
    psadbw      m1, m0, fenc02
    mova pred(2,1), m0
    psadbw      m2, m0, fenc13
    mova pred(2,2), m0
    psadbw      m3, m0, fenc46
    mova pred(2,3), m0
    psadbw      m0, m0, fenc57
    paddw       m1, m2
    paddw       m0, m3
    paddw       m0, m1
    movhlps     m1, m0
    paddw       m0, m1
    movd    [r4+4], m0

; ddl
; Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8
; Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9
; Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA
; Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA FtB
; Ft5 Ft6 Ft7 Ft8 Ft9 FtA FtB FtC
; Ft6 Ft7 Ft8 Ft9 FtA FtB FtC FtD
; Ft7 Ft8 Ft9 FtA FtB FtC FtD FtE
; Ft8 Ft9 FtA FtB FtC FtD FtE FtF
    mova        m0, [r2+16]
    movu        m2, [r2+17]
    pslldq      m1, m0, 1
    pavgb       m3, m0, m2              ; Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7 Gt8 Gt9 GtA GtB ___ ___ ___ ___ ___
    PRED4x4_LOWPASS m0, m1, m2, m0, tmp ; ___ Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA FtB FtC FtD FtE FtF
    pshufb      m1, m0, [off(intra8x9_ddl1)]
    pshufb      m2, m0, [off(intra8x9_ddl2)]
    mova pred(3,0), m1
    psadbw      m1, fenc02
    mova pred(3,1), m2
    psadbw      m2, fenc13
    paddw       m1, m2
    pshufb      m2, m0, [off(intra8x9_ddl3)]
    mova pred(3,2), m2
    psadbw      m2, fenc46
    paddw       m1, m2
    pshufb      m2, m0, [off(intra8x9_ddl4)]
    mova pred(3,3), m2
    psadbw      m2, fenc57
    paddw       m1, m2
    movhlps     m2, m1
    paddw       m1, m2
    movd    [r4+6], m1

; vl
; Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7 Gt8
; Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8
; Gt2 Gt3 Gt4 Gt5 Gt6 Gt7 Gt8 Gt9
; Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9
; Gt3 Gt4 Gt5 Gt6 Gt7 Gt8 Gt9 GtA
; Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA
; Gt4 Gt5 Gt6 Gt7 Gt8 Gt9 GtA GtB
; Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA FtB
    pshufb      m1, m3, [off(intra8x9_vl1)]
    pshufb      m2, m0, [off(intra8x9_vl2)]
    pshufb      m3, m3, [off(intra8x9_vl3)]
    pshufb      m0, m0, [off(intra8x9_vl4)]
    mova pred(7,0), m1
    psadbw      m1, fenc02
    mova pred(7,1), m2
    psadbw      m2, fenc13
    mova pred(7,2), m3
    psadbw      m3, fenc46
    mova pred(7,3), m0
    psadbw      m0, fenc57
    paddw       m1, m2
    paddw       m0, m3
    paddw       m0, m1
    movhlps     m1, m0
    paddw       m0, m1
%if cpuflag(sse4)
    pextrw [r4+14], m0, 0
%else
    movd       r5d, m0
    mov    [r4+14], r5w
    lea         r5, [rsp+padbase+0x100]
%endif

; ddr
; Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Ft6
; Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5
; Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4
; Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3
; Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2
; Fl4 Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1
; Fl5 Fl4 Fl3 Fl2 Fl1 Fl0 Flt Ft0
; Fl6 Fl5 Fl4 Fl3 Fl2 Fl1 Fl0 Flt
    movu        m2, [r2+8]
    movu        m0, [r2+7]
    movu        m1, [r2+6]
    pavgb       m3, m2, m0              ; Gl6 Gl5 Gl4 Gl3 Gl2 Gl1 Gl0 Glt Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7
    PRED4x4_LOWPASS m0, m1, m2, m0, tmp ; Fl7 Fl6 Fl5 Fl4 Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Ft6
    pshufb      m1, m0, [off(intra8x9_ddr1)]
    pshufb      m2, m0, [off(intra8x9_ddr2)]
    mova pred(4,0), m1
    psadbw      m1, fenc02
    mova pred(4,1), m2
    psadbw      m2, fenc13
    paddw       m1, m2
    pshufb      m2, m0, [off(intra8x9_ddr3)]
    mova pred(4,2), m2
    psadbw      m2, fenc46
    paddw       m1, m2
    pshufb      m2, m0, [off(intra8x9_ddr4)]
    mova pred(4,3), m2
    psadbw      m2, fenc57
    paddw       m1, m2
    movhlps     m2, m1
    paddw       m1, m2
    movd    [r4+8], m1

    add         r0, 256
    add         r5, 0xC0
    %define off(m) (r0+m-(intra8x9_h1+256+128))
    %define pred(i,j) [r5+i*0x40+j*0x10-0x1C0]

; vr
; Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7
; Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Ft6
; Fl0 Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 Gt6
; Fl1 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5
; Fl2 Fl0 Gt0 Gt1 Gt2 Gt3 Gt4 Gt5
; Fl3 Fl1 Flt Ft0 Ft1 Ft2 Ft3 Ft4
; Fl4 Fl2 Fl0 Gt0 Gt1 Gt2 Gt3 Gt4
; Fl5 Fl3 Fl1 Flt Ft0 Ft1 Ft2 Ft3
    movsd       m2, m3, m0 ; Fl7 Fl6 Fl5 Fl4 Fl3 Fl2 Fl1 Fl0 Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7
    pshufb      m1, m2, [off(intra8x9_vr1)]
    pshufb      m2, m2, [off(intra8x9_vr3)]
    mova pred(5,0), m1
    psadbw      m1, fenc02
    mova pred(5,2), m2
    psadbw      m2, fenc46
    paddw       m1, m2
    pshufb      m2, m0, [off(intra8x9_vr2)]
    mova pred(5,1), m2
    psadbw      m2, fenc13
    paddw       m1, m2
    pshufb      m2, m0, [off(intra8x9_vr4)]
    mova pred(5,3), m2
    psadbw      m2, fenc57
    paddw       m1, m2
    movhlps     m2, m1
    paddw       m1, m2
    movd   [r4+10], m1

; hd
; Glt Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5
; Gl0 Fl0 Glt Flt Ft0 Ft1 Ft2 Ft3
; Gl1 Fl1 Gl0 Fl0 Glt Flt Ft0 Ft1
; Gl2 Fl2 Gl1 Fl1 Gl0 Fl0 Glt Flt
; Gl3 Fl3 Gl2 Fl2 Gl1 Fl1 Gl0 Fl0
; Gl4 Fl4 Gl3 Fl3 Gl2 Fl2 Gl1 Fl1
; Gl5 Fl5 Gl4 Fl4 Gl3 Fl3 Gl2 Fl2
; Gl6 Fl6 Gl5 Fl5 Gl4 Fl4 Gl3 Fl3
    pshufd      m2, m3, q0001
%if cpuflag(sse4)
    pblendw     m2, m0, q3330 ; Gl2 Gl1 Gl0 Glt ___ Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 ___
%else
    movss       m1, m0, m2
    SWAP        1, 2
%endif
    punpcklbw   m0, m3        ; Fl7 Gl6 Fl6 Gl5 Fl5 Gl4 Fl4 Gl3 Fl3 Gl2 Fl2 Gl1 Fl1 Gl0 Fl0 ___
    pshufb      m1, m2, [off(intra8x9_hd1)]
    pshufb      m2, m2, [off(intra8x9_hd2)]
    mova pred(6,0), m1
    psadbw      m1, fenc02
    mova pred(6,1), m2
    psadbw      m2, fenc13
    paddw       m1, m2
    pshufb      m2, m0, [off(intra8x9_hd3)]
    pshufb      m3, m0, [off(intra8x9_hd4)]
    mova pred(6,2), m2
    psadbw      m2, fenc46
    mova pred(6,3), m3
    psadbw      m3, fenc57
    paddw       m1, m2
    paddw       m1, m3
    movhlps     m2, m1
    paddw       m1, m2
    ; don't just store to [r4+12]. this is too close to the load of dqword [r4] and would cause a forwarding stall
    pslldq      m1, 12
    SWAP        3, 1

; hu
; Gl0 Fl1 Gl1 Fl2 Gl2 Fl3 Gl3 Fl4
; Gl1 Fl2 Gl2 Fl3 Gl3 Fl4 Gl4 Fl5
; Gl2 Fl3 Gl3 Fl4 Gl4 Fl5 Gl5 Fl6
; Gl3 Fl4 Gl4 Fl5 Gl5 Fl6 Gl6 Fl7
; Gl4 Fl5 Gl5 Fl6 Gl6 Fl7 Gl7 Gl7
; Gl5 Fl6 Gl6 Fl7 Gl7 Gl7 Gl7 Gl7
; Gl6 Fl7 Gl7 Gl7 Gl7 Gl7 Gl7 Gl7
; Gl7 Gl7 Gl7 Gl7 Gl7 Gl7 Gl7 Gl7
%if cpuflag(sse4)
    pinsrb      m0, [r2+7], 15 ; Gl7
%else
    movd        m1, [r2+7]
    pslldq      m0, 1
    palignr     m1, m0, 1
    SWAP        0, 1
%endif
    pshufb      m1, m0, [off(intra8x9_hu1)]
    pshufb      m2, m0, [off(intra8x9_hu2)]
    mova pred(8,0), m1
    psadbw      m1, fenc02
    mova pred(8,1), m2
    psadbw      m2, fenc13
    paddw       m1, m2
    pshufb      m2, m0, [off(intra8x9_hu3)]
    pshufb      m0, m0, [off(intra8x9_hu4)]
    mova pred(8,2), m2
    psadbw      m2, fenc46
    mova pred(8,3), m0
    psadbw      m0, fenc57
    paddw       m1, m2
    paddw       m1, m0
    movhlps     m2, m1
    paddw       m1, m2
    movd       r2d, m1

    movu        m0, [r3]
    por         m3, [r4]
    paddw       m0, m3
    mova      [r4], m0
    movzx      r5d, word [r3+16]
    add        r2d, r5d
    mov    [r4+16], r2w

%if cpuflag(sse4)
    phminposuw m0, m0 ; v,h,dc,ddl,ddr,vr,hd,vl
    movd      eax, m0
%else
    ; 8x8 sad is up to 14 bits; +bitcosts and saturate -> 14 bits; pack with 2 bit index
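    ; a scalar sketch of the same pack-with-index trick (not the exact bit
    ; layout used here): free up the low bits of each cost, stuff the
    ; candidate's index there, and a single minimum yields both at once:
    ;     unsigned best = UINT_MAX;
    ;     for( int i = 0; i < 8; i++ )
    ;         best = X264_MIN( best, (cost[i]<<3) + i );
    ;     /* best>>3 = min cost, best&7 = winning mode */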
    paddusw    m0, m0
    paddusw    m0, m0
    paddw      m0, [off(pw_s00112233)]
    movhlps    m1, m0
    pminsw     m0, m1
    pshuflw    m1, m0, q0032
    pminsw     m0, m1
    movd      eax, m0
    ; repack with 3 bit index
    xor       eax, 0x80008000
    movzx     r3d, ax
    shr       eax, 15
    add       r3d, r3d
    or        eax, 1
    cmp       eax, r3d
    cmovg     eax, r3d
    ; reverse to phminposuw order
    mov       r3d, eax
    and       eax, 7
    shr       r3d, 3
    shl       eax, 16
    or        eax, r3d
%endif
    add       r2d, 8<<16
    cmp        ax, r2w
    cmovg     eax, r2d
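    ; eax = (best mode << 16) | best cost; the mode then indexes pred_buf
    ; (0x40 bytes per mode) to copy the winning prediction into fdec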

    mov       r2d, eax
    shr       r2d, 16
    shl       r2d, 6
    add        r1, 4*FDEC_STRIDE
    mova       m0, [rsp+padbase+r2+0x00]
    mova       m1, [rsp+padbase+r2+0x10]
    mova       m2, [rsp+padbase+r2+0x20]
    mova       m3, [rsp+padbase+r2+0x30]
    movq   [r1+FDEC_STRIDE*-4], m0
    movhps [r1+FDEC_STRIDE*-2], m0
    movq   [r1+FDEC_STRIDE*-3], m1
    movhps [r1+FDEC_STRIDE*-1], m1
    movq   [r1+FDEC_STRIDE* 0], m2
    movhps [r1+FDEC_STRIDE* 2], m2
    movq   [r1+FDEC_STRIDE* 1], m3
    movhps [r1+FDEC_STRIDE* 3], m3
    ADD       rsp, pad
    RET

%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
; int intra_sa8d_x9_8x8( uint8_t *fenc, uint8_t *fdec, uint8_t edge[36], uint16_t *bitcosts, uint16_t *satds )
;-----------------------------------------------------------------------------
cglobal intra_sa8d_x9_8x8, 5,6,16
    %assign pad 0x2c0+0x10-gprsize-(stack_offset&15)
    %define fenc_buf rsp
    %define pred_buf rsp+0x80
    SUB        rsp, pad
    mova       m15, [hmul_8p]
    pxor        m8, m8
%assign %%i 0
%rep 8
    movddup     m %+ %%i, [r0+%%i*FENC_STRIDE]
    pmaddubsw   m9, m %+ %%i, m15
    punpcklbw   m %+ %%i, m8
    mova [fenc_buf+%%i*0x10], m9
%assign %%i %%i+1
%endrep

    ; save instruction size: avoid 4-byte memory offsets
    lea         r0, [intra8x9_h1+0x80]
    %define off(m) (r0+m-(intra8x9_h1+0x80))
    lea         r5, [pred_buf+0x80]

; v, h, dc
    HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 8
    pabsw      m11, m1
%assign %%i 2
%rep 6
    pabsw       m8, m %+ %%i
    paddw      m11, m8
%assign %%i %%i+1
%endrep

    ; 1D hadamard of edges
    movq        m8, [r2+7]
    movddup     m9, [r2+16]
    mova [r5-0x80], m9
    mova [r5-0x70], m9
    mova [r5-0x60], m9
    mova [r5-0x50], m9
    punpcklwd   m8, m8
    pshufb      m9, [intrax3_shuf]
    pmaddubsw   m8, [pb_pppm]
    pmaddubsw   m9, [pb_pppm]
    HSUMSUB2 psignw, m8, m9, m12, m13, m9, q1032, [pw_ppppmmmm]
    HSUMSUB2 psignw, m8, m9, m12, m13, m9, q2301, [pw_ppmmppmm]

    ; dc
    paddw      m10, m8, m9
    paddw      m10, [pw_8]
    pand       m10, [sw_f0]
    psrlw      m12, m10, 4
    psllw      m10, 2
    pxor       m13, m13
    pshufb     m12, m13
    mova [r5+0x00], m12
    mova [r5+0x10], m12
    mova [r5+0x20], m12
    mova [r5+0x30], m12

    ; differences
    psllw       m8, 3 ; left edge
    psubw       m8, m0
    psubw      m10, m0
    pabsw       m8, m8 ; 1x8 sum
    pabsw      m10, m10
    paddw       m8, m11
    paddw      m11, m10
    punpcklwd   m0, m1
    punpcklwd   m2, m3
    punpcklwd   m4, m5
    punpcklwd   m6, m7
    punpckldq   m0, m2
    punpckldq   m4, m6
    punpcklqdq  m0, m4 ; transpose
    psllw       m9, 3  ; top edge
    psrldq     m10, m11, 2 ; 8x7 sum
    psubw       m0, m9 ; 8x1 sum
    pabsw       m0, m0
    paddw      m10, m0

    phaddd     m10, m8 ; logically phaddw, but this is faster and it won't overflow
    psrlw      m11, 1
    psrlw      m10, 1

; store h
    movq        m3, [r2+7]
    pshufb      m0, m3, [off(intra8x9_h1)]
    pshufb      m1, m3, [off(intra8x9_h2)]
    pshufb      m2, m3, [off(intra8x9_h3)]
    pshufb      m3, m3, [off(intra8x9_h4)]
    mova [r5-0x40], m0
    mova [r5-0x30], m1
    mova [r5-0x20], m2
    mova [r5-0x10], m3

; ddl
    mova        m8, [r2+16]
    movu        m2, [r2+17]
    pslldq      m1, m8, 1
    pavgb       m9, m8, m2
    PRED4x4_LOWPASS m8, m1, m2, m8, m3
    pshufb      m0, m8, [off(intra8x9_ddl1)]
    pshufb      m1, m8, [off(intra8x9_ddl2)]
    pshufb      m2, m8, [off(intra8x9_ddl3)]
    pshufb      m3, m8, [off(intra8x9_ddl4)]
    add         r5, 0x40
    call .sa8d
    phaddd     m11, m0

; vl
    pshufb      m0, m9, [off(intra8x9_vl1)]
    pshufb      m1, m8, [off(intra8x9_vl2)]
    pshufb      m2, m9, [off(intra8x9_vl3)]
    pshufb      m3, m8, [off(intra8x9_vl4)]
    add         r5, 0x100
    call .sa8d
    phaddd     m10, m11
    mova       m12, m0

; ddr
    movu        m2, [r2+8]
    movu        m8, [r2+7]
    movu        m1, [r2+6]
    pavgb       m9, m2, m8
    PRED4x4_LOWPASS m8, m1, m2, m8, m3
    pshufb      m0, m8, [off(intra8x9_ddr1)]
    pshufb      m1, m8, [off(intra8x9_ddr2)]
    pshufb      m2, m8, [off(intra8x9_ddr3)]
    pshufb      m3, m8, [off(intra8x9_ddr4)]
    sub         r5, 0xc0
    call .sa8d
    mova       m11, m0

    add         r0, 0x100
    %define off(m) (r0+m-(intra8x9_h1+0x180))

; vr
    movsd       m2, m9, m8
    pshufb      m0, m2, [off(intra8x9_vr1)]
    pshufb      m1, m8, [off(intra8x9_vr2)]
    pshufb      m2, m2, [off(intra8x9_vr3)]
    pshufb      m3, m8, [off(intra8x9_vr4)]
    add         r5, 0x40
    call .sa8d
    phaddd     m11, m0

; hd
%if cpuflag(sse4)
    pshufd      m1, m9, q0001
    pblendw     m1, m8, q3330
%else
    pshufd      m2, m9, q0001
    movss       m1, m8, m2
%endif
    punpcklbw   m8, m9
    pshufb      m0, m1, [off(intra8x9_hd1)]
    pshufb      m1, m1, [off(intra8x9_hd2)]
    pshufb      m2, m8, [off(intra8x9_hd3)]
    pshufb      m3, m8, [off(intra8x9_hd4)]
    add         r5, 0x40
    call .sa8d
    phaddd      m0, m12
    phaddd     m11, m0

; hu
%if cpuflag(sse4)
    pinsrb      m8, [r2+7], 15
%else
    movd        m9, [r2+7]
    pslldq      m8, 1
    palignr     m9, m8, 1
    SWAP        8, 9
%endif
    pshufb      m0, m8, [off(intra8x9_hu1)]
    pshufb      m1, m8, [off(intra8x9_hu2)]
    pshufb      m2, m8, [off(intra8x9_hu3)]
    pshufb      m3, m8, [off(intra8x9_hu4)]
    add         r5, 0x80
    call .sa8d

    pmaddwd     m0, [pw_1]
    phaddw     m10, m11
    movhlps     m1, m0
    paddw       m0, m1
    pshuflw     m1, m0, q0032
    pavgw       m0, m1
    pxor        m2, m2
    pavgw      m10, m2
    movd       r2d, m0

    movu        m0, [r3]
    paddw       m0, m10
    mova      [r4], m0
    movzx      r5d, word [r3+16]
    add        r2d, r5d
    mov    [r4+16], r2w

%if cpuflag(sse4)
    phminposuw m0, m0
    movd      eax, m0
%else
    ; 8x8 sa8d is up to 15 bits; +bitcosts and saturate -> 15 bits; pack with 1 bit index
    paddusw    m0, m0
    paddw      m0, [off(pw_s00001111)]
    movhlps    m1, m0
    pminsw     m0, m1
    pshuflw    m1, m0, q0032
    mova       m2, m0
    pminsw     m0, m1
    pcmpgtw    m2, m1 ; 2nd index bit
    movd      r3d, m0
    movd      r4d, m2
    ; repack with 3 bit index
    xor       r3d, 0x80008000
    and       r4d, 0x00020002
    movzx     eax, r3w
    movzx     r5d, r4w
    shr       r3d, 16
    shr       r4d, 16
    lea       eax, [rax*4+r5]
    lea       r3d, [ r3*4+r4+1]
    cmp       eax, r3d
    cmovg     eax, r3d
    ; reverse to phminposuw order
    mov       r3d, eax
    and       eax, 7
    shr       r3d, 3
    shl       eax, 16
    or        eax, r3d
%endif
    add       r2d, 8<<16
    cmp        ax, r2w
    cmovg     eax, r2d

    mov       r2d, eax
    shr       r2d, 16
    shl       r2d, 6
    add        r1, 4*FDEC_STRIDE
    mova       m0, [pred_buf+r2+0x00]
    mova       m1, [pred_buf+r2+0x10]
    mova       m2, [pred_buf+r2+0x20]
    mova       m3, [pred_buf+r2+0x30]
    movq   [r1+FDEC_STRIDE*-4], m0
    movhps [r1+FDEC_STRIDE*-2], m0
    movq   [r1+FDEC_STRIDE*-3], m1
    movhps [r1+FDEC_STRIDE*-1], m1
    movq   [r1+FDEC_STRIDE* 0], m2
    movhps [r1+FDEC_STRIDE* 2], m2
    movq   [r1+FDEC_STRIDE* 1], m3
    movhps [r1+FDEC_STRIDE* 3], m3
    ADD       rsp, pad
    RET

ALIGN 16
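; in:  m0-m3 = one mode's 8x8 prediction (two rows per register),
;      r5 = this mode's pred_buf slot, m15 = hmul_8p
; out: m0 = per-lane sa8d sums (reduced by the caller); also stores the
;      prediction to [r5] for the final copy into fdec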
.sa8d:
    %xdefine mret m0
    %xdefine fenc_buf fenc_buf+gprsize
    mova [r5+0x00], m0
    mova [r5+0x10], m1
    mova [r5+0x20], m2
    mova [r5+0x30], m3
    movddup     m4, m0
    movddup     m5, m1
    movddup     m6, m2
    movddup     m7, m3
    punpckhqdq  m0, m0
    punpckhqdq  m1, m1
    punpckhqdq  m2, m2
    punpckhqdq  m3, m3
    PERMUTE 0,4, 1,5, 2,0, 3,1, 4,6, 5,7, 6,2, 7,3
    pmaddubsw   m0, m15
    pmaddubsw   m1, m15
    psubw       m0, [fenc_buf+0x00]
    psubw       m1, [fenc_buf+0x10]
    pmaddubsw   m2, m15
    pmaddubsw   m3, m15
    psubw       m2, [fenc_buf+0x20]
    psubw       m3, [fenc_buf+0x30]
    pmaddubsw   m4, m15
    pmaddubsw   m5, m15
    psubw       m4, [fenc_buf+0x40]
    psubw       m5, [fenc_buf+0x50]
    pmaddubsw   m6, m15
    pmaddubsw   m7, m15
    psubw       m6, [fenc_buf+0x60]
    psubw       m7, [fenc_buf+0x70]
    HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 13, 14
    paddw       m0, m1
    paddw       m0, m2
    paddw mret, m0, m3
    ret
%endif ; ARCH_X86_64
%endmacro ; INTRA8_X9

; in:  r0=pix, r1=stride, r2=stride*3, r3=tmp, m6=mask_ac4, m7=0
; out: [tmp]=hadamard4, m0=satd
INIT_MMX mmx2
cglobal hadamard_ac_4x4
%ifdef HIGH_BIT_DEPTH
    mova      m0, [r0]
    mova      m1, [r0+r1]
    mova      m2, [r0+r1*2]
    mova      m3, [r0+r2]
%else ; !HIGH_BIT_DEPTH
    movh      m0, [r0]
    movh      m1, [r0+r1]
    movh      m2, [r0+r1*2]
    movh      m3, [r0+r2]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
%endif ; HIGH_BIT_DEPTH
    HADAMARD4_2D 0, 1, 2, 3, 4
    mova [r3],    m0
    mova [r3+8],  m1
    mova [r3+16], m2
    mova [r3+24], m3
    ABSW      m0, m0, m4
    ABSW      m1, m1, m4
    pand      m0, m6
    ABSW      m2, m2, m4
    ABSW      m3, m3, m4
    paddw     m0, m1
    paddw     m2, m3
    paddw     m0, m2
    SAVE_MM_PERMUTATION
    ret

cglobal hadamard_ac_2x2max
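; in:  r3 = hadamard4 buffer from hadamard_ac_4x4 (stepped back 8 bytes per
;      call), m7 (m6 for high bit depth) = running sum
; out: the max of each 2x2 coefficient quad accumulated into the running sum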
    mova      m0, [r3+0x00]
    mova      m1, [r3+0x20]
    mova      m2, [r3+0x40]
    mova      m3, [r3+0x60]
    sub       r3, 8
    SUMSUB_BADC w, 0, 1, 2, 3, 4
    ABSW2 m0, m2, m0, m2, m4, m5
    ABSW2 m1, m3, m1, m3, m4, m5
    HADAMARD 0, max, 0, 2, 4, 5
    HADAMARD 0, max, 1, 3, 4, 5
%ifdef HIGH_BIT_DEPTH
    pmaddwd   m0, m7
    pmaddwd   m1, m7
    paddd     m6, m0
    paddd     m6, m1
%else ; !HIGH_BIT_DEPTH
    paddw     m7, m0
    paddw     m7, m1
%endif ; HIGH_BIT_DEPTH
    SAVE_MM_PERMUTATION
    ret

%macro AC_PREP 2
%ifdef HIGH_BIT_DEPTH
    pmaddwd %1, %2
%endif
%endmacro

%macro AC_PADD 3
%ifdef HIGH_BIT_DEPTH
    AC_PREP %2, %3
    paddd   %1, %2
%else
    paddw   %1, %2
%endif ; HIGH_BIT_DEPTH
%endmacro

cglobal hadamard_ac_8x8
    mova      m6, [mask_ac4]
%ifdef HIGH_BIT_DEPTH
    mova      m7, [pw_1]
%else
    pxor      m7, m7
%endif ; HIGH_BIT_DEPTH
    call hadamard_ac_4x4_mmx2
    add       r0, 4*SIZEOF_PIXEL
    add       r3, 32
    mova      m5, m0
    AC_PREP   m5, m7
    call hadamard_ac_4x4_mmx2
    lea       r0, [r0+4*r1]
    add       r3, 64
    AC_PADD   m5, m0, m7
    call hadamard_ac_4x4_mmx2
    sub       r0, 4*SIZEOF_PIXEL
    sub       r3, 32
    AC_PADD   m5, m0, m7
    call hadamard_ac_4x4_mmx2
    AC_PADD   m5, m0, m7
    sub       r3, 40
    mova [rsp+gprsize+8], m5 ; save satd
%ifdef HIGH_BIT_DEPTH
    pxor      m6, m6
%endif
%rep 3
    call hadamard_ac_2x2max_mmx2
%endrep
    mova      m0, [r3+0x00]
    mova      m1, [r3+0x20]
    mova      m2, [r3+0x40]
    mova      m3, [r3+0x60]
    SUMSUB_BADC w, 0, 1, 2, 3, 4
    HADAMARD 0, sumsub, 0, 2, 4, 5
    ABSW2 m1, m3, m1, m3, m4, m5
    ABSW2 m0, m2, m0, m2, m4, m5
    HADAMARD 0, max, 1, 3, 4, 5
%ifdef HIGH_BIT_DEPTH
    pand      m0, [mask_ac4]
    pmaddwd   m1, m7
    pmaddwd   m0, m7
    pmaddwd   m2, m7
    paddd     m6, m1
    paddd     m0, m2
    paddd     m6, m6
    paddd     m0, m6
    SWAP       0,  6
%else ; !HIGH_BIT_DEPTH
    pand      m6, m0
    paddw     m7, m1
    paddw     m6, m2
    paddw     m7, m7
    paddw     m6, m7
%endif ; HIGH_BIT_DEPTH
    mova [rsp+gprsize], m6 ; save sa8d
    SWAP       0,  6
    SAVE_MM_PERMUTATION
    ret

%macro HADAMARD_AC_WXH_SUM_MMX 2
    mova    m1, [rsp+1*mmsize]
%ifdef HIGH_BIT_DEPTH
%if %1*%2 >= 128
    paddd   m0, [rsp+2*mmsize]
    paddd   m1, [rsp+3*mmsize]
%endif
%if %1*%2 == 256
    mova    m2, [rsp+4*mmsize]
    paddd   m1, [rsp+5*mmsize]
    paddd   m2, [rsp+6*mmsize]
    mova    m3, m0
    paddd   m1, [rsp+7*mmsize]
    paddd   m0, m2
%endif
    psrld   m0, 1
    HADDD   m0, m2
    psrld   m1, 1
    HADDD   m1, m3
%else ; !HIGH_BIT_DEPTH
%if %1*%2 >= 128
    paddusw m0, [rsp+2*mmsize]
    paddusw m1, [rsp+3*mmsize]
%endif
%if %1*%2 == 256
    mova    m2, [rsp+4*mmsize]
    paddusw m1, [rsp+5*mmsize]
    paddusw m2, [rsp+6*mmsize]
    mova    m3, m0
    paddusw m1, [rsp+7*mmsize]
    pxor    m3, m2
    pand    m3, [pw_1]
    pavgw   m0, m2
    psubusw m0, m3
    HADDUW  m0, m2
%else
    psrlw   m0, 1
    HADDW   m0, m2
%endif
    psrlw   m1, 1
    HADDW   m1, m3
%endif ; HIGH_BIT_DEPTH
%endmacro

%macro HADAMARD_AC_WXH_MMX 2
cglobal pixel_hadamard_ac_%1x%2, 2,4
    %assign pad 16-gprsize-(stack_offset&15)
    %define ysub r1
    FIX_STRIDES r1
    sub  rsp, 16+128+pad
    lea  r2, [r1*3]
    lea  r3, [rsp+16]
    call hadamard_ac_8x8_mmx2
%if %2==16
    %define ysub r2
    lea  r0, [r0+r1*4]
    sub  rsp, 16
    call hadamard_ac_8x8_mmx2
%endif
%if %1==16
    neg  ysub
    sub  rsp, 16
    lea  r0, [r0+ysub*4+8*SIZEOF_PIXEL]
    neg  ysub
    call hadamard_ac_8x8_mmx2
%if %2==16
    lea  r0, [r0+r1*4]
    sub  rsp, 16
    call hadamard_ac_8x8_mmx2
%endif
%endif
    HADAMARD_AC_WXH_SUM_MMX %1, %2
    movd edx, m0
    movd eax, m1
    shr  edx, 1
%ifdef ARCH_X86_64
    shl  rdx, 32
    add  rax, rdx
%endif
    add  rsp, 128+%1*%2/4+pad
    RET
%endmacro ; HADAMARD_AC_WXH_MMX

HADAMARD_AC_WXH_MMX 16, 16
HADAMARD_AC_WXH_MMX  8, 16
HADAMARD_AC_WXH_MMX 16,  8
HADAMARD_AC_WXH_MMX  8,  8

%macro LOAD_INC_8x4W_SSE2 5
%ifdef HIGH_BIT_DEPTH
    movu      m%1, [r0]
    movu      m%2, [r0+r1]
    movu      m%3, [r0+r1*2]
    movu      m%4, [r0+r2]
%ifidn %1, 0
    lea       r0, [r0+r1*4]
%endif
%else ; !HIGH_BIT_DEPTH
    movh      m%1, [r0]
    movh      m%2, [r0+r1]
    movh      m%3, [r0+r1*2]
    movh      m%4, [r0+r2]
%ifidn %1, 0
    lea       r0, [r0+r1*4]
%endif
    punpcklbw m%1, m%5
    punpcklbw m%2, m%5
    punpcklbw m%3, m%5
    punpcklbw m%4, m%5
%endif ; HIGH_BIT_DEPTH
%endmacro

%macro LOAD_INC_8x4W_SSSE3 5
    LOAD_DUP_4x8P %3, %4, %1, %2, [r0+r1*2], [r0+r2], [r0], [r0+r1]
%ifidn %1, 0
    lea       r0, [r0+r1*4]
%endif
    HSUMSUB %1, %2, %3, %4, %5
%endmacro

%macro HADAMARD_AC_SSE2 0
; in:  r0=pix, r1=stride, r2=stride*3
; out: [esp+16]=sa8d, [esp+32]=satd, r0+=stride*4
cglobal hadamard_ac_8x8
%ifdef ARCH_X86_64
    %define spill0 m8
    %define spill1 m9
    %define spill2 m10
%else
    %define spill0 [rsp+gprsize]
    %define spill1 [rsp+gprsize+16]
    %define spill2 [rsp+gprsize+32]
%endif
%ifdef HIGH_BIT_DEPTH
    %define vertical 1
%elif cpuflag(ssse3)
    %define vertical 0
    ;LOAD_INC loads sumsubs
    mova      m7, [hmul_8p]
%else
    %define vertical 1
    ;LOAD_INC only unpacks to words
    pxor      m7, m7
%endif
    LOAD_INC_8x4W 0, 1, 2, 3, 7
%if vertical
    HADAMARD4_2D_SSE 0, 1, 2, 3, 4
%else
    HADAMARD4_V 0, 1, 2, 3, 4
%endif
    mova  spill0, m1
    SWAP 1, 7
    LOAD_INC_8x4W 4, 5, 6, 7, 1
%if vertical
    HADAMARD4_2D_SSE 4, 5, 6, 7, 1
%else
    HADAMARD4_V 4, 5, 6, 7, 1
    ; FIXME SWAP
    mova      m1, spill0
    mova      spill0, m6
    mova      spill1, m7
    HADAMARD 1, sumsub, 0, 1, 6, 7
    HADAMARD 1, sumsub, 2, 3, 6, 7
    mova      m6, spill0
    mova      m7, spill1
    mova      spill0, m1
    mova      spill1, m0
    HADAMARD 1, sumsub, 4, 5, 1, 0
    HADAMARD 1, sumsub, 6, 7, 1, 0
    mova      m0, spill1
%endif
    mova  spill1, m2
    mova  spill2, m3
    ABSW      m1, m0, m0
    ABSW      m2, m4, m4
    ABSW      m3, m5, m5
    paddw     m1, m2
    SUMSUB_BA w, 0, 4
%if vertical
    pand      m1, [mask_ac4]
%else
    pand      m1, [mask_ac4b]
%endif
    AC_PREP   m1, [pw_1]
    ABSW      m2, spill0
    AC_PADD   m1, m3, [pw_1]
    ABSW      m3, spill1
    AC_PADD   m1, m2, [pw_1]
    ABSW      m2, spill2
    AC_PADD   m1, m3, [pw_1]
    ABSW      m3, m6, m6
    AC_PADD   m1, m2, [pw_1]
    ABSW      m2, m7, m7
    AC_PADD   m1, m3, [pw_1]
    mova      m3, m7
    AC_PADD   m1, m2, [pw_1]
    mova      m2, m6
    psubw     m7, spill2
    paddw     m3, spill2
    mova  [rsp+gprsize+32], m1 ; save satd
    mova      m1, m5
    psubw     m6, spill1
    paddw     m2, spill1
    psubw     m5, spill0
    paddw     m1, spill0
    %assign %%x 2
%if vertical
    %assign %%x 4
%endif
    mova  spill1, m4
    HADAMARD %%x, amax, 3, 7, 4
    HADAMARD %%x, amax, 2, 6, 7, 4
    mova      m4, spill1
    HADAMARD %%x, amax, 1, 5, 6, 7
    HADAMARD %%x, sumsub, 0, 4, 5, 6
    AC_PREP   m2, [pw_1]
    AC_PADD   m2, m3, [pw_1]
    AC_PADD   m2, m1, [pw_1]
%ifdef HIGH_BIT_DEPTH
    paddd     m2, m2
%else
    paddw     m2, m2
%endif ; HIGH_BIT_DEPTH
    ABSW      m4, m4, m7
    pand      m0, [mask_ac8]
    ABSW      m0, m0, m7
    AC_PADD   m2, m4, [pw_1]
    AC_PADD   m2, m0, [pw_1]
    mova [rsp+gprsize+16], m2 ; save sa8d
    SWAP       0, 2
    SAVE_MM_PERMUTATION
    ret

HADAMARD_AC_WXH_SSE2 16, 16
HADAMARD_AC_WXH_SSE2  8, 16
HADAMARD_AC_WXH_SSE2 16,  8
HADAMARD_AC_WXH_SSE2  8,  8
%endmacro ; HADAMARD_AC_SSE2

%macro HADAMARD_AC_WXH_SUM_SSE2 2
    mova    m1, [rsp+2*mmsize]
%ifdef HIGH_BIT_DEPTH
%if %1*%2 >= 128
    paddd   m0, [rsp+3*mmsize]
    paddd   m1, [rsp+4*mmsize]
%endif
%if %1*%2 == 256
    paddd   m0, [rsp+5*mmsize]
    paddd   m1, [rsp+6*mmsize]
    paddd   m0, [rsp+7*mmsize]
    paddd   m1, [rsp+8*mmsize]
    psrld   m0, 1
%endif
    HADDD   m0, m2
    HADDD   m1, m3
%else ; !HIGH_BIT_DEPTH
%if %1*%2 >= 128
    paddusw m0, [rsp+3*mmsize]
    paddusw m1, [rsp+4*mmsize]
%endif
%if %1*%2 == 256
    paddusw m0, [rsp+5*mmsize]
    paddusw m1, [rsp+6*mmsize]
    paddusw m0, [rsp+7*mmsize]
    paddusw m1, [rsp+8*mmsize]
    psrlw   m0, 1
%endif
    HADDUW  m0, m2
    HADDW   m1, m3
%endif ; HIGH_BIT_DEPTH
%endmacro

; struct { int satd, sa8d; } pixel_hadamard_ac_16x16( uint8_t *pix, int stride )
%macro HADAMARD_AC_WXH_SSE2 2
cglobal pixel_hadamard_ac_%1x%2, 2,3,11
    %assign pad 16-gprsize-(stack_offset&15)
    %define ysub r1
    FIX_STRIDES r1
    sub  rsp, 48+pad
    lea  r2, [r1*3]
    call hadamard_ac_8x8
%if %2==16
    %define ysub r2
    lea  r0, [r0+r1*4]
    sub  rsp, 32
    call hadamard_ac_8x8
%endif
%if %1==16
    neg  ysub
    sub  rsp, 32
    lea  r0, [r0+ysub*4+8*SIZEOF_PIXEL]
    neg  ysub
    call hadamard_ac_8x8
%if %2==16
    lea  r0, [r0+r1*4]
    sub  rsp, 32
    call hadamard_ac_8x8
%endif
%endif
    HADAMARD_AC_WXH_SUM_SSE2 %1, %2
    movd edx, m0
    movd eax, m1
    shr  edx, 2 - (%1*%2 >> 8)
    shr  eax, 1
%ifdef ARCH_X86_64
    shl  rdx, 32
    add  rax, rdx
%endif
    add  rsp, 16+%1*%2/2+pad
    RET
%endmacro ; HADAMARD_AC_WXH_SSE2
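; the packed return value decodes as follows (a sketch; cf. the prototype
; comment above):
;     uint64_t sums = pixel_hadamard_ac_16x16( pix, stride );
;     int satd = (uint32_t)sums;     /* low dword  */
;     int sa8d = (int)(sums>>32);    /* high dword */
; (x86_32 returns the same two halves in eax/edx)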

; instantiate satds

%ifndef ARCH_X86_64
cextern pixel_sa8d_8x8_internal_mmx2
INIT_MMX mmx2
SA8D
%endif

%define TRANS TRANS_SSE2
%define DIFFOP DIFF_UNPACK_SSE2
%define LOAD_INC_8x4W LOAD_INC_8x4W_SSE2
%define LOAD_SUMSUB_8x4P LOAD_DIFF_8x4P
%define LOAD_SUMSUB_16P  LOAD_SUMSUB_16P_SSE2
%define movdqa movaps ; doesn't hurt pre-nehalem, might as well save size
%define movdqu movups
%define punpcklqdq movlhps
INIT_XMM sse2
SA8D
SATDS_SSE2
%ifndef HIGH_BIT_DEPTH
INTRA_SA8D_SSE2
INIT_MMX mmx2
INTRA_X3_MMX
%endif
INIT_XMM sse2
HADAMARD_AC_SSE2

%define DIFFOP DIFF_SUMSUB_SSSE3
%define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE
%ifndef HIGH_BIT_DEPTH
%define LOAD_INC_8x4W LOAD_INC_8x4W_SSSE3
%define LOAD_SUMSUB_8x4P LOAD_SUMSUB_8x4P_SSSE3
%define LOAD_SUMSUB_16P  LOAD_SUMSUB_16P_SSSE3
%endif
INIT_XMM ssse3
SATDS_SSE2
SA8D
HADAMARD_AC_SSE2
%ifndef HIGH_BIT_DEPTH
INTRA_X9
INTRA8_X9
%endif
%undef movdqa ; nehalem doesn't like movaps
%undef movdqu ; movups
%undef punpcklqdq ; or movlhps
%ifndef HIGH_BIT_DEPTH
INIT_MMX ssse3
INTRA_X3_MMX
%endif

%define TRANS TRANS_SSE4
%define LOAD_DUP_4x8P LOAD_DUP_4x8P_PENRYN
INIT_XMM sse4
SATDS_SSE2
SA8D
HADAMARD_AC_SSE2
%ifndef HIGH_BIT_DEPTH
INTRA_X9
INTRA8_X9
%endif

INIT_XMM avx
SATDS_SSE2
SA8D
%ifndef HIGH_BIT_DEPTH
INTRA_X9
INTRA8_X9
%endif
HADAMARD_AC_SSE2

%define TRANS TRANS_XOP
INIT_XMM xop
SATDS_SSE2
SA8D
%ifndef HIGH_BIT_DEPTH
INTRA_X9
; no xop INTRA8_X9. it's slower than avx on bulldozer. dunno why.
%endif
HADAMARD_AC_SSE2

;=============================================================================
; SSIM
;=============================================================================

;-----------------------------------------------------------------------------
; void pixel_ssim_4x4x2_core( const uint8_t *pix1, int stride1,
;                             const uint8_t *pix2, int stride2, int sums[2][4] )
;-----------------------------------------------------------------------------
%macro SSIM_ITER 1
%ifdef HIGH_BIT_DEPTH
    movdqu    m5, [r0+(%1&1)*r1]
    movdqu    m6, [r2+(%1&1)*r3]
%else
    movq      m5, [r0+(%1&1)*r1]
    movq      m6, [r2+(%1&1)*r3]
    punpcklbw m5, m0
    punpcklbw m6, m0
%endif
%if %1==1
    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
%endif
%if %1==0
    movdqa    m1, m5
    movdqa    m2, m6
%else
    paddw     m1, m5
    paddw     m2, m6
%endif
    pmaddwd   m7, m5, m6
    pmaddwd   m5, m5
    pmaddwd   m6, m6
%if %1==0
    SWAP       3,  5
    SWAP       4,  7
%else
    paddd     m3, m5
    paddd     m4, m7
%endif
    paddd     m3, m6
%endmacro

%macro SSIM 0
cglobal pixel_ssim_4x4x2_core, 4,4,8
    FIX_STRIDES r1, r3
    pxor      m0, m0
    SSIM_ITER 0
    SSIM_ITER 1
    SSIM_ITER 2
    SSIM_ITER 3
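    ; m1 = sum(pix1), m2 = sum(pix2),
    ; m3 = sum(pix1*pix1 + pix2*pix2), m4 = sum(pix1*pix2)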
    ; PHADDW m1, m2
    ; PHADDD m3, m4
    movdqa    m7, [pw_1]
    pshufd    m5, m3, q2301
    pmaddwd   m1, m7
    pmaddwd   m2, m7
    pshufd    m6, m4, q2301
    packssdw  m1, m2
    paddd     m3, m5
    pshufd    m1, m1, q3120
    paddd     m4, m6
    pmaddwd   m1, m7
    punpckhdq m5, m3, m4
    punpckldq m3, m4

%ifdef UNIX64
    %define t0 r4
%else
    %define t0 rax
    mov t0, r4mp
%endif

    movq      [t0+ 0], m1
    movq      [t0+ 8], m3
    movhps    [t0+16], m1
    movq      [t0+24], m5
    RET

;-----------------------------------------------------------------------------
; float pixel_ssim_end( int sum0[5][4], int sum1[5][4], int width )
;-----------------------------------------------------------------------------
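; a scalar sketch of what each lane computes below (cf. ssim_end1 in
; common/pixel.c), given per-window sums s1 = sum(pix1), s2 = sum(pix2),
; ss = sum(pix1*pix1 + pix2*pix2), s12 = sum(pix1*pix2):
;     int vars  = ss*64 - s1*s1 - s2*s2;
;     int covar = s12*64 - s1*s2;
;     ssim = (float)(2*s1*s2 + ssim_c1) * (float)(2*covar + ssim_c2)
;          / ((float)(s1*s1 + s2*s2 + ssim_c1) * (float)(vars + ssim_c2));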
cglobal pixel_ssim_end4, 3,3,7
    movdqa    m0, [r0+ 0]
    movdqa    m1, [r0+16]
    movdqa    m2, [r0+32]
    movdqa    m3, [r0+48]
    movdqa    m4, [r0+64]
    paddd     m0, [r1+ 0]
    paddd     m1, [r1+16]
    paddd     m2, [r1+32]
    paddd     m3, [r1+48]
    paddd     m4, [r1+64]
    paddd     m0, m1
    paddd     m1, m2
    paddd     m2, m3
    paddd     m3, m4
    movdqa    m5, [ssim_c1]
    movdqa    m6, [ssim_c2]
    TRANSPOSE4x4D  0, 1, 2, 3, 4

;   s1=m0, s2=m1, ss=m2, s12=m3
%if BIT_DEPTH == 10
    cvtdq2ps  m0, m0
    cvtdq2ps  m1, m1
    cvtdq2ps  m2, m2
    cvtdq2ps  m3, m3
    mulps     m2, [pf_64] ; ss*64
    mulps     m3, [pf_128] ; s12*128
    movdqa    m4, m1
    mulps     m4, m0      ; s1*s2
    mulps     m1, m1      ; s2*s2
    mulps     m0, m0      ; s1*s1
    addps     m4, m4      ; s1*s2*2
    addps     m0, m1      ; s1*s1 + s2*s2
    subps     m2, m0      ; vars
    subps     m3, m4      ; covar*2
    addps     m4, m5      ; s1*s2*2 + ssim_c1
    addps     m0, m5      ; s1*s1 + s2*s2 + ssim_c1
    addps     m2, m6      ; vars + ssim_c2
    addps     m3, m6      ; covar*2 + ssim_c2
%else
    pmaddwd   m4, m1, m0  ; s1*s2
    pslld     m1, 16
    por       m0, m1
    pmaddwd   m0, m0  ; s1*s1 + s2*s2
    pslld     m4, 1
    pslld     m3, 7
    pslld     m2, 6
    psubd     m3, m4  ; covar*2
    psubd     m2, m0  ; vars
    paddd     m0, m5
    paddd     m4, m5
    paddd     m3, m6
    paddd     m2, m6
    cvtdq2ps  m0, m0  ; (float)(s1*s1 + s2*s2 + ssim_c1)
    cvtdq2ps  m4, m4  ; (float)(s1*s2*2 + ssim_c1)
    cvtdq2ps  m3, m3  ; (float)(covar*2 + ssim_c2)
    cvtdq2ps  m2, m2  ; (float)(vars + ssim_c2)
%endif
    mulps     m4, m3
    mulps     m0, m2
    divps     m4, m0  ; ssim

    cmp       r2d, 4
    je .skip ; faster only if this is the common case; remove branch if we use ssim on a macroblock level
    neg       r2
%ifdef PIC
    lea       r3, [mask_ff + 16]
    movdqu    m1, [r3 + r2*4]
%else
    movdqu    m1, [mask_ff + r2*4 + 16]
%endif
    pand      m4, m1
.skip:
    movhlps   m0, m4
    addps     m0, m4
    pshuflw   m4, m0, q0032
    addss     m0, m4
%ifndef ARCH_X86_64
    movd     r0m, m0
    fld     dword r0m
%endif
    RET
%endmacro ; SSIM

INIT_XMM sse2
SSIM
INIT_XMM avx
SSIM

;=============================================================================
; Successive Elimination ADS
;=============================================================================

%macro ADS_START 0
%ifdef WIN64
    movsxd  r5,  r5d
%endif
    mov     r0d, r5d
    lea     r6,  [r4+r5+15]
    and     r6,  ~15;
    shl     r2d,  1
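    ; out: r0d = candidates left, r2d = delta scaled to bytes,
    ;      r6 = byte-mask buffer (mvs+width, rounded up to 16-byte alignment)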
%endmacro

%macro ADS_END 1 ; unroll_size
    add     r1, 8*%1
    add     r3, 8*%1
    add     r6, 4*%1
    sub     r0d, 4*%1
    jg .loop
    WIN64_RESTORE_XMM rsp
    jmp ads_mvs
%endmacro

;-----------------------------------------------------------------------------
; int pixel_ads4( int enc_dc[4], uint16_t *sums, int delta,
;                 uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
;-----------------------------------------------------------------------------
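; a scalar sketch of what each position computes (the asm stores one byte
; per candidate into the mask buffer; ads_mvs below compresses the nonzero
; bytes into the mvs list):
;     int ads = abs( enc_dc[0] - sums[i] )
;             + abs( enc_dc[1] - sums[i+8] )
;             + abs( enc_dc[2] - sums[i+delta] )
;             + abs( enc_dc[3] - sums[i+delta+8] )
;             + cost_mvx[i];
;     masks[i] = ads < thresh;  /* any nonzero byte marks a candidate */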
INIT_MMX mmx2
cglobal pixel_ads4, 6,7
    movq    mm6, [r0]
    movq    mm4, [r0+8]
    pshufw  mm7, mm6, 0
    pshufw  mm6, mm6, q2222
    pshufw  mm5, mm4, 0
    pshufw  mm4, mm4, q2222
    ADS_START
.loop:
    movq    mm0, [r1]
    movq    mm1, [r1+16]
    psubw   mm0, mm7
    psubw   mm1, mm6
    ABSW    mm0, mm0, mm2
    ABSW    mm1, mm1, mm3
    movq    mm2, [r1+r2]
    movq    mm3, [r1+r2+16]
    psubw   mm2, mm5
    psubw   mm3, mm4
    paddw   mm0, mm1
    ABSW    mm2, mm2, mm1
    ABSW    mm3, mm3, mm1
    paddw   mm0, mm2
    paddw   mm0, mm3
    pshufw  mm1, r6m, 0
    paddusw mm0, [r3]
    psubusw mm1, mm0
    packsswb mm1, mm1
    movd    [r6], mm1
    ADS_END 1

cglobal pixel_ads2, 6,7
    movq    mm6, [r0]
    pshufw  mm5, r6m, 0
    pshufw  mm7, mm6, 0
    pshufw  mm6, mm6, q2222
    ADS_START
.loop:
    movq    mm0, [r1]
    movq    mm1, [r1+r2]
    psubw   mm0, mm7
    psubw   mm1, mm6
    ABSW    mm0, mm0, mm2
    ABSW    mm1, mm1, mm3
    paddw   mm0, mm1
    paddusw mm0, [r3]
    movq    mm4, mm5
    psubusw mm4, mm0
    packsswb mm4, mm4
    movd    [r6], mm4
    ADS_END 1

cglobal pixel_ads1, 6,7
    pshufw  mm7, [r0], 0
    pshufw  mm6, r6m, 0
    ADS_START
.loop:
    movq    mm0, [r1]
    movq    mm1, [r1+8]
    psubw   mm0, mm7
    psubw   mm1, mm7
    ABSW    mm0, mm0, mm2
    ABSW    mm1, mm1, mm3
    paddusw mm0, [r3]
    paddusw mm1, [r3+8]
    movq    mm4, mm6
    movq    mm5, mm6
    psubusw mm4, mm0
    psubusw mm5, mm1
    packsswb mm4, mm5
    movq    [r6], mm4
    ADS_END 2

%macro ADS_XMM 0
cglobal pixel_ads4, 6,7,12
    movdqa  xmm4, [r0]
    pshuflw xmm7, xmm4, 0
    pshuflw xmm6, xmm4, q2222
    pshufhw xmm5, xmm4, 0
    pshufhw xmm4, xmm4, q2222
    punpcklqdq xmm7, xmm7
    punpcklqdq xmm6, xmm6
    punpckhqdq xmm5, xmm5
    punpckhqdq xmm4, xmm4
%ifdef ARCH_X86_64
    pshuflw xmm8, r6m, 0
    punpcklqdq xmm8, xmm8
    ADS_START
    movdqu  xmm10, [r1]
    movdqu  xmm11, [r1+r2]
.loop:
    psubw   xmm0, xmm10, xmm7
    movdqu xmm10, [r1+16]
    psubw   xmm1, xmm10, xmm6
    ABSW    xmm0, xmm0, xmm2
    ABSW    xmm1, xmm1, xmm3
    psubw   xmm2, xmm11, xmm5
    movdqu xmm11, [r1+r2+16]
    paddw   xmm0, xmm1
    psubw   xmm3, xmm11, xmm4
    movdqu  xmm9, [r3]
    ABSW    xmm2, xmm2, xmm1
    ABSW    xmm3, xmm3, xmm1
    paddw   xmm0, xmm2
    paddw   xmm0, xmm3
    paddusw xmm0, xmm9
    psubusw xmm1, xmm8, xmm0
    packsswb xmm1, xmm1
    movq    [r6], xmm1
%else
    ADS_START
.loop:
    movdqu  xmm0, [r1]
    movdqu  xmm1, [r1+16]
    psubw   xmm0, xmm7
    psubw   xmm1, xmm6
    ABSW    xmm0, xmm0, xmm2
    ABSW    xmm1, xmm1, xmm3
    movdqu  xmm2, [r1+r2]
    movdqu  xmm3, [r1+r2+16]
    psubw   xmm2, xmm5
    psubw   xmm3, xmm4
    paddw   xmm0, xmm1
    ABSW    xmm2, xmm2, xmm1
    ABSW    xmm3, xmm3, xmm1
    paddw   xmm0, xmm2
    paddw   xmm0, xmm3
    movd    xmm1, r6m
    movdqu  xmm2, [r3]
    pshuflw xmm1, xmm1, 0
    punpcklqdq xmm1, xmm1
    paddusw xmm0, xmm2
    psubusw xmm1, xmm0
    packsswb xmm1, xmm1
    movq    [r6], xmm1
%endif ; ARCH
    ADS_END 2

cglobal pixel_ads2, 6,7,8
    movq    xmm6, [r0]
    movd    xmm5, r6m
    pshuflw xmm7, xmm6, 0
    pshuflw xmm6, xmm6, q2222
    pshuflw xmm5, xmm5, 0
    punpcklqdq xmm7, xmm7
    punpcklqdq xmm6, xmm6
    punpcklqdq xmm5, xmm5
    ADS_START
.loop:
    movdqu  xmm0, [r1]
    movdqu  xmm1, [r1+r2]
    psubw   xmm0, xmm7
    psubw   xmm1, xmm6
    movdqu  xmm4, [r3]
    ABSW    xmm0, xmm0, xmm2
    ABSW    xmm1, xmm1, xmm3
    paddw   xmm0, xmm1
    paddusw xmm0, xmm4
    psubusw xmm1, xmm5, xmm0
    packsswb xmm1, xmm1
    movq    [r6], xmm1
    ADS_END 2

cglobal pixel_ads1, 6,7,8
    movd    xmm7, [r0]
    movd    xmm6, r6m
    pshuflw xmm7, xmm7, 0
    pshuflw xmm6, xmm6, 0
    punpcklqdq xmm7, xmm7
    punpcklqdq xmm6, xmm6
    ADS_START
.loop:
    movdqu  xmm0, [r1]
    movdqu  xmm1, [r1+16]
    psubw   xmm0, xmm7
    psubw   xmm1, xmm7
    movdqu  xmm2, [r3]
    movdqu  xmm3, [r3+16]
    ABSW    xmm0, xmm0, xmm4
    ABSW    xmm1, xmm1, xmm5
    paddusw xmm0, xmm2
    paddusw xmm1, xmm3
    psubusw xmm4, xmm6, xmm0
    psubusw xmm5, xmm6, xmm1
    packsswb xmm4, xmm5
    movdqa  [r6], xmm4
    ADS_END 4
%endmacro

INIT_XMM sse2
ADS_XMM
INIT_XMM ssse3
ADS_XMM
INIT_XMM avx
ADS_XMM

; int pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width )
; {
;     int nmv=0, i, j;
;     *(uint32_t*)(masks+width) = 0;
;     for( i=0; i<width; i+=8 )
;     {
;         uint64_t mask = *(uint64_t*)(masks+i);
;         if( !mask ) continue;
;         for( j=0; j<8; j++ )
;             if( mask & (255<<j*8) )
;                 mvs[nmv++] = i+j;
;     }
;     return nmv;
; }

%macro TEST 1
    mov     [r4+r0*2], r1w
    test    r2d, 0xff<<(%1*8)
    setne   r3b
    add     r0d, r3d
    inc     r1d
%endmacro

INIT_MMX
cglobal pixel_ads_mvs, 0,7,0
ads_mvs:
    lea     r6,  [r4+r5+15]
    and     r6,  ~15;
    ; mvs = r4
    ; masks = r6
    ; width = r5
    ; clear last block in case width isn't divisible by 8. (assume divisible by 4, so clearing 4 bytes is enough.)
    xor     r0d, r0d
    xor     r1d, r1d
    mov     [r6+r5], r0d
    jmp .loopi
ALIGN 16
.loopi0:
    add     r1d, 8
    cmp     r1d, r5d
    jge .end
.loopi:
    mov     r2,  [r6+r1]
%ifdef ARCH_X86_64
    test    r2,  r2
%else
    mov     r3,  r2
    add    r3d, [r6+r1+4]
%endif
    jz .loopi0
    xor     r3d, r3d
    TEST 0
    TEST 1
    TEST 2
    TEST 3
%ifdef ARCH_X86_64
    shr     r2,  32
%else
    mov     r2d, [r6+r1]
%endif
    TEST 0
    TEST 1
    TEST 2
    TEST 3
    cmp     r1d, r5d
    jl .loopi
.end:
    movifnidn eax, r0d
    RET
x264-snapshot-20120103-2245-stable/common/x86/pixel-32.asm
;*****************************************************************************
;* pixel-32.asm: x86_32 pixel metrics
;*****************************************************************************
;* Copyright (C) 2003-2011 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Laurent Aimar <fenrir@via.ecp.fr>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

cextern pw_ppmmppmm
cextern pw_pmpmpmpm

SECTION .text
INIT_MMX mmx2

%macro LOAD_DIFF_4x8P 1 ; dx
    LOAD_DIFF  m0, m7, none, [r0+%1],      [r2+%1]
    LOAD_DIFF  m1, m6, none, [r0+%1+r1],   [r2+%1+r3]
    LOAD_DIFF  m2, m7, none, [r0+%1+r1*2], [r2+%1+r3*2]
    LOAD_DIFF  m3, m6, none, [r0+%1+r4],   [r2+%1+r5]
    lea  r0, [r0+4*r1]
    lea  r2, [r2+4*r3]
    LOAD_DIFF  m4, m7, none, [r0+%1],      [r2+%1]
    LOAD_DIFF  m5, m6, none, [r0+%1+r1],   [r2+%1+r3]
    LOAD_DIFF  m6, m7, none, [r0+%1+r1*2], [r2+%1+r3*2]
    movq [spill], m5
    LOAD_DIFF  m7, m5, none, [r0+%1+r4],   [r2+%1+r5]
    movq m5, [spill]
%endmacro

%macro SUM4x8_MM 0
    movq [spill],   m6
    movq [spill+8], m7
    ABSW2    m0, m1, m0, m1, m6, m7
    ABSW2    m2, m3, m2, m3, m6, m7
    paddw    m0, m2
    paddw    m1, m3
    movq     m6, [spill]
    movq     m7, [spill+8]
    ABSW2    m4, m5, m4, m5, m2, m3
    ABSW2    m6, m7, m6, m7, m2, m3
    paddw    m4, m6
    paddw    m5, m7
    paddw    m0, m4
    paddw    m1, m5
    paddw    m0, m1
%endmacro

;-----------------------------------------------------------------------------
; int pixel_sa8d_8x8( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal pixel_sa8d_8x8_internal
    push   r0
    push   r2
    sub    esp, 0x74
%define args  esp+0x74
%define spill esp+0x60 ; +16
%define trans esp+0    ; +96
    LOAD_DIFF_4x8P 0
    HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7

    movq   [spill], m1
    TRANSPOSE4x4W 4, 5, 6, 7, 1
    movq   [trans+0x00], m4
    movq   [trans+0x08], m5
    movq   [trans+0x10], m6
    movq   [trans+0x18], m7
    movq   m1, [spill]
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    movq   [trans+0x20], m0
    movq   [trans+0x28], m1
    movq   [trans+0x30], m2
    movq   [trans+0x38], m3

    mov    r0, [args+4]
    mov    r2, [args]
    LOAD_DIFF_4x8P 4
    HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7

    movq   [spill], m7
    TRANSPOSE4x4W 0, 1, 2, 3, 7
    movq   [trans+0x40], m0
    movq   [trans+0x48], m1
    movq   [trans+0x50], m2
    movq   [trans+0x58], m3
    movq   m7, [spill]
    TRANSPOSE4x4W 4, 5, 6, 7, 1
    movq   m0, [trans+0x00]
    movq   m1, [trans+0x08]
    movq   m2, [trans+0x10]
    movq   m3, [trans+0x18]

    HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
    SUM4x8_MM
    movq   [trans], m0

    movq   m0, [trans+0x20]
    movq   m1, [trans+0x28]
    movq   m2, [trans+0x30]
    movq   m3, [trans+0x38]
    movq   m4, [trans+0x40]
    movq   m5, [trans+0x48]
    movq   m6, [trans+0x50]
    movq   m7, [trans+0x58]

    HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
    SUM4x8_MM

    pavgw  m0, [trans]
    add   esp, 0x7c
    ret
%undef args
%undef spill
%undef trans

%macro SUM_MM_X3 8 ; 3x sum, 4x tmp, op
    pxor        %7, %7
    pshufw      %4, %1, q1032
    pshufw      %5, %2, q1032
    pshufw      %6, %3, q1032
    paddusw     %1, %4
    paddusw     %2, %5
    paddusw     %3, %6
    punpcklwd   %1, %7
    punpcklwd   %2, %7
    punpcklwd   %3, %7
    pshufw      %4, %1, q1032
    pshufw      %5, %2, q1032
    pshufw      %6, %3, q1032
    %8          %1, %4
    %8          %2, %5
    %8          %3, %6
%endmacro

%macro LOAD_4x8P 1 ; dx
    pxor        m7, m7
    movd        m6, [r0+%1+7*FENC_STRIDE]
    movd        m0, [r0+%1+0*FENC_STRIDE]
    movd        m1, [r0+%1+1*FENC_STRIDE]
    movd        m2, [r0+%1+2*FENC_STRIDE]
    movd        m3, [r0+%1+3*FENC_STRIDE]
    movd        m4, [r0+%1+4*FENC_STRIDE]
    movd        m5, [r0+%1+5*FENC_STRIDE]
    punpcklbw   m6, m7
    punpcklbw   m0, m7
    punpcklbw   m1, m7
    movq   [spill], m6
    punpcklbw   m2, m7
    punpcklbw   m3, m7
    movd        m6, [r0+%1+6*FENC_STRIDE]
    punpcklbw   m4, m7
    punpcklbw   m5, m7
    punpcklbw   m6, m7
    movq        m7, [spill]
%endmacro

%macro HSUMSUB2 4
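; one butterfly stage of a horizontal hadamard: combine each element with
; its %3-shuffled partner under the +/-1 pattern in %4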
    pshufw m4, %1, %3
    pshufw m5, %2, %3
    pmullw %1, %4
    pmullw m5, %4
    paddw  %1, m4
    paddw  %2, m5
%endmacro

;-----------------------------------------------------------------------------
; void intra_sa8d_x3_8x8( uint8_t *fenc, uint8_t edge[36], int *res )
;-----------------------------------------------------------------------------
cglobal intra_sa8d_x3_8x8, 2,3
    SUB    esp, 0x94
%define edge  esp+0x70 ; +32
%define spill esp+0x60 ; +16
%define trans esp+0    ; +96
%define sum   esp+0    ; +32

    pxor      m7, m7
    movq      m0, [r1+7]
    movq      m2, [r1+16]
    movq      m1, m0
    movq      m3, m2
    punpcklbw m0, m7
    punpckhbw m1, m7
    punpcklbw m2, m7
    punpckhbw m3, m7
    movq      m6, [pw_ppmmppmm]
    HSUMSUB2  m0, m2, q1032, m6
    HSUMSUB2  m1, m3, q1032, m6
    movq      m6, [pw_pmpmpmpm]
    HSUMSUB2  m0, m2, q2301, m6
    HSUMSUB2  m1, m3, q2301, m6
    movq      m4, m0
    movq      m5, m2
    paddw     m0, m1
    paddw     m2, m3
    psubw     m4, m1
    psubw     m3, m5
    movq [edge+0], m0
    movq [edge+8], m4
    movq [edge+16], m2
    movq [edge+24], m3

    LOAD_4x8P 0
    HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7

    movq   [spill], m0
    TRANSPOSE4x4W 4, 5, 6, 7, 0
    movq   [trans+0x00], m4
    movq   [trans+0x08], m5
    movq   [trans+0x10], m6
    movq   [trans+0x18], m7
    movq   m0, [spill]
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    movq   [trans+0x20], m0
    movq   [trans+0x28], m1
    movq   [trans+0x30], m2
    movq   [trans+0x38], m3

    LOAD_4x8P 4
    HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7

    movq   [spill], m7
    TRANSPOSE4x4W 0, 1, 2, 3, 7
    movq   [trans+0x40], m0
    movq   [trans+0x48], m1
    movq   [trans+0x50], m2
    movq   [trans+0x58], m3
    movq   m7, [spill]
    TRANSPOSE4x4W 4, 5, 6, 7, 0
    movq   m0, [trans+0x00]
    movq   m1, [trans+0x08]
    movq   m2, [trans+0x10]
    movq   m3, [trans+0x18]

    HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7

    movq [spill+0], m0
    movq [spill+8], m1
    ABSW2    m2, m3, m2, m3, m0, m1
    ABSW2    m4, m5, m4, m5, m0, m1
    paddw    m2, m4
    paddw    m3, m5
    ABSW2    m6, m7, m6, m7, m4, m5
    movq     m0, [spill+0]
    movq     m1, [spill+8]
    paddw    m2, m6
    paddw    m3, m7
    paddw    m2, m3
    ABSW     m1, m1, m4
    paddw    m2, m1 ; 7x4 sum
    movq     m7, m0
    movq     m1, [edge+8] ; left bottom
    psllw    m1, 3
    psubw    m7, m1
    ABSW2    m0, m7, m0, m7, m5, m3
    paddw    m0, m2
    paddw    m7, m2
    movq [sum+0], m0 ; dc
    movq [sum+8], m7 ; left

    movq   m0, [trans+0x20]
    movq   m1, [trans+0x28]
    movq   m2, [trans+0x30]
    movq   m3, [trans+0x38]
    movq   m4, [trans+0x40]
    movq   m5, [trans+0x48]
    movq   m6, [trans+0x50]
    movq   m7, [trans+0x58]

    HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7

    movd   [sum+0x10], m0
    movd   [sum+0x12], m1
    movd   [sum+0x14], m2
    movd   [sum+0x16], m3
    movd   [sum+0x18], m4
    movd   [sum+0x1a], m5
    movd   [sum+0x1c], m6
    movd   [sum+0x1e], m7

    movq [spill],   m0
    movq [spill+8], m1
    ABSW2    m2, m3, m2, m3, m0, m1
    ABSW2    m4, m5, m4, m5, m0, m1
    paddw    m2, m4
    paddw    m3, m5
    paddw    m2, m3
    movq     m0, [spill]
    movq     m1, [spill+8]
    ABSW2    m6, m7, m6, m7, m4, m5
    ABSW     m1, m1, m3
    paddw    m2, m7
    paddw    m1, m6
    paddw    m2, m1 ; 7x4 sum
    movq     m1, m0

    movq     m7, [edge+0]
    psllw    m7, 3   ; left top

    mov      r2, [edge+0]
    add      r2, [edge+16]
    lea      r2, [4*r2+32]
    and      r2, 0xffc0
    movd     m6, r2 ; dc

    psubw    m1, m7
    psubw    m0, m6
    ABSW2    m0, m1, m0, m1, m5, m6
    movq     m3, [sum+0] ; dc
    paddw    m0, m2
    paddw    m1, m2
    movq     m2, m0
    paddw    m0, m3
    paddw    m1, [sum+8] ; h
    psrlq    m2, 16
    paddw    m2, m3

    movq     m3, [edge+16] ; top left
    movq     m4, [edge+24] ; top right
    psllw    m3, 3
    psllw    m4, 3
    psubw    m3, [sum+16]
    psubw    m4, [sum+24]
    ABSW2    m3, m4, m3, m4, m5, m6
    paddw    m2, m3
    paddw    m2, m4 ; v

    SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, pavgw
    mov      r2, r2m
    pxor      m7, m7
    punpckldq m2, m1
    pavgw     m0, m7
    pavgw     m2, m7
    movd  [r2+8], m0 ; dc
    movq  [r2+0], m2 ; v, h
    ADD     esp, 0x94
    RET
%undef edge
%undef spill
%undef trans
%undef sum



;-----------------------------------------------------------------------------
; void pixel_ssim_4x4x2_core( const uint8_t *pix1, int stride1,
;                             const uint8_t *pix2, int stride2, int sums[2][4] )
;-----------------------------------------------------------------------------
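; runs both 4x4 blocks through the same loop: r4 = 4 then 0 selects the
; pixel column offset and the int[4] slot in sums[2][4]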
cglobal pixel_ssim_4x4x2_core, 0,5
    mov       r1, r1m
    mov       r3, r3m
    mov       r4, 4
    pxor      m0, m0
.loop:
    mov       r0, r0m
    mov       r2, r2m
    add       r0, r4
    add       r2, r4
    pxor      m1, m1
    pxor      m2, m2
    pxor      m3, m3
    pxor      m4, m4
%rep 4
    movd      m5, [r0]
    movd      m6, [r2]
    punpcklbw m5, m0
    punpcklbw m6, m0
    paddw     m1, m5
    paddw     m2, m6
    movq      m7, m5
    pmaddwd   m5, m5
    pmaddwd   m7, m6
    pmaddwd   m6, m6
    paddd     m3, m5
    paddd     m4, m7
    paddd     m3, m6
    add       r0, r1
    add       r2, r3
%endrep
    mov       r0, r4m
    lea       r0, [r0+r4*4]
    pshufw    m5, m1, q0032
    pshufw    m6, m2, q0032
    paddusw   m1, m5
    paddusw   m2, m6
    punpcklwd m1, m2
    pshufw    m2, m1, q0032
    pshufw    m5, m3, q0032
    pshufw    m6, m4, q0032
    paddusw   m1, m2
    paddd     m3, m5
    paddd     m4, m6
    punpcklwd m1, m0
    punpckldq m3, m4
    movq  [r0+0], m1
    movq  [r0+8], m3
    sub       r4, 4
    jge .loop
    emms
    RET

x264-snapshot-20120103-2245-stable/common/x86/mc.h
/*****************************************************************************
 * mc.h: x86 motion compensation
 *****************************************************************************
 * Copyright (C) 2003-2011 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *          Laurent Aimar <fenrir@via.ecp.fr>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#ifndef X264_I386_MC_H
#define X264_I386_MC_H

void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf );

#endif
x264-snapshot-20120103-2245-stable/common/x86/mc-c.c
/*****************************************************************************
 * mc-c.c: x86 motion compensation
 *****************************************************************************
 * Copyright (C) 2003-2011 x264 project
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
 *          Jason Garrett-Glaser <darkshikari@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include <stdlib.h>
#include <stdio.h>
#include <string.h>

#include "common/common.h"
#include "mc.h"

#define DECL_SUF( func, args )\
    void func##_mmx2 args;\
    void func##_sse2 args;\
    void func##_ssse3 args;

DECL_SUF( x264_pixel_avg_16x16, ( pixel *, int, pixel *, int, pixel *, int, int ))
DECL_SUF( x264_pixel_avg_16x8,  ( pixel *, int, pixel *, int, pixel *, int, int ))
DECL_SUF( x264_pixel_avg_8x16,  ( pixel *, int, pixel *, int, pixel *, int, int ))
DECL_SUF( x264_pixel_avg_8x8,   ( pixel *, int, pixel *, int, pixel *, int, int ))
DECL_SUF( x264_pixel_avg_8x4,   ( pixel *, int, pixel *, int, pixel *, int, int ))
DECL_SUF( x264_pixel_avg_4x16,  ( pixel *, int, pixel *, int, pixel *, int, int ))
DECL_SUF( x264_pixel_avg_4x8,   ( pixel *, int, pixel *, int, pixel *, int, int ))
DECL_SUF( x264_pixel_avg_4x4,   ( pixel *, int, pixel *, int, pixel *, int, int ))
DECL_SUF( x264_pixel_avg_4x2,   ( pixel *, int, pixel *, int, pixel *, int, int ))

#define MC_WEIGHT(w,type) \
    void x264_mc_weight_w##w##_##type( pixel *,int, pixel *,int, const x264_weight_t *,int );

#define MC_WEIGHT_OFFSET(w,type) \
    void x264_mc_offsetadd_w##w##_##type( pixel *,int, pixel *,int, const x264_weight_t *,int ); \
    void x264_mc_offsetsub_w##w##_##type( pixel *,int, pixel *,int, const x264_weight_t *,int ); \
    MC_WEIGHT(w,type)

MC_WEIGHT_OFFSET( 4, mmx2 )
MC_WEIGHT_OFFSET( 8, mmx2 )
MC_WEIGHT_OFFSET( 12, mmx2 )
MC_WEIGHT_OFFSET( 16, mmx2 )
MC_WEIGHT_OFFSET( 20, mmx2 )
MC_WEIGHT_OFFSET( 12, sse2 )
MC_WEIGHT_OFFSET( 16, sse2 )
MC_WEIGHT_OFFSET( 20, sse2 )
#if HIGH_BIT_DEPTH
MC_WEIGHT_OFFSET( 8, sse2 )
#endif
MC_WEIGHT( 8, sse2  )
MC_WEIGHT( 4, ssse3 )
MC_WEIGHT( 8, ssse3 )
MC_WEIGHT( 12, ssse3 )
MC_WEIGHT( 16, ssse3 )
MC_WEIGHT( 20, ssse3 )
MC_WEIGHT( 4, avx )
MC_WEIGHT( 8, avx )
MC_WEIGHT( 12, avx )
MC_WEIGHT( 16, avx )
MC_WEIGHT( 20, avx )
#undef MC_WEIGHT_OFFSET
#undef MC_WEIGHT

void x264_mc_copy_w4_mmx( pixel *, int, pixel *, int, int );
void x264_mc_copy_w8_mmx( pixel *, int, pixel *, int, int );
void x264_mc_copy_w8_sse2( pixel *, int, pixel *, int, int );
void x264_mc_copy_w16_mmx( pixel *, int, pixel *, int, int );
void x264_mc_copy_w16_sse2( pixel *, int, pixel *, int, int );
void x264_mc_copy_w16_aligned_sse2( pixel *, int, pixel *, int, int );
void x264_prefetch_fenc_420_mmx2( pixel *, int, pixel *, int, int );
void x264_prefetch_fenc_422_mmx2( pixel *, int, pixel *, int, int );
void x264_prefetch_ref_mmx2( pixel *, int, int );
void x264_plane_copy_core_mmx2( pixel *, int, pixel *, int, int w, int h );
void x264_plane_copy_c( pixel *, int, pixel *, int, int w, int h );
void x264_plane_copy_interleave_core_mmx2( pixel *dst, int i_dst,
                                           pixel *srcu, int i_srcu,
                                           pixel *srcv, int i_srcv, int w, int h );
void x264_plane_copy_interleave_core_sse2( pixel *dst, int i_dst,
                                           pixel *srcu, int i_srcu,
                                           pixel *srcv, int i_srcv, int w, int h );
void x264_plane_copy_interleave_core_avx( pixel *dst, int i_dst,
                                          pixel *srcu, int i_srcu,
                                          pixel *srcv, int i_srcv, int w, int h );
void x264_plane_copy_interleave_c( pixel *dst, int i_dst,
                                   pixel *srcu, int i_srcu,
                                   pixel *srcv, int i_srcv, int w, int h );
void x264_plane_copy_deinterleave_mmx( pixel *dstu, int i_dstu,
                                       pixel *dstv, int i_dstv,
                                       pixel *src, int i_src, int w, int h );
void x264_plane_copy_deinterleave_sse2( pixel *dstu, int i_dstu,
                                        pixel *dstv, int i_dstv,
                                        pixel *src, int i_src, int w, int h );
void x264_plane_copy_deinterleave_ssse3( uint8_t *dstu, int i_dstu,
                                         uint8_t *dstv, int i_dstv,
                                         uint8_t *src, int i_src, int w, int h );
void x264_plane_copy_deinterleave_avx( uint16_t *dstu, int i_dstu,
                                       uint16_t *dstv, int i_dstv,
                                       uint16_t *src, int i_src, int w, int h );
void x264_store_interleave_chroma_mmx2( pixel *dst, int i_dst, pixel *srcu, pixel *srcv, int height );
void x264_store_interleave_chroma_sse2( pixel *dst, int i_dst, pixel *srcu, pixel *srcv, int height );
void x264_store_interleave_chroma_avx( pixel *dst, int i_dst, pixel *srcu, pixel *srcv, int height );
void x264_load_deinterleave_chroma_fenc_mmx( pixel *dst, pixel *src, int i_src, int height );
void x264_load_deinterleave_chroma_fenc_sse2( pixel *dst, pixel *src, int i_src, int height );
void x264_load_deinterleave_chroma_fenc_ssse3( uint8_t *dst, uint8_t *src, int i_src, int height );
void x264_load_deinterleave_chroma_fenc_avx( uint16_t *dst, uint16_t *src, int i_src, int height );
void x264_load_deinterleave_chroma_fdec_mmx( pixel *dst, pixel *src, int i_src, int height );
void x264_load_deinterleave_chroma_fdec_sse2( pixel *dst, pixel *src, int i_src, int height );
void x264_load_deinterleave_chroma_fdec_ssse3( uint8_t *dst, uint8_t *src, int i_src, int height );
void x264_load_deinterleave_chroma_fdec_avx( uint16_t *dst, uint16_t *src, int i_src, int height );
void *x264_memcpy_aligned_mmx( void * dst, const void * src, size_t n );
void *x264_memcpy_aligned_sse2( void * dst, const void * src, size_t n );
void x264_memzero_aligned_mmx( void * dst, int n );
void x264_memzero_aligned_sse2( void * dst, int n );
void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, int stride );
void x264_integral_init8h_sse4( uint16_t *sum, uint8_t *pix, int stride );
void x264_integral_init8h_avx ( uint16_t *sum, uint8_t *pix, int stride );
void x264_integral_init4v_mmx( uint16_t *sum8, uint16_t *sum4, int stride );
void x264_integral_init4v_sse2( uint16_t *sum8, uint16_t *sum4, int stride );
void x264_integral_init8v_mmx( uint16_t *sum8, int stride );
void x264_integral_init8v_sse2( uint16_t *sum8, int stride );
void x264_integral_init4v_ssse3( uint16_t *sum8, uint16_t *sum4, int stride );
void x264_mbtree_propagate_cost_sse2( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
                                      uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
void x264_mbtree_propagate_cost_avx( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
                                     uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
void x264_mbtree_propagate_cost_fma4( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
                                      uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );

#define MC_CHROMA(cpu)\
void x264_mc_chroma_##cpu( pixel *dstu, pixel *dstv, int i_dst,\
                           pixel *src, int i_src,\
                           int dx, int dy, int i_width, int i_height );
MC_CHROMA(mmx2)
MC_CHROMA(sse2)
MC_CHROMA(sse2_misalign)
MC_CHROMA(ssse3)
MC_CHROMA(ssse3_cache64)
MC_CHROMA(avx)
MC_CHROMA(avx_cache64)

#define LOWRES(cpu)\
void x264_frame_init_lowres_core_##cpu( pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv, pixel *dstc,\
                                        int src_stride, int dst_stride, int width, int height );
LOWRES(mmx2)
LOWRES(cache32_mmx2)
LOWRES(sse2)
LOWRES(ssse3)

#define PIXEL_AVG_W(width,cpu)\
void x264_pixel_avg2_w##width##_##cpu( pixel *, int, pixel *, int, pixel *, int );
/* This declares some functions that don't exist, but that isn't a problem. */
#define PIXEL_AVG_WALL(cpu)\
PIXEL_AVG_W(4,cpu); PIXEL_AVG_W(8,cpu); PIXEL_AVG_W(10,cpu); PIXEL_AVG_W(12,cpu); PIXEL_AVG_W(16,cpu); PIXEL_AVG_W(18,cpu); PIXEL_AVG_W(20,cpu);

PIXEL_AVG_WALL(mmx2)
PIXEL_AVG_WALL(cache32_mmx2)
PIXEL_AVG_WALL(cache64_mmx2)
PIXEL_AVG_WALL(cache64_sse2)
PIXEL_AVG_WALL(sse2)
PIXEL_AVG_WALL(sse2_misalign)
PIXEL_AVG_WALL(cache64_ssse3)

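/* The wtabs below are indexed by width>>2: widths 4..20 map to entries 1..5,
 * and entry 0 is unused. */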
#define PIXEL_AVG_WTAB(instr, name1, name2, name3, name4, name5)\
static void (* const x264_pixel_avg_wtab_##instr[6])( pixel *, int, pixel *, int, pixel *, int ) =\
{\
    NULL,\
    x264_pixel_avg2_w4_##name1,\
    x264_pixel_avg2_w8_##name2,\
    x264_pixel_avg2_w12_##name3,\
    x264_pixel_avg2_w16_##name4,\
    x264_pixel_avg2_w20_##name5,\
};

#if HIGH_BIT_DEPTH
/* we can replace w12/w20 with w10/w18, since only 9/17 pixels are actually needed */
#define x264_pixel_avg2_w12_mmx2       x264_pixel_avg2_w10_mmx2
#define x264_pixel_avg2_w20_mmx2       x264_pixel_avg2_w18_mmx2
#define x264_pixel_avg2_w12_sse2       x264_pixel_avg2_w10_sse2
#define x264_pixel_avg2_w20_sse2       x264_pixel_avg2_w18_sse2
#else
/* w16 sse2 is faster than w12 mmx2 as long as the cacheline issue is resolved */
#define x264_pixel_avg2_w12_cache64_ssse3 x264_pixel_avg2_w16_cache64_ssse3
#define x264_pixel_avg2_w12_cache64_sse2 x264_pixel_avg2_w16_cache64_sse2
#define x264_pixel_avg2_w12_sse3         x264_pixel_avg2_w16_sse3
#define x264_pixel_avg2_w12_sse2         x264_pixel_avg2_w16_sse2
#endif // HIGH_BIT_DEPTH

PIXEL_AVG_WTAB(mmx2, mmx2, mmx2, mmx2, mmx2, mmx2)
#if HIGH_BIT_DEPTH
PIXEL_AVG_WTAB(sse2, mmx2, sse2, sse2, sse2, sse2)
#else // !HIGH_BIT_DEPTH
#if ARCH_X86
PIXEL_AVG_WTAB(cache32_mmx2, mmx2, cache32_mmx2, cache32_mmx2, cache32_mmx2, cache32_mmx2)
PIXEL_AVG_WTAB(cache64_mmx2, mmx2, cache64_mmx2, cache64_mmx2, cache64_mmx2, cache64_mmx2)
#endif
PIXEL_AVG_WTAB(sse2, mmx2, mmx2, sse2, sse2, sse2)
PIXEL_AVG_WTAB(sse2_misalign, mmx2, mmx2, sse2, sse2, sse2_misalign)
PIXEL_AVG_WTAB(cache64_sse2, mmx2, cache64_mmx2, cache64_sse2, cache64_sse2, cache64_sse2)
PIXEL_AVG_WTAB(cache64_ssse3, mmx2, cache64_mmx2, cache64_ssse3, cache64_ssse3, cache64_sse2)
#endif // HIGH_BIT_DEPTH

#define MC_COPY_WTAB(instr, name1, name2, name3)\
static void (* const x264_mc_copy_wtab_##instr[5])( pixel *, int, pixel *, int, int ) =\
{\
    NULL,\
    x264_mc_copy_w4_##name1,\
    x264_mc_copy_w8_##name2,\
    NULL,\
    x264_mc_copy_w16_##name3,\
};

MC_COPY_WTAB(mmx,mmx,mmx,mmx)
#if HIGH_BIT_DEPTH
MC_COPY_WTAB(sse2,mmx,sse2,sse2)
#else
MC_COPY_WTAB(sse2,mmx,mmx,sse2)
#endif

#define MC_WEIGHT_WTAB(function, instr, name1, name2, w12version)\
    static void (* x264_mc_##function##_wtab_##instr[6])( pixel *, int, pixel *, int, const x264_weight_t *, int ) =\
{\
    x264_mc_##function##_w4_##name1,\
    x264_mc_##function##_w4_##name1,\
    x264_mc_##function##_w8_##name2,\
    x264_mc_##function##_w##w12version##_##instr,\
    x264_mc_##function##_w16_##instr,\
    x264_mc_##function##_w20_##instr,\
};

#if HIGH_BIT_DEPTH
MC_WEIGHT_WTAB(weight,mmx2,mmx2,mmx2,12)
MC_WEIGHT_WTAB(offsetadd,mmx2,mmx2,mmx2,12)
MC_WEIGHT_WTAB(offsetsub,mmx2,mmx2,mmx2,12)
MC_WEIGHT_WTAB(weight,sse2,mmx2,sse2,12)
MC_WEIGHT_WTAB(offsetadd,sse2,mmx2,sse2,16)
MC_WEIGHT_WTAB(offsetsub,sse2,mmx2,sse2,16)

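/* H.264 explicit weighted prediction: dst = ((src*scale + (1<<(denom-1))) >> denom) + offset.
 * If scale == 1<<denom the multiply is a no-op and only the offset is applied
 * (offsetadd/offsetsub); otherwise the scale/round/offset constants are
 * precomputed into cachea/cacheb in the layout the asm kernels expect. */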
static void x264_weight_cache_mmx2( x264_t *h, x264_weight_t *w )
{
    if( w->i_scale == 1<<w->i_denom )
    {
        if( w->i_offset < 0 )
            w->weightfn = h->mc.offsetsub;
        else
            w->weightfn = h->mc.offsetadd;
        for( int i = 0; i < 8; i++ )
            w->cachea[i] = abs(w->i_offset<<(BIT_DEPTH-8));
        return;
    }
    w->weightfn = h->mc.weight;
    int den1 = 1<<w->i_denom;
    int den2 = w->i_scale<<1;
    int den3 = 1+(w->i_offset<<(BIT_DEPTH-8+1));
    for( int i = 0; i < 8; i++ )
    {
        w->cachea[i] = den1;
        w->cacheb[i] = i&1 ? den3 : den2;
    }
}
#else
MC_WEIGHT_WTAB(weight,mmx2,mmx2,mmx2,12)
MC_WEIGHT_WTAB(offsetadd,mmx2,mmx2,mmx2,12)
MC_WEIGHT_WTAB(offsetsub,mmx2,mmx2,mmx2,12)
MC_WEIGHT_WTAB(weight,sse2,mmx2,sse2,16)
MC_WEIGHT_WTAB(offsetadd,sse2,mmx2,mmx2,16)
MC_WEIGHT_WTAB(offsetsub,sse2,mmx2,mmx2,16)
MC_WEIGHT_WTAB(weight,ssse3,ssse3,ssse3,16)

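/* 8-bit variant: since offset<<denom has zero low bits, OR-ing it with the
 * rounding term packs both into a single constant, so the kernel reduces to
 * multiply by scale, add cacheb, arithmetic shift right by denom. */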
static void x264_weight_cache_mmx2( x264_t *h, x264_weight_t *w )
{
    int i;
    int16_t den1;

    if( w->i_scale == 1<<w->i_denom )
    {
        if( w->i_offset < 0 )
            w->weightfn = h->mc.offsetsub;
        else
            w->weightfn = h->mc.offsetadd;
        memset( w->cachea, abs(w->i_offset), sizeof(w->cachea) );
        return;
    }
    w->weightfn = h->mc.weight;
    den1 = 1 << (w->i_denom - 1) | w->i_offset << w->i_denom;
    for( i = 0; i < 8; i++ )
    {
        w->cachea[i] = w->i_scale;
        w->cacheb[i] = den1;
    }
}

static void x264_weight_cache_ssse3( x264_t *h, x264_weight_t *w )
{
    int i, den1;
    if( w->i_scale == 1<<w->i_denom )
    {
        if( w->i_offset < 0 )
            w->weightfn = h->mc.offsetsub;
        else
            w->weightfn = h->mc.offsetadd;

        memset( w->cachea, abs( w->i_offset ), sizeof(w->cachea) );
        return;
    }
    w->weightfn = h->mc.weight;
    den1 = w->i_scale << (8 - w->i_denom);
    for( i = 0; i < 8; i++ )
    {
        w->cachea[i] = den1;
        w->cacheb[i] = w->i_offset;
    }
}
#endif // !HIGH_BIT_DEPTH

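/* src[] holds the four luma planes: 0 = fullpel, 1 = horizontal halfpel,
 * 2 = vertical halfpel, 3 = center halfpel.  qpel_idx packs the fractional mv
 * as ((mvy&3)<<2)+(mvx&3); hpel_ref0/hpel_ref1 name the one or two planes used
 * for each of the 16 positions.  qpel_idx & 5 tests whether either component
 * has an odd (quarter-pel) fraction, in which case the two planes are averaged:
 * e.g. mv fraction (1,0) gives qpel_idx 1, averaging src[1] and src[0]. */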
static const uint8_t hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
static const uint8_t hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};

#define MC_LUMA(name,instr1,instr2)\
static void mc_luma_##name( pixel *dst,    int i_dst_stride,\
                  pixel *src[4], int i_src_stride,\
                  int mvx, int mvy,\
                  int i_width, int i_height, const x264_weight_t *weight )\
{\
    int qpel_idx = ((mvy&3)<<2) + (mvx&3);\
    int offset = (mvy>>2)*i_src_stride + (mvx>>2);\
    pixel *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;\
    if( qpel_idx & 5 ) /* qpel interpolation needed */\
    {\
        pixel *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);\
        x264_pixel_avg_wtab_##instr1[i_width>>2](\
                dst, i_dst_stride, src1, i_src_stride,\
                src2, i_height );\
        if( weight->weightfn )\
            weight->weightfn[i_width>>2]( dst, i_dst_stride, dst, i_dst_stride, weight, i_height );\
    }\
    else if( weight->weightfn )\
        weight->weightfn[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, weight, i_height );\
    else\
        x264_mc_copy_wtab_##instr2[i_width>>2](dst, i_dst_stride, src1, i_src_stride, i_height );\
}

MC_LUMA(mmx2,mmx2,mmx)
MC_LUMA(sse2,sse2,sse2)
#if !HIGH_BIT_DEPTH
#if ARCH_X86
MC_LUMA(cache32_mmx2,cache32_mmx2,mmx)
MC_LUMA(cache64_mmx2,cache64_mmx2,mmx)
#endif
MC_LUMA(cache64_sse2,cache64_sse2,sse2)
MC_LUMA(cache64_ssse3,cache64_ssse3,sse2)
#endif // !HIGH_BIT_DEPTH

#define GET_REF(name)\
static pixel *get_ref_##name( pixel *dst,   int *i_dst_stride,\
                         pixel *src[4], int i_src_stride,\
                         int mvx, int mvy,\
                         int i_width, int i_height, const x264_weight_t *weight )\
{\
    int qpel_idx = ((mvy&3)<<2) + (mvx&3);\
    int offset = (mvy>>2)*i_src_stride + (mvx>>2);\
    pixel *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;\
    if( qpel_idx & 5 ) /* qpel interpolation needed */\
    {\
        pixel *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);\
        x264_pixel_avg_wtab_##name[i_width>>2](\
                dst, *i_dst_stride, src1, i_src_stride,\
                src2, i_height );\
        if( weight->weightfn )\
            weight->weightfn[i_width>>2]( dst, *i_dst_stride, dst, *i_dst_stride, weight, i_height );\
        return dst;\
    }\
    else if( weight->weightfn )\
    {\
        weight->weightfn[i_width>>2]( dst, *i_dst_stride, src1, i_src_stride, weight, i_height );\
        return dst;\
    }\
    else\
    {\
        *i_dst_stride = i_src_stride;\
        return src1;\
    }\
}

GET_REF(mmx2)
GET_REF(sse2)
#if !HIGH_BIT_DEPTH
#if ARCH_X86
GET_REF(cache32_mmx2)
GET_REF(cache64_mmx2)
#endif
GET_REF(sse2_misalign)
GET_REF(cache64_sse2)
GET_REF(cache64_ssse3)
#endif // !HIGH_BIT_DEPTH

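/* Round src down to an 'align'-byte boundary and widen the row accordingly:
 * a few leading pixels are filtered redundantly, but every load and store in
 * the row loop stays aligned. */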
#define HPEL(align, cpu, cpuv, cpuc, cpuh)\
void x264_hpel_filter_v_##cpuv( pixel *dst, pixel *src, int16_t *buf, int stride, int width);\
void x264_hpel_filter_c_##cpuc( pixel *dst, int16_t *buf, int width );\
void x264_hpel_filter_h_##cpuh( pixel *dst, pixel *src, int width );\
static void x264_hpel_filter_##cpu( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src,\
                             int stride, int width, int height, int16_t *buf )\
{\
    int realign = (intptr_t)src & (align-1);\
    src -= realign;\
    dstv -= realign;\
    dstc -= realign;\
    dsth -= realign;\
    width += realign;\
    while( height-- )\
    {\
        x264_hpel_filter_v_##cpuv( dstv, src, buf+8, stride, width );\
        x264_hpel_filter_c_##cpuc( dstc, buf+8, width );\
        x264_hpel_filter_h_##cpuh( dsth, src, width );\
        dsth += stride;\
        dstv += stride;\
        dstc += stride;\
        src  += stride;\
    }\
    x264_sfence();\
}

HPEL(8, mmx2, mmx2, mmx2, mmx2)
#if HIGH_BIT_DEPTH
HPEL(16, sse2, sse2, sse2, sse2)
#else // !HIGH_BIT_DEPTH
HPEL(16, sse2_amd, mmx2, mmx2, sse2)
#if ARCH_X86_64
void x264_hpel_filter_sse2( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, int stride, int width, int height, int16_t *buf );
void x264_hpel_filter_ssse3( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, int stride, int width, int height, int16_t *buf );
void x264_hpel_filter_avx( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, int stride, int width, int height, int16_t *buf );
#else
HPEL(16, sse2, sse2, sse2, sse2)
HPEL(16, ssse3, ssse3, ssse3, ssse3)
HPEL(16, avx, avx, avx, ssse3)
#endif
HPEL(16, sse2_misalign, sse2, sse2_misalign, sse2)
#endif // HIGH_BIT_DEPTH

static void x264_plane_copy_mmx2( pixel *dst, int i_dst, pixel *src, int i_src, int w, int h )
{
    int c_w = 16/sizeof(pixel) - 1;
    if( w < 256 ) { // tiny resolutions don't benefit from non-temporal hints; the exact threshold is unknown.
        x264_plane_copy_c( dst, i_dst, src, i_src, w, h );
    } else if( !(w&c_w) ) {
        x264_plane_copy_core_mmx2( dst, i_dst, src, i_src, w, h );
    } else if( i_src > 0 ) {
        // have to use plain memcpy on the last line (in memory order) to avoid overreading src
        x264_plane_copy_core_mmx2( dst, i_dst, src, i_src, (w+c_w)&~c_w, h-1 );
        memcpy( dst+i_dst*(h-1), src+i_src*(h-1), w*sizeof(pixel) );
    } else {
        memcpy( dst, src, w*sizeof(pixel) );
        x264_plane_copy_core_mmx2( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h-1 );
    }
}

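/* Same overread-avoidance idea as x264_plane_copy_mmx2 above: round w up to a
 * multiple of 16 for every row except the last one in memory order, and copy
 * that row with the C version so the asm core never reads past the buffer. */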
#define PLANE_INTERLEAVE(cpu) \
static void x264_plane_copy_interleave_##cpu( pixel *dst, int i_dst,\
                                              pixel *srcu, int i_srcu,\
                                              pixel *srcv, int i_srcv, int w, int h )\
{\
    if( !(w&15) ) {\
        x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\
    } else if( w < 16 || (i_srcu ^ i_srcv) ) {\
        x264_plane_copy_interleave_c( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\
    } else if( i_srcu > 0 ) {\
        x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, (w+15)&~15, h-1 );\
        x264_plane_copy_interleave_c( dst+i_dst*(h-1), 0, srcu+i_srcu*(h-1), 0, srcv+i_srcv*(h-1), 0, w, 1 );\
    } else {\
        x264_plane_copy_interleave_c( dst, 0, srcu, 0, srcv, 0, w, 1 );\
        x264_plane_copy_interleave_core_##cpu( dst+i_dst, i_dst, srcu+i_srcu, i_srcu, srcv+i_srcv, i_srcv, (w+15)&~15, h-1 );\
    }\
}

PLANE_INTERLEAVE(mmx2)
PLANE_INTERLEAVE(sse2)
#if HIGH_BIT_DEPTH
PLANE_INTERLEAVE(avx)
#endif

void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
{
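    /* Progressively overwrite the function table: each block installs the
     * versions for one instruction-set level, then returns early if the CPU
     * lacks the next level, so the fastest supported implementation wins. */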
    if( !(cpu&X264_CPU_MMX) )
        return;

    pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_mmx;
    pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_mmx;

    pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_mmx;

    pf->copy_16x16_unaligned = x264_mc_copy_w16_mmx;
    pf->copy[PIXEL_16x16] = x264_mc_copy_w16_mmx;
    pf->copy[PIXEL_8x8]   = x264_mc_copy_w8_mmx;
    pf->copy[PIXEL_4x4]   = x264_mc_copy_w4_mmx;
    pf->memcpy_aligned  = x264_memcpy_aligned_mmx;
    pf->memzero_aligned = x264_memzero_aligned_mmx;
    pf->integral_init4v = x264_integral_init4v_mmx;
    pf->integral_init8v = x264_integral_init8v_mmx;

    if( !(cpu&X264_CPU_MMX2) )
        return;

    pf->prefetch_fenc_420 = x264_prefetch_fenc_420_mmx2;
    pf->prefetch_fenc_422 = x264_prefetch_fenc_422_mmx2;
    pf->prefetch_ref  = x264_prefetch_ref_mmx2;

    pf->plane_copy = x264_plane_copy_mmx2;
    pf->plane_copy_interleave = x264_plane_copy_interleave_mmx2;
    pf->store_interleave_chroma = x264_store_interleave_chroma_mmx2;

    pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_mmx2;
    pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_mmx2;
    pf->avg[PIXEL_8x16]  = x264_pixel_avg_8x16_mmx2;
    pf->avg[PIXEL_8x8]   = x264_pixel_avg_8x8_mmx2;
    pf->avg[PIXEL_8x4]   = x264_pixel_avg_8x4_mmx2;
    pf->avg[PIXEL_4x16]  = x264_pixel_avg_4x16_mmx2;
    pf->avg[PIXEL_4x8]   = x264_pixel_avg_4x8_mmx2;
    pf->avg[PIXEL_4x4]   = x264_pixel_avg_4x4_mmx2;
    pf->avg[PIXEL_4x2]   = x264_pixel_avg_4x2_mmx2;

    pf->mc_luma = mc_luma_mmx2;
    pf->get_ref = get_ref_mmx2;
    pf->mc_chroma = x264_mc_chroma_mmx2;
    pf->hpel_filter = x264_hpel_filter_mmx2;
    pf->weight = x264_mc_weight_wtab_mmx2;
    pf->weight_cache = x264_weight_cache_mmx2;
    pf->offsetadd = x264_mc_offsetadd_wtab_mmx2;
    pf->offsetsub = x264_mc_offsetsub_wtab_mmx2;

    pf->frame_init_lowres_core = x264_frame_init_lowres_core_mmx2;

#if HIGH_BIT_DEPTH
#if ARCH_X86 // all x86_64 cpus with cacheline split issues use sse2 instead
    if( cpu&(X264_CPU_CACHELINE_32|X264_CPU_CACHELINE_64) )
        pf->frame_init_lowres_core = x264_frame_init_lowres_core_cache32_mmx2;
#endif

    if( !(cpu&X264_CPU_SSE2) )
        return;

    pf->frame_init_lowres_core = x264_frame_init_lowres_core_sse2;

    pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_sse2;
    pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_sse2;

    pf->plane_copy_interleave   = x264_plane_copy_interleave_sse2;
    pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_sse2;

    if( cpu&X264_CPU_SSE2_IS_FAST )
    {
        pf->get_ref = get_ref_sse2;
        pf->mc_luma = mc_luma_sse2;
        pf->hpel_filter = x264_hpel_filter_sse2;
    }

    pf->memcpy_aligned  = x264_memcpy_aligned_sse2;
    pf->memzero_aligned = x264_memzero_aligned_sse2;
    pf->integral_init4v = x264_integral_init4v_sse2;
    pf->integral_init8v = x264_integral_init8v_sse2;
    pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_sse2;
    pf->store_interleave_chroma = x264_store_interleave_chroma_sse2;
    pf->offsetadd = x264_mc_offsetadd_wtab_sse2;
    pf->offsetsub = x264_mc_offsetsub_wtab_sse2;

    if( cpu&X264_CPU_SSE2_IS_SLOW )
        return;

    pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_sse2;
    pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_sse2;
    pf->avg[PIXEL_8x16]  = x264_pixel_avg_8x16_sse2;
    pf->avg[PIXEL_8x8]   = x264_pixel_avg_8x8_sse2;
    pf->avg[PIXEL_8x4]   = x264_pixel_avg_8x4_sse2;
    pf->avg[PIXEL_4x16]  = x264_pixel_avg_4x16_sse2;
    pf->avg[PIXEL_4x8]   = x264_pixel_avg_4x8_sse2;
    pf->avg[PIXEL_4x4]   = x264_pixel_avg_4x4_sse2;
    pf->avg[PIXEL_4x2]   = x264_pixel_avg_4x2_sse2;

    pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_sse2;
    pf->weight = x264_mc_weight_wtab_sse2;

    if( !(cpu&X264_CPU_STACK_MOD4) )
        pf->mc_chroma = x264_mc_chroma_sse2;

    if( !(cpu&X264_CPU_SSSE3) )
        return;

    pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3;

    if( (cpu&X264_CPU_SHUFFLE_IS_FAST) && !(cpu&X264_CPU_SLOW_ATOM) )
        pf->integral_init4v = x264_integral_init4v_ssse3;

    if( !(cpu&X264_CPU_AVX) )
        return;

    pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_avx;
    pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_avx;
    pf->plane_copy_interleave        = x264_plane_copy_interleave_avx;
    pf->plane_copy_deinterleave      = x264_plane_copy_deinterleave_avx;
    pf->store_interleave_chroma      = x264_store_interleave_chroma_avx;

    if( !(cpu&X264_CPU_STACK_MOD4) )
        pf->mc_chroma = x264_mc_chroma_avx;
#else // !HIGH_BIT_DEPTH

#if ARCH_X86 // all x86_64 cpus with cacheline split issues use sse2 instead
    if( cpu&X264_CPU_CACHELINE_32 )
    {
        pf->mc_luma = mc_luma_cache32_mmx2;
        pf->get_ref = get_ref_cache32_mmx2;
        pf->frame_init_lowres_core = x264_frame_init_lowres_core_cache32_mmx2;
    }
    else if( cpu&X264_CPU_CACHELINE_64 )
    {
        pf->mc_luma = mc_luma_cache64_mmx2;
        pf->get_ref = get_ref_cache64_mmx2;
        pf->frame_init_lowres_core = x264_frame_init_lowres_core_cache32_mmx2; /* no cache64 lowres variant exists; reuse cache32 */
    }
#endif

    if( !(cpu&X264_CPU_SSE2) )
        return;

    pf->memcpy_aligned = x264_memcpy_aligned_sse2;
    pf->memzero_aligned = x264_memzero_aligned_sse2;
    pf->integral_init4v = x264_integral_init4v_sse2;
    pf->integral_init8v = x264_integral_init8v_sse2;
    pf->hpel_filter = x264_hpel_filter_sse2_amd;
    pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_sse2;

    if( cpu&X264_CPU_SSE2_IS_SLOW )
        return;

    pf->weight = x264_mc_weight_wtab_sse2;
    if( !(cpu&X264_CPU_SLOW_ATOM) )
    {
        pf->offsetadd = x264_mc_offsetadd_wtab_sse2;
        pf->offsetsub = x264_mc_offsetsub_wtab_sse2;
    }

    pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_sse2;
    pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_sse2;
    pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_sse2;
    pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_sse2;
    pf->avg[PIXEL_8x8]  = x264_pixel_avg_8x8_sse2;
    pf->avg[PIXEL_8x4]  = x264_pixel_avg_8x4_sse2;
    pf->hpel_filter = x264_hpel_filter_sse2;
    if( cpu&X264_CPU_SSE_MISALIGN )
        pf->hpel_filter = x264_hpel_filter_sse2_misalign;
    pf->frame_init_lowres_core = x264_frame_init_lowres_core_sse2;
    if( !(cpu&X264_CPU_STACK_MOD4) )
        pf->mc_chroma = x264_mc_chroma_sse2;

    if( cpu&X264_CPU_SSE2_IS_FAST )
    {
        pf->store_interleave_chroma = x264_store_interleave_chroma_sse2; // FIXME sse2fast? sse2medium?
        pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_sse2;
        pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_sse2;
        pf->plane_copy_interleave   = x264_plane_copy_interleave_sse2;
        pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_sse2;
        pf->mc_luma = mc_luma_sse2;
        pf->get_ref = get_ref_sse2;
        if( cpu&X264_CPU_CACHELINE_64 )
        {
            pf->mc_luma = mc_luma_cache64_sse2;
            pf->get_ref = get_ref_cache64_sse2;
        }
        if( cpu&X264_CPU_SSE_MISALIGN )
        {
            pf->get_ref = get_ref_sse2_misalign;
            if( !(cpu&X264_CPU_STACK_MOD4) )
                pf->mc_chroma = x264_mc_chroma_sse2_misalign;
        }
    }

    if( !(cpu&X264_CPU_SSSE3) )
        return;

    pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_ssse3;
    pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_ssse3;
    pf->avg[PIXEL_8x16]  = x264_pixel_avg_8x16_ssse3;
    pf->avg[PIXEL_8x8]   = x264_pixel_avg_8x8_ssse3;
    pf->avg[PIXEL_8x4]   = x264_pixel_avg_8x4_ssse3;
    pf->avg[PIXEL_4x16]  = x264_pixel_avg_4x16_ssse3;
    pf->avg[PIXEL_4x8]   = x264_pixel_avg_4x8_ssse3;
    pf->avg[PIXEL_4x4]   = x264_pixel_avg_4x4_ssse3;
    pf->avg[PIXEL_4x2]   = x264_pixel_avg_4x2_ssse3;

    pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_ssse3;
    pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_ssse3;
    pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_ssse3;

    pf->hpel_filter = x264_hpel_filter_ssse3;
    pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3;
    if( !(cpu&X264_CPU_STACK_MOD4) )
        pf->mc_chroma = x264_mc_chroma_ssse3;

    if( cpu&X264_CPU_CACHELINE_64 )
    {
        if( !(cpu&X264_CPU_STACK_MOD4) )
            pf->mc_chroma = x264_mc_chroma_ssse3_cache64;
        pf->mc_luma = mc_luma_cache64_ssse3;
        pf->get_ref = get_ref_cache64_ssse3;

        /* ssse3 weight is slower on Nehalem, so assign it only in this cacheline-split path. */
        pf->weight_cache = x264_weight_cache_ssse3;
        pf->weight = x264_mc_weight_wtab_ssse3;
    }

    if( (cpu&X264_CPU_SHUFFLE_IS_FAST) && !(cpu&X264_CPU_SLOW_ATOM) )
        pf->integral_init4v = x264_integral_init4v_ssse3;

    if( !(cpu&X264_CPU_SSE4) )
        return;

    pf->integral_init4h = x264_integral_init4h_sse4;
    pf->integral_init8h = x264_integral_init8h_sse4;

    if( !(cpu&X264_CPU_AVX) )
        return;

    pf->integral_init8h = x264_integral_init8h_avx;
    pf->hpel_filter = x264_hpel_filter_avx;
    if( !(cpu&X264_CPU_STACK_MOD4) )
        pf->mc_chroma = x264_mc_chroma_avx;
#endif // HIGH_BIT_DEPTH

    if( !(cpu&X264_CPU_AVX) )
        return;
    pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx;

    if( !(cpu&X264_CPU_FMA4) )
        return;
    pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_fma4;
}
x264-snapshot-20120103-2245-stable/common/x86/mc-a2.asm
;*****************************************************************************
;* mc-a2.asm: x86 motion compensation
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Jason Garrett-Glaser <darkshikari@gmail.com>
;*          Holger Lubitz <holger@lubitz.org>
;*          Mathieu Monnier <manao@melix.net>
;*          Oskar Arvidsson <oskar@irock.se>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA

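; Coefficients of the H.264 6-tap halfpel filter (1,-5,20,20,-5,1), split into
; pairs so pmaddubsw can apply two taps per multiply.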
filt_mul20: times 16 db 20
filt_mul15: times 8 db 1, -5
filt_mul51: times 8 db -5, 1
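; hpel_shuf re-interleaves the two packed filter results into pixel order;
; deinterleave_shuf splits even bytes into the low half and odd bytes into the
; high half (used to separate interleaved U/V).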
hpel_shuf: db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15
deinterleave_shuf: db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15

pd_16: times 4 dd 16
pd_0f: times 4 dd 0xffff
pf_inv256: times 8 dd 0.00390625

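; The 6-tap intermediates can go negative; pad10/pad20/pad30 add multiples of
; PIXEL_MAX to bias them into unsigned range so psubusw can clamp, and depad
; folds the accumulated bias plus rounding back out in the second pass.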
pad10: times 8 dw    10*PIXEL_MAX
pad20: times 8 dw    20*PIXEL_MAX
pad30: times 8 dw    30*PIXEL_MAX
depad: times 4 dd 32*20*PIXEL_MAX + 512

tap1: times 4 dw  1, -5
tap2: times 4 dw 20, 20
tap3: times 4 dw -5,  1

SECTION .text

cextern pb_0
cextern pw_1
cextern pw_16
cextern pw_32
cextern pw_00ff
cextern pw_3fff
cextern pw_pixel_max
cextern pd_ffff

%macro LOAD_ADD 4
    movh       %4, %3
    movh       %1, %2
    punpcklbw  %4, m0
    punpcklbw  %1, m0
    paddw      %1, %4
%endmacro

%macro LOAD_ADD_2 6
    mova       %5, %3
    mova       %1, %4
    punpckhbw  %6, %5, m0
    punpcklbw  %5, m0
    punpckhbw  %2, %1, m0
    punpcklbw  %1, m0
    paddw      %1, %5
    paddw      %2, %6
%endmacro

%macro FILT_V2 6
    psubw  %1, %2  ; a-b
    psubw  %4, %5
    psubw  %2, %3  ; b-c
    psubw  %5, %6
    psllw  %2, 2
    psllw  %5, 2
    psubw  %1, %2  ; a-5*b+4*c
    psllw  %3, 4
    psubw  %4, %5
    psllw  %6, 4
    paddw  %1, %3  ; a-5*b+20*c
    paddw  %4, %6
%endmacro

%macro FILT_H 3
    psubw  %1, %2  ; a-b
    psraw  %1, 2   ; (a-b)/4
    psubw  %1, %2  ; (a-b)/4-b
    paddw  %1, %3  ; (a-b)/4-b+c
    psraw  %1, 2   ; ((a-b)/4-b+c)/4
    paddw  %1, %3  ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
%endmacro

%macro FILT_H2 6
    psubw  %1, %2
    psubw  %4, %5
    psraw  %1, 2
    psraw  %4, 2
    psubw  %1, %2
    psubw  %4, %5
    paddw  %1, %3
    paddw  %4, %6
    psraw  %1, 2
    psraw  %4, 2
    paddw  %1, %3
    paddw  %4, %6
%endmacro

%macro FILT_PACK 4-6 b
    paddw      %1, %4
    paddw      %2, %4
%if %0 == 6
    psubusw    %1, %6
    psubusw    %2, %6
    psrlw      %1, %3
    psrlw      %2, %3
%else
    psraw      %1, %3
    psraw      %2, %3
%endif
%ifnidn w, %5
    packuswb %1, %2
%endif
%endmacro

;The hpel_filter routines use non-temporal writes for output.
;The following defines may be uncommented for testing.
;Using temporal stores instead may be a win if the last-level cache
;is big enough (preliminary benching suggests on the order of 4x the frame size).

;%define movntq movq
;%define movntps movaps
;%define sfence

%ifdef HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void hpel_filter_v( uint16_t *dst, uint16_t *src, int16_t *buf, int stride, int width );
;-----------------------------------------------------------------------------
%macro HPEL_FILTER 0
cglobal hpel_filter_v, 5,6,11
    FIX_STRIDES r3d, r4d
%ifdef WIN64
    movsxd     r4, r4d
%endif
    lea        r5, [r1+r3]
    sub        r1, r3
    sub        r1, r3
%if num_mmregs > 8
    mova       m8, [pad10]
    mova       m9, [pad20]
    mova      m10, [pad30]
    %define s10 m8
    %define s20 m9
    %define s30 m10
%else
    %define s10 [pad10]
    %define s20 [pad20]
    %define s30 [pad30]
%endif
    add        r0, r4
    lea        r2, [r2+r4]
    neg        r4
    mova       m7, [pw_pixel_max]
    pxor       m0, m0
.loop:
    mova       m1, [r1]
    mova       m2, [r1+r3]
    mova       m3, [r1+r3*2]
    mova       m4, [r1+mmsize]
    mova       m5, [r1+r3+mmsize]
    mova       m6, [r1+r3*2+mmsize]
    paddw      m1, [r5+r3*2]
    paddw      m2, [r5+r3]
    paddw      m3, [r5]
    paddw      m4, [r5+r3*2+mmsize]
    paddw      m5, [r5+r3+mmsize]
    paddw      m6, [r5+mmsize]
    add        r1, 2*mmsize
    add        r5, 2*mmsize
    FILT_V2    m1, m2, m3, m4, m5, m6
    mova       m6, [pw_16]
    psubw      m1, s20
    psubw      m4, s20
    mova      [r2+r4], m1
    mova      [r2+r4+mmsize], m4
    paddw      m1, s30
    paddw      m4, s30
    FILT_PACK  m1, m4, 5, m6, w, s10
    CLIPW      m1, m0, m7
    CLIPW      m4, m0, m7
    mova      [r0+r4], m1
    mova      [r0+r4+mmsize], m4
    add        r4, 2*mmsize
    jl .loop
    REP_RET

;-----------------------------------------------------------------------------
; void hpel_filter_c( uint16_t *dst, int16_t *buf, int width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_c, 3,3,10
    add        r2, r2
    add        r0, r2
    lea        r1, [r1+r2]
    neg        r2
    mova       m0, [tap1]
    mova       m7, [tap3]
%if num_mmregs > 8
    mova       m8, [tap2]
    mova       m9, [depad]
    %define s1 m8
    %define s2 m9
%else
    %define s1 [tap2]
    %define s2 [depad]
%endif
.loop:
    movu       m1, [r1+r2-4]
    movu       m2, [r1+r2-2]
    mova       m3, [r1+r2+0]
    movu       m4, [r1+r2+2]
    movu       m5, [r1+r2+4]
    movu       m6, [r1+r2+6]
    pmaddwd    m1, m0
    pmaddwd    m2, m0
    pmaddwd    m3, s1
    pmaddwd    m4, s1
    pmaddwd    m5, m7
    pmaddwd    m6, m7
    paddd      m1, s2
    paddd      m2, s2
    paddd      m3, m5
    paddd      m4, m6
    paddd      m1, m3
    paddd      m2, m4
    psrad      m1, 10
    psrad      m2, 10
    pslld      m2, 16
    pand       m1, [pd_0f]
    por        m1, m2
    CLIPW      m1, [pb_0], [pw_pixel_max]
    mova  [r0+r2], m1
    add        r2, mmsize
    jl .loop
    REP_RET

;-----------------------------------------------------------------------------
; void hpel_filter_h( uint16_t *dst, uint16_t *src, int width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_h, 3,4,8
    %define src r1+r2
    add        r2, r2
    add        r0, r2
    add        r1, r2
    neg        r2
    mova       m0, [pw_pixel_max]
.loop:
    movu       m1, [src-4]
    movu       m2, [src-2]
    mova       m3, [src+0]
    movu       m6, [src+2]
    movu       m4, [src+4]
    movu       m5, [src+6]
    paddw      m3, m6 ; c0
    paddw      m2, m4 ; b0
    paddw      m1, m5 ; a0
%if mmsize == 16
    movu       m4, [src-4+mmsize]
    movu       m5, [src-2+mmsize]
%endif
    movu       m7, [src+4+mmsize]
    movu       m6, [src+6+mmsize]
    paddw      m5, m7 ; b1
    paddw      m4, m6 ; a1
    movu       m7, [src+2+mmsize]
    mova       m6, [src+0+mmsize]
    paddw      m6, m7 ; c1
    FILT_H2    m1, m2, m3, m4, m5, m6
    mova       m7, [pw_1]
    pxor       m2, m2
    FILT_PACK  m1, m4, 1, m7, w
    CLIPW      m1, m2, m0
    CLIPW      m4, m2, m0
    mova      [r0+r2], m1
    mova      [r0+r2+mmsize], m4
    add        r2, mmsize*2
    jl .loop
    REP_RET
%endmacro ; HPEL_FILTER

INIT_MMX mmx2
HPEL_FILTER
INIT_XMM sse2
HPEL_FILTER
%endif ; HIGH_BIT_DEPTH

%ifndef HIGH_BIT_DEPTH
%macro HPEL_V 1
;-----------------------------------------------------------------------------
; void hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_v, 5,6,%1
%ifdef WIN64
    movsxd   r4, r4d
%endif
    lea r5, [r1+r3]
    sub r1, r3
    sub r1, r3
    add r0, r4
    lea r2, [r2+r4*2]
    neg r4
%if cpuflag(ssse3)
    mova m0, [filt_mul15]
%else
    pxor m0, m0
%endif
.loop:
%if cpuflag(ssse3)
    mova m1, [r1]
    mova m4, [r1+r3]
    mova m2, [r5+r3*2]
    mova m5, [r5+r3]
    mova m3, [r1+r3*2]
    mova m6, [r5]
    SBUTTERFLY bw, 1, 4, 7
    SBUTTERFLY bw, 2, 5, 7
    SBUTTERFLY bw, 3, 6, 7
    pmaddubsw m1, m0
    pmaddubsw m4, m0
    pmaddubsw m2, m0
    pmaddubsw m5, m0
    pmaddubsw m3, [filt_mul20]
    pmaddubsw m6, [filt_mul20]
    paddw  m1, m2
    paddw  m4, m5
    paddw  m1, m3
    paddw  m4, m6
%else
    LOAD_ADD_2 m1, m4, [r1     ], [r5+r3*2], m6, m7            ; a0 / a1
    LOAD_ADD_2 m2, m5, [r1+r3  ], [r5+r3  ], m6, m7            ; b0 / b1
    LOAD_ADD   m3,     [r1+r3*2], [r5     ], m7                ; c0
    LOAD_ADD   m6,     [r1+r3*2+mmsize/2], [r5+mmsize/2], m7   ; c1
    FILT_V2 m1, m2, m3, m4, m5, m6
%endif
    mova      m7, [pw_16]
    mova      [r2+r4*2], m1
    mova      [r2+r4*2+mmsize], m4
    FILT_PACK m1, m4, 5, m7
    movnta    [r0+r4], m1
    add r1, mmsize
    add r5, mmsize
    add r4, mmsize
    jl .loop
    REP_RET
%endmacro

;-----------------------------------------------------------------------------
; void hpel_filter_c( uint8_t *dst, int16_t *buf, int width );
;-----------------------------------------------------------------------------
INIT_MMX
cglobal hpel_filter_c_mmx2, 3,3
    add r0, r2
    lea r1, [r1+r2*2]
    neg r2
    %define src r1+r2*2
    movq m7, [pw_32]
.loop:
    movq   m1, [src-4]
    movq   m2, [src-2]
    movq   m3, [src  ]
    movq   m4, [src+4]
    movq   m5, [src+6]
    paddw  m3, [src+2]  ; c0
    paddw  m2, m4       ; b0
    paddw  m1, m5       ; a0
    movq   m6, [src+8]
    paddw  m4, [src+14] ; a1
    paddw  m5, [src+12] ; b1
    paddw  m6, [src+10] ; c1
    FILT_H2 m1, m2, m3, m4, m5, m6
    FILT_PACK m1, m4, 6, m7
    movntq [r0+r2], m1
    add r2, 8
    jl .loop
    REP_RET

;-----------------------------------------------------------------------------
; void hpel_filter_h( uint8_t *dst, uint8_t *src, int width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_h_mmx2, 3,3
    add r0, r2
    add r1, r2
    neg r2
    %define src r1+r2
    pxor m0, m0
.loop:
    movd       m1, [src-2]
    movd       m2, [src-1]
    movd       m3, [src  ]
    movd       m6, [src+1]
    movd       m4, [src+2]
    movd       m5, [src+3]
    punpcklbw  m1, m0
    punpcklbw  m2, m0
    punpcklbw  m3, m0
    punpcklbw  m6, m0
    punpcklbw  m4, m0
    punpcklbw  m5, m0
    paddw      m3, m6 ; c0
    paddw      m2, m4 ; b0
    paddw      m1, m5 ; a0
    movd       m7, [src+7]
    movd       m6, [src+6]
    punpcklbw  m7, m0
    punpcklbw  m6, m0
    paddw      m4, m7 ; c1
    paddw      m5, m6 ; b1
    movd       m7, [src+5]
    movd       m6, [src+4]
    punpcklbw  m7, m0
    punpcklbw  m6, m0
    paddw      m6, m7 ; a1
    movq       m7, [pw_1]
    FILT_H2 m1, m2, m3, m4, m5, m6
    FILT_PACK m1, m4, 1, m7
    movntq     [r0+r2], m1
    add r2, 8
    jl .loop
    REP_RET

INIT_XMM

%macro HPEL_C 0
;-----------------------------------------------------------------------------
; void hpel_filter_c( uint8_t *dst, int16_t *buf, int width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_c, 3,3,9
    add r0, r2
    lea r1, [r1+r2*2]
    neg r2
    %define src r1+r2*2
%ifnidn cpuname, sse2
    mova    m7, [pw_32]
    %define tpw_32 m7
%elifdef ARCH_X86_64
    mova    m8, [pw_32]
    %define tpw_32 m8
%else
    %define tpw_32 [pw_32]
%endif
%if cpuflag(misalign)
.loop:
    movu    m4, [src-4]
    movu    m5, [src-2]
    mova    m6, [src]
    movu    m3, [src+12]
    movu    m2, [src+14]
    mova    m1, [src+16]
    paddw   m4, [src+6]
    paddw   m5, [src+4]
    paddw   m6, [src+2]
    paddw   m3, [src+22]
    paddw   m2, [src+20]
    paddw   m1, [src+18]
    FILT_H2 m4, m5, m6, m3, m2, m1
%else
    mova      m0, [src-16]
    mova      m1, [src]
.loop:
    mova      m2, [src+16]
    PALIGNR   m4, m1, m0, 12, m7
    PALIGNR   m5, m1, m0, 14, m0
    PALIGNR   m0, m2, m1, 6, m7
    paddw     m4, m0
    PALIGNR   m0, m2, m1, 4, m7
    paddw     m5, m0
    PALIGNR   m6, m2, m1, 2, m7
    paddw     m6, m1
    FILT_H    m4, m5, m6

    mova      m0, m2
    mova      m5, m2
    PALIGNR   m2, m1, 12, m7
    PALIGNR   m5, m1, 14, m1
    mova      m1, [src+32]
    PALIGNR   m3, m1, m0, 6, m7
    paddw     m3, m2
    PALIGNR   m6, m1, m0, 4, m7
    paddw     m5, m6
    PALIGNR   m6, m1, m0, 2, m7
    paddw     m6, m0
    FILT_H    m3, m5, m6
%endif
    FILT_PACK m4, m3, 6, tpw_32
    movntps [r0+r2], m4
    add r2, 16
    jl .loop
    REP_RET
%endmacro

;-----------------------------------------------------------------------------
; void hpel_filter_h( uint8_t *dst, uint8_t *src, int width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_h_sse2, 3,3,8
    add r0, r2
    add r1, r2
    neg r2
    %define src r1+r2
    pxor m0, m0
.loop:
    movh       m1, [src-2]
    movh       m2, [src-1]
    movh       m3, [src  ]
    movh       m4, [src+1]
    movh       m5, [src+2]
    movh       m6, [src+3]
    punpcklbw  m1, m0
    punpcklbw  m2, m0
    punpcklbw  m3, m0
    punpcklbw  m4, m0
    punpcklbw  m5, m0
    punpcklbw  m6, m0
    paddw      m3, m4 ; c0
    paddw      m2, m5 ; b0
    paddw      m1, m6 ; a0
    movh       m4, [src+6]
    movh       m5, [src+7]
    movh       m6, [src+10]
    movh       m7, [src+11]
    punpcklbw  m4, m0
    punpcklbw  m5, m0
    punpcklbw  m6, m0
    punpcklbw  m7, m0
    paddw      m5, m6 ; b1
    paddw      m4, m7 ; a1
    movh       m6, [src+8]
    movh       m7, [src+9]
    punpcklbw  m6, m0
    punpcklbw  m7, m0
    paddw      m6, m7 ; c1
    mova       m7, [pw_1] ; FIXME xmm8
    FILT_H2 m1, m2, m3, m4, m5, m6
    FILT_PACK m1, m4, 1, m7
    movntps    [r0+r2], m1
    add r2, 16
    jl .loop
    REP_RET

%ifndef ARCH_X86_64
;-----------------------------------------------------------------------------
; void hpel_filter_h( uint8_t *dst, uint8_t *src, int width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_h_ssse3, 3,3
    add r0, r2
    add r1, r2
    neg r2
    %define src r1+r2
    mova      m0, [src-16]
    mova      m1, [src]
    mova      m7, [pw_16]
.loop:
    mova      m2, [src+16]
    palignr   m3, m1, m0, 14
    palignr   m4, m1, m0, 15
    palignr   m0, m2, m1, 2
    pmaddubsw m3, [filt_mul15]
    pmaddubsw m4, [filt_mul15]
    pmaddubsw m0, [filt_mul51]
    palignr   m5, m2, m1, 1
    palignr   m6, m2, m1, 3
    paddw     m3, m0
    mova      m0, m1
    pmaddubsw m1, [filt_mul20]
    pmaddubsw m5, [filt_mul20]
    pmaddubsw m6, [filt_mul51]
    paddw     m3, m1
    paddw     m4, m5
    paddw     m4, m6
    FILT_PACK m3, m4, 5, m7
    pshufb    m3, [hpel_shuf]
    mova      m1, m2
    movntps [r0+r2], m3
    add r2, 16
    jl .loop
    REP_RET
%endif ; !ARCH_X86_64

INIT_MMX mmx2
HPEL_V 0
INIT_XMM sse2
HPEL_V 8
INIT_XMM sse2, misalign
HPEL_C
%ifndef ARCH_X86_64
INIT_XMM sse2
HPEL_C
INIT_XMM ssse3
HPEL_C
HPEL_V 0
INIT_XMM avx
HPEL_C
HPEL_V 0
%endif

%ifdef ARCH_X86_64
%macro DO_FILT_V 5
    ;The optimum prefetch distance is difficult to determine in checkasm:
    ;any prefetch seems slower than not prefetching.
    ;In real use, the prefetch seems to be a slight win.
    ;+16 is picked somewhat arbitrarily here based on the fact that even one
    ;loop iteration is going to take longer than the prefetch.
    prefetcht0 [r1+r2*2+16]
%if cpuflag(ssse3)
    mova m1, [r3]
    mova m2, [r3+r2]
    mova %3, [r3+r2*2]
    mova m3, [r1]
    mova %1, [r1+r2]
    mova %2, [r1+r2*2]
    punpckhbw m4, m1, m2
    punpcklbw m1, m2
    punpckhbw m2, %1, %2
    punpcklbw %1, %2
    punpckhbw %2, m3, %3
    punpcklbw m3, %3

    pmaddubsw m1, m12
    pmaddubsw m4, m12
    pmaddubsw %1, m0
    pmaddubsw m2, m0
    pmaddubsw m3, m14
    pmaddubsw %2, m14

    paddw m1, %1
    paddw m4, m2
    paddw m1, m3
    paddw m4, %2
%else
    LOAD_ADD_2 m1, m4, [r3     ], [r1+r2*2], m2, m5            ; a0 / a1
    LOAD_ADD_2 m2, m5, [r3+r2  ], [r1+r2  ], m3, m6            ; b0 / b1
    LOAD_ADD_2 m3, m6, [r3+r2*2], [r1     ], %3, %4            ; c0 / c1
    packuswb %3, %4
    FILT_V2 m1, m2, m3, m4, m5, m6
%endif
    add       r3, 16
    add       r1, 16
    mova      %1, m1
    mova      %2, m4
    FILT_PACK m1, m4, 5, m15
    movntps  [r11+r4+%5], m1
%endmacro

%macro FILT_C 4
    PALIGNR   m1, %2, %1, 12, m2
    PALIGNR   m2, %2, %1, 14, %1
    PALIGNR   m3, %3, %2, 4, %1
    PALIGNR   m4, %3, %2, 2, %1
    paddw     m3, m2
    mova      %1, %3
    PALIGNR   %3, %2, 6, m2
    paddw     m4, %2
    paddw     %3, m1
    FILT_H    %3, m3, m4
%endmacro

%macro DO_FILT_C 4
    FILT_C %1, %2, %3, 6
    FILT_C %2, %1, %4, 6
    FILT_PACK %3, %4, 6, m15
    movntps   [r5+r4], %3
%endmacro

%macro ADD8TO16 5
    punpckhbw %3, %1, %5
    punpcklbw %1, %5
    punpcklbw %4, %2, %5
    punpckhbw %2, %5
    paddw     %2, %3
    paddw     %1, %4
%endmacro

%macro DO_FILT_H 3
    PALIGNR   m1, %2, %1, 14, m3
    PALIGNR   m2, %2, %1, 15, m3
    PALIGNR   m4, %3, %2, 1 , m3
    PALIGNR   m5, %3, %2, 2 , m3
    PALIGNR   m6, %3, %2, 3 , m3
    mova      %1, %2
%if cpuflag(ssse3)
    pmaddubsw m1, m12
    pmaddubsw m2, m12
    pmaddubsw %2, m14
    pmaddubsw m4, m14
    pmaddubsw m5, m0
    pmaddubsw m6, m0
    paddw     m1, %2
    paddw     m2, m4
    paddw     m1, m5
    paddw     m2, m6
    FILT_PACK m1, m2, 5, m15
    pshufb    m1, [hpel_shuf]
%else ; ssse3, avx
    ADD8TO16  m1, m6, m12, m3, m0 ; a
    ADD8TO16  m2, m5, m12, m3, m0 ; b
    ADD8TO16  %2, m4, m12, m3, m0 ; c
    FILT_V2   m1, m2, %2, m6, m5, m4
    FILT_PACK m1, m6, 5, m15
%endif
    movntps [r0+r4], m1
    mova      %2, %3
%endmacro

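; Single-pass hpel for x86_64: the vertical filter runs one 16-byte column
; ahead so the center filter has its left/right context, and each row of src
; is read once while all three halfpel planes are written in the same pass.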
%macro HPEL 0
;-----------------------------------------------------------------------------
; void hpel_filter( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
;                   uint8_t *src, int stride, int width, int height)
;-----------------------------------------------------------------------------
cglobal hpel_filter, 7,7,16
%ifdef WIN64
    movsxd   r4, r4d
    movsxd   r5, r5d
%endif
    mov      r10, r3
    sub       r5, 16
    mov      r11, r1
    and      r10, 15
    sub       r3, r10
    add       r0, r5
    add      r11, r5
    add      r10, r5
    add       r5, r2
    mov       r2, r4
    neg      r10
    lea       r1, [r3+r2]
    sub       r3, r2
    sub       r3, r2
    mov       r4, r10
    mova     m15, [pw_16]
%if cpuflag(ssse3)
    mova      m0, [filt_mul51]
    mova     m12, [filt_mul15]
    mova     m14, [filt_mul20]
%else
    pxor      m0, m0
%endif
;ALIGN 16
.loopy:
; first filter_v
    DO_FILT_V m8, m7, m13, m12, 0
;ALIGN 16
.loopx:
    DO_FILT_V m6, m5, m11, m12, 16
.lastx:
    paddw   m15, m15 ; pw_32
    DO_FILT_C m9, m8, m7, m6
    psrlw   m15, 1 ; pw_16
    movdqa   m7, m5
    DO_FILT_H m10, m13, m11
    add      r4, 16
    jl .loopx
    cmp      r4, 16
    jl .lastx
; setup regs for next y
    sub      r4, r10
    sub      r4, r2
    sub      r1, r4
    sub      r3, r4
    add      r0, r2
    add     r11, r2
    add      r5, r2
    mov      r4, r10
    sub     r6d, 1
    jg .loopy
    sfence
    RET
%endmacro

INIT_XMM sse2
HPEL
INIT_XMM ssse3
HPEL
INIT_XMM avx
HPEL
%endif ; ARCH_X86_64

%undef movntq
%undef movntps
%undef sfence
%endif ; !HIGH_BIT_DEPTH

;-----------------------------------------------------------------------------
; void plane_copy_core( pixel *dst, int i_dst,
;                       pixel *src, int i_src, int w, int h)
;-----------------------------------------------------------------------------
; assumes i_dst and w are multiples of 16, and i_dst>w
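; Inner loop streams 64 bytes per iteration with movntq; the remaining
; 16-byte chunks (w is assumed mod16) are copied before advancing to the next row.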
INIT_MMX
cglobal plane_copy_core_mmx2, 6,7
    FIX_STRIDES r1d, r3d, r4d
    movsxdifnidn r1, r1d
    movsxdifnidn r3, r3d
    movsxdifnidn r4, r4d
    sub    r1,  r4
    sub    r3,  r4
.loopy:
    mov    r6d, r4d
    sub    r6d, 63
.loopx:
    prefetchnta [r2+256]
    movq   m0, [r2   ]
    movq   m1, [r2+ 8]
    movntq [r0   ], m0
    movntq [r0+ 8], m1
    movq   m2, [r2+16]
    movq   m3, [r2+24]
    movntq [r0+16], m2
    movntq [r0+24], m3
    movq   m4, [r2+32]
    movq   m5, [r2+40]
    movntq [r0+32], m4
    movntq [r0+40], m5
    movq   m6, [r2+48]
    movq   m7, [r2+56]
    movntq [r0+48], m6
    movntq [r0+56], m7
    add    r2,  64
    add    r0,  64
    sub    r6d, 64
    jg .loopx
    prefetchnta [r2+256]
    add    r6d, 63
    jle .end16
.loop16:
    movq   m0, [r2  ]
    movq   m1, [r2+8]
    movntq [r0  ], m0
    movntq [r0+8], m1
    add    r2,  16
    add    r0,  16
    sub    r6d, 16
    jg .loop16
.end16:
    add    r0, r1
    add    r2, r3
    dec    r5d
    jg .loopy
    sfence
    emms
    RET


%macro INTERLEAVE 4-5 ; dst, srcu, srcv, is_aligned, nt_hint
%ifdef HIGH_BIT_DEPTH
%assign x 0
%rep 16/mmsize
    mov%4     m0, [%2+(x/2)*mmsize]
    mov%4     m1, [%3+(x/2)*mmsize]
    punpckhwd m2, m0, m1
    punpcklwd m0, m1
    mov%5a    [%1+(x+0)*mmsize], m0
    mov%5a    [%1+(x+1)*mmsize], m2
    %assign x (x+2)
%endrep
%else
    movq   m0, [%2]
%if mmsize==16
%ifidn %4, a
    punpcklbw m0, [%3]
%else
    movq   m1, [%3]
    punpcklbw m0, m1
%endif
    mov%5a [%1], m0
%else
    movq   m1, [%3]
    punpckhbw m2, m0, m1
    punpcklbw m0, m1
    mov%5a [%1+0], m0
    mov%5a [%1+8], m2
%endif
%endif ; HIGH_BIT_DEPTH
%endmacro

%macro DEINTERLEAVE 6 ; dstu, dstv, src, dstv==dstu+8, shuffle constant, is aligned
%ifdef HIGH_BIT_DEPTH
%assign n 0
%rep 16/mmsize
    mova     m0, [%3+(n+0)*mmsize]
    mova     m1, [%3+(n+1)*mmsize]
    psrld    m2, m0, 16
    psrld    m3, m1, 16
    pand     m0, %5
    pand     m1, %5
    packssdw m0, m1
    packssdw m2, m3
    mov%6    [%1+(n/2)*mmsize], m0
    mov%6    [%2+(n/2)*mmsize], m2
    %assign n (n+2)
%endrep
%else ; !HIGH_BIT_DEPTH
%if mmsize==16
    mova   m0, [%3]
%if cpuflag(ssse3)
    pshufb m0, %5
%else
    mova   m1, m0
    pand   m0, %5
    psrlw  m1, 8
    packuswb m0, m1
%endif
%if %4
    mova   [%1], m0
%else
    movq   [%1], m0
    movhps [%2], m0
%endif
%else
    mova   m0, [%3]
    mova   m1, [%3+8]
    mova   m2, m0
    mova   m3, m1
    pand   m0, %5
    pand   m1, %5
    psrlw  m2, 8
    psrlw  m3, 8
    packuswb m0, m1
    packuswb m2, m3
    mova   [%1], m0
    mova   [%2], m2
%endif ; mmsize == 16
%endif ; HIGH_BIT_DEPTH
%endmacro

%macro PLANE_INTERLEAVE 0
;-----------------------------------------------------------------------------
; void plane_copy_interleave_core( uint8_t *dst, int i_dst,
;                                  uint8_t *srcu, int i_srcu,
;                                  uint8_t *srcv, int i_srcv, int w, int h )
;-----------------------------------------------------------------------------
; assumes i_dst and w are multiples of 16, and i_dst>2*w
cglobal plane_copy_interleave_core, 7,7
    FIX_STRIDES r1d, r3d, r5d, r6d
%ifdef HIGH_BIT_DEPTH
    mov   r1m, r1d
    mov   r3m, r3d
    mov   r6m, r6d
%endif
    movsxdifnidn r1, r1d
    movsxdifnidn r3, r3d
    movsxdifnidn r5, r5d
    movsxdifnidn r6, r6d
    lea    r0, [r0+r6*2]
    add    r2,  r6
    add    r4,  r6
%ifdef ARCH_X86_64
    DECLARE_REG_TMP 10,11
%else
    DECLARE_REG_TMP 1,3
%endif
    mov  t1, r1
    shr  t1, SIZEOF_PIXEL
    sub  t1, r6
    mov  t0d, r7m
.loopy:
    mov    r6d, r6m
    neg    r6
.prefetch:
    prefetchnta [r2+r6]
    prefetchnta [r4+r6]
    add    r6, 64
    jl .prefetch
    mov    r6d, r6m
    neg    r6
.loopx:
    INTERLEAVE r0+r6*2+ 0*SIZEOF_PIXEL, r2+r6+0*SIZEOF_PIXEL, r4+r6+0*SIZEOF_PIXEL, u, nt
    INTERLEAVE r0+r6*2+16*SIZEOF_PIXEL, r2+r6+8*SIZEOF_PIXEL, r4+r6+8*SIZEOF_PIXEL, u, nt
    add    r6, 16*SIZEOF_PIXEL
    jl .loopx
.pad:
%assign n 0
%rep SIZEOF_PIXEL
%if mmsize==8
    movntq [r0+r6*2+(n+ 0)], m0
    movntq [r0+r6*2+(n+ 8)], m0
    movntq [r0+r6*2+(n+16)], m0
    movntq [r0+r6*2+(n+24)], m0
%else
    movntdq [r0+r6*2+(n+ 0)], m0
    movntdq [r0+r6*2+(n+16)], m0
%endif
    %assign n n+32
%endrep
    add    r6, 16*SIZEOF_PIXEL
    cmp    r6, t1
    jl .pad
    add    r0, r1mp
    add    r2, r3mp
    add    r4, r5
    dec    t0d
    jg .loopy
    sfence
    emms
    RET

;-----------------------------------------------------------------------------
; void store_interleave_chroma( uint8_t *dst, int i_dst, uint8_t *srcu, uint8_t *srcv, int height )
;-----------------------------------------------------------------------------
cglobal store_interleave_chroma, 5,5
    FIX_STRIDES r1d
.loop:
    INTERLEAVE r0+ 0, r2+           0, r3+           0, a
    INTERLEAVE r0+r1, r2+FDEC_STRIDEB, r3+FDEC_STRIDEB, a
    add    r2, FDEC_STRIDEB*2
    add    r3, FDEC_STRIDEB*2
    lea    r0, [r0+r1*2]
    sub   r4d, 2
    jg .loop
    REP_RET
%endmacro ; PLANE_INTERLEAVE

%macro DEINTERLEAVE_START 0
%ifdef HIGH_BIT_DEPTH
    mova   m4, [pd_ffff]
%elif cpuflag(ssse3)
    mova   m4, [deinterleave_shuf]
%else
    mova   m4, [pw_00ff]
%endif ; HIGH_BIT_DEPTH
%endmacro

%macro PLANE_DEINTERLEAVE 0
;-----------------------------------------------------------------------------
; void plane_copy_deinterleave( pixel *dstu, int i_dstu,
;                               pixel *dstv, int i_dstv,
;                               pixel *src, int i_src, int w, int h )
;-----------------------------------------------------------------------------
cglobal plane_copy_deinterleave, 6,7
    DEINTERLEAVE_START
    mov    r6d, r6m
    FIX_STRIDES r1d, r3d, r5d, r6d
%ifdef HIGH_BIT_DEPTH
    mov    r6m, r6d
%endif
    movsxdifnidn r1, r1d
    movsxdifnidn r3, r3d
    movsxdifnidn r5, r5d
    add    r0,  r6
    add    r2,  r6
    lea    r4, [r4+r6*2]
.loopy:
    mov    r6d, r6m
    neg    r6
.loopx:
    DEINTERLEAVE r0+r6+0*SIZEOF_PIXEL, r2+r6+0*SIZEOF_PIXEL, r4+r6*2+ 0*SIZEOF_PIXEL, 0, m4, u
    DEINTERLEAVE r0+r6+8*SIZEOF_PIXEL, r2+r6+8*SIZEOF_PIXEL, r4+r6*2+16*SIZEOF_PIXEL, 0, m4, u
    add    r6, 16*SIZEOF_PIXEL
    jl .loopx
    add    r0, r1
    add    r2, r3
    add    r4, r5
    dec dword r7m
    jg .loopy
    REP_RET

;-----------------------------------------------------------------------------
; void load_deinterleave_chroma_fenc( pixel *dst, pixel *src, int i_src, int height )
;-----------------------------------------------------------------------------
cglobal load_deinterleave_chroma_fenc, 4,4
    DEINTERLEAVE_START
    FIX_STRIDES r2d
.loop:
    DEINTERLEAVE r0+           0, r0+FENC_STRIDEB*1/2, r1+ 0, 1, m4, a
    DEINTERLEAVE r0+FENC_STRIDEB, r0+FENC_STRIDEB*3/2, r1+r2, 1, m4, a
    add    r0, FENC_STRIDEB*2
    lea    r1, [r1+r2*2]
    sub   r3d, 2
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void load_deinterleave_chroma_fdec( pixel *dst, pixel *src, int i_src, int height )
;-----------------------------------------------------------------------------
cglobal load_deinterleave_chroma_fdec, 4,4
    DEINTERLEAVE_START
    FIX_STRIDES r2d
.loop:
    DEINTERLEAVE r0+           0, r0+FDEC_STRIDEB*1/2, r1+ 0, 0, m4, a
    DEINTERLEAVE r0+FDEC_STRIDEB, r0+FDEC_STRIDEB*3/2, r1+r2, 0, m4, a
    add    r0, FDEC_STRIDEB*2
    lea    r1, [r1+r2*2]
    sub   r3d, 2
    jg .loop
    REP_RET
%endmacro ; PLANE_DEINTERLEAVE

%ifdef HIGH_BIT_DEPTH
INIT_MMX mmx2
PLANE_INTERLEAVE
INIT_MMX mmx
PLANE_DEINTERLEAVE
INIT_XMM sse2
PLANE_INTERLEAVE
PLANE_DEINTERLEAVE
INIT_XMM avx
PLANE_INTERLEAVE
PLANE_DEINTERLEAVE
%else
INIT_MMX mmx2
PLANE_INTERLEAVE
INIT_MMX mmx
PLANE_DEINTERLEAVE
INIT_XMM sse2
PLANE_INTERLEAVE
PLANE_DEINTERLEAVE
INIT_XMM ssse3
PLANE_DEINTERLEAVE
%endif

; These functions are not general-use; not only do the SSE ones require aligned input,
; but they will also fail if given a non-mod16 size.
; memzero SSE will fail for non-mod128.

;-----------------------------------------------------------------------------
; void *memcpy_aligned( void *dst, const void *src, size_t n );
;-----------------------------------------------------------------------------
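; Both implementations copy backwards from the end of the buffer: any odd
; 16- or 32-byte chunk is peeled off first so that the main loop can move
; 32 (MMX) or 64 (SSE2) bytes per iteration.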
INIT_MMX
cglobal memcpy_aligned_mmx, 3,3
    test r2d, 16
    jz .copy32start
    movq mm0, [r1 + r2 - 16]
    movq mm1, [r1 + r2 -  8]
    movq [r0 + r2 - 16], mm0
    movq [r0 + r2 -  8], mm1
    sub  r2d, 16
.copy32start:
    test r2d, r2d
    jz .ret
.copy32:
    movq mm0, [r1 + r2 - 32]
    movq mm1, [r1 + r2 - 24]
    movq mm2, [r1 + r2 - 16]
    movq mm3, [r1 + r2 -  8]
    movq [r0 + r2 - 32], mm0
    movq [r0 + r2 - 24], mm1
    movq [r0 + r2 - 16], mm2
    movq [r0 + r2 -  8], mm3
    sub  r2d, 32
    jg .copy32
.ret:
    REP_RET

;-----------------------------------------------------------------------------
; void *memcpy_aligned( void *dst, const void *src, size_t n );
;-----------------------------------------------------------------------------
cglobal memcpy_aligned_sse2, 3,3
    test r2d, 16
    jz .copy32
    movdqa xmm0, [r1 + r2 - 16]
    movdqa [r0 + r2 - 16], xmm0
    sub  r2d, 16
.copy32:
    test r2d, 32
    jz .copy64start
    movdqa xmm0, [r1 + r2 - 32]
    movdqa [r0 + r2 - 32], xmm0
    movdqa xmm1, [r1 + r2 - 16]
    movdqa [r0 + r2 - 16], xmm1
    sub  r2d, 32
.copy64start:
    test r2d, r2d
    jz .ret
.copy64:
    movdqa xmm0, [r1 + r2 - 64]
    movdqa [r0 + r2 - 64], xmm0
    movdqa xmm1, [r1 + r2 - 48]
    movdqa [r0 + r2 - 48], xmm1
    movdqa xmm2, [r1 + r2 - 32]
    movdqa [r0 + r2 - 32], xmm2
    movdqa xmm3, [r1 + r2 - 16]
    movdqa [r0 + r2 - 16], xmm3
    sub  r2d, 64
    jg .copy64
.ret:
    REP_RET

;-----------------------------------------------------------------------------
; void *memzero_aligned( void *dst, size_t n );
;-----------------------------------------------------------------------------
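; Clears mmsize*8 bytes per iteration (64 for MMX, 128 for SSE2), which is
; where the mod-128 size requirement mentioned above comes from.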
%macro MEMZERO 0
cglobal memzero_aligned, 2,2
    add  r0, r1
    neg  r1
    pxor m0, m0
.loop:
%assign i 0
%rep 8
    mova [r0 + r1 + i], m0
%assign i i+mmsize
%endrep
    add r1, mmsize*8
    jl .loop
    REP_RET
%endmacro

INIT_MMX mmx
MEMZERO
INIT_XMM sse2
MEMZERO



%ifndef HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void integral_init4h( uint16_t *sum, uint8_t *pix, int stride )
;-----------------------------------------------------------------------------
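; mpsadbw against a zeroed register produces sums of 4 consecutive bytes,
; giving the row recurrence, roughly:
;   sum[y+1][x] = sum[y][x] + pix[x] + pix[x+1] + pix[x+2] + pix[x+3]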
INIT_XMM
cglobal integral_init4h_sse4, 3,4
    lea     r3, [r0+r2*2]
    add     r1, r2
    neg     r2
    pxor    m4, m4
.loop:
    movdqa  m0, [r1+r2]
    movdqa  m1, [r1+r2+16]
    palignr m1, m0, 8
    mpsadbw m0, m4, 0
    mpsadbw m1, m4, 0
    paddw   m0, [r0+r2*2]
    paddw   m1, [r0+r2*2+16]
    movdqa  [r3+r2*2   ], m0
    movdqa  [r3+r2*2+16], m1
    add     r2, 16
    jl .loop
    REP_RET

%macro INTEGRAL_INIT8H 0
cglobal integral_init8h, 3,4
    lea     r3, [r0+r2*2]
    add     r1, r2
    neg     r2
    pxor    m4, m4
.loop:
    movdqa  m0, [r1+r2]
    movdqa  m1, [r1+r2+16]
    palignr m1, m0, 8
    mpsadbw m2, m0, m4, 4
    mpsadbw m3, m1, m4, 4
    mpsadbw m0, m4, 0
    mpsadbw m1, m4, 0
    paddw   m0, [r0+r2*2]
    paddw   m1, [r0+r2*2+16]
    paddw   m0, m2
    paddw   m1, m3
    movdqa  [r3+r2*2   ], m0
    movdqa  [r3+r2*2+16], m1
    add     r2, 16
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse4
INTEGRAL_INIT8H
INIT_XMM avx
INTEGRAL_INIT8H
%endif ; !HIGH_BIT_DEPTH

%macro INTEGRAL_INIT_8V 0
;-----------------------------------------------------------------------------
; void integral_init8v( uint16_t *sum8, int stride )
;-----------------------------------------------------------------------------
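; Works in place over one row of horizontal sums:
;   sum8[x] = sum[x+8*stride] - sum[x]
; i.e. the difference of rows 8 apart yields an 8-row-tall box sum.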
cglobal integral_init8v, 3,3
    shl   r1, 1
    add   r0, r1
    lea   r2, [r0+r1*8]
    neg   r1
.loop:
    mova  m0, [r2+r1]
    mova  m1, [r2+r1+mmsize]
    psubw m0, [r0+r1]
    psubw m1, [r0+r1+mmsize]
    mova  [r0+r1], m0
    mova  [r0+r1+mmsize], m1
    add   r1, 2*mmsize
    jl .loop
    REP_RET
%endmacro

INIT_MMX mmx
INTEGRAL_INIT_8V
INIT_XMM sse2
INTEGRAL_INIT_8V

;-----------------------------------------------------------------------------
; void integral_init4v( uint16_t *sum8, uint16_t *sum4, int stride )
;-----------------------------------------------------------------------------
INIT_MMX
cglobal integral_init4v_mmx, 3,5
    shl   r2, 1
    lea   r3, [r0+r2*4]
    lea   r4, [r0+r2*8]
    mova  m0, [r0+r2]
    mova  m4, [r4+r2]
.loop:
    mova  m1, m4
    psubw m1, m0
    mova  m4, [r4+r2-8]
    mova  m0, [r0+r2-8]
    paddw m1, m4
    mova  m3, [r3+r2-8]
    psubw m1, m0
    psubw m3, m0
    mova  [r0+r2-8], m1
    mova  [r1+r2-8], m3
    sub   r2, 8
    jge .loop
    REP_RET

INIT_XMM
cglobal integral_init4v_sse2, 3,5
    shl     r2, 1
    add     r0, r2
    add     r1, r2
    lea     r3, [r0+r2*4]
    lea     r4, [r0+r2*8]
    neg     r2
.loop:
    mova    m0, [r0+r2]
    mova    m1, [r4+r2]
    mova    m2, m0
    mova    m4, m1
    shufpd  m0, [r0+r2+16], 1
    shufpd  m1, [r4+r2+16], 1
    paddw   m0, m2
    paddw   m1, m4
    mova    m3, [r3+r2]
    psubw   m1, m0
    psubw   m3, m2
    mova  [r0+r2], m1
    mova  [r1+r2], m3
    add     r2, 16
    jl .loop
    REP_RET

cglobal integral_init4v_ssse3, 3,5
    shl     r2, 1
    add     r0, r2
    add     r1, r2
    lea     r3, [r0+r2*4]
    lea     r4, [r0+r2*8]
    neg     r2
.loop:
    mova    m2, [r0+r2]
    mova    m0, [r0+r2+16]
    mova    m4, [r4+r2]
    mova    m1, [r4+r2+16]
    palignr m0, m2, 8
    palignr m1, m4, 8
    paddw   m0, m2
    paddw   m1, m4
    mova    m3, [r3+r2]
    psubw   m1, m0
    psubw   m3, m2
    mova  [r0+r2], m1
    mova  [r1+r2], m3
    add     r2, 16
    jl .loop
    REP_RET

%macro FILT8x4 7
    mova      %3, [r0+%7]
    mova      %4, [r0+r5+%7]
    pavgb     %3, %4
    pavgb     %4, [r0+r5*2+%7]
    PALIGNR   %1, %3, 1, m6
    PALIGNR   %2, %4, 1, m6
    pavgb     %1, %3
    pavgb     %2, %4
    psrlw     %5, %1, 8
    psrlw     %6, %2, 8
    pand      %1, m7
    pand      %2, m7
%endmacro

%macro FILT16x2 4
    mova      m3, [r0+%4+mmsize]
    mova      m2, [r0+%4]
    pavgb     m3, [r0+%4+r5+mmsize]
    pavgb     m2, [r0+%4+r5]
    PALIGNR   %1, m3, 1, m6
    pavgb     %1, m3
    PALIGNR   m3, m2, 1, m6
    pavgb     m3, m2
    psrlw     m5, m3, 8
    psrlw     m4, %1, 8
    pand      m3, m7
    pand      %1, m7
    packuswb  m3, %1
    packuswb  m5, m4
    mova    [%2], m3
    mova    [%3], m5
    mova      %1, m2
%endmacro

%macro FILT8x2U 3
    mova      m3, [r0+%3+8]
    mova      m2, [r0+%3]
    pavgb     m3, [r0+%3+r5+8]
    pavgb     m2, [r0+%3+r5]
    mova      m1, [r0+%3+9]
    mova      m0, [r0+%3+1]
    pavgb     m1, [r0+%3+r5+9]
    pavgb     m0, [r0+%3+r5+1]
    pavgb     m1, m3
    pavgb     m0, m2
    psrlw     m3, m1, 8
    psrlw     m2, m0, 8
    pand      m1, m7
    pand      m0, m7
    packuswb  m0, m1
    packuswb  m2, m3
    mova    [%1], m0
    mova    [%2], m2
%endmacro

%macro FILT8xU 3
    mova      m3, [r0+%3+8]
    mova      m2, [r0+%3]
    pavgw     m3, [r0+%3+r5+8]
    pavgw     m2, [r0+%3+r5]
    movu      m1, [r0+%3+10]
    movu      m0, [r0+%3+2]
    pavgw     m1, [r0+%3+r5+10]
    pavgw     m0, [r0+%3+r5+2]
    pavgw     m1, m3
    pavgw     m0, m2
    psrld     m3, m1, 16
    psrld     m2, m0, 16
    pand      m1, m7
    pand      m0, m7
    packssdw  m0, m1
    packssdw  m2, m3
    movu    [%1], m0
    mova    [%2], m2
%endmacro

%macro FILT8xA 4
    mova      m3, [r0+%4+mmsize]
    mova      m2, [r0+%4]
    pavgw     m3, [r0+%4+r5+mmsize]
    pavgw     m2, [r0+%4+r5]
    PALIGNR   %1, m3, 2, m6
    pavgw     %1, m3
    PALIGNR   m3, m2, 2, m6
    pavgw     m3, m2
    psrld     m5, m3, 16
    psrld     m4, %1, 16
    pand      m3, m7
    pand      %1, m7
    packssdw  m3, %1
    packssdw  m5, m4
    mova    [%2], m3
    mova    [%3], m5
    mova      %1, m2
%endmacro

;-----------------------------------------------------------------------------
; void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
;                              int src_stride, int dst_stride, int width, int height )
;-----------------------------------------------------------------------------
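; One pass over the source produces four half-resolution planes, each a
; rounded 2x2 average (pavg at every stage) at a different half-pel offset:
;   dst0: rows (2y,2y+1), cols (2x,2x+1)   dsth: same rows, cols shifted +1
;   dstv: rows shifted +1, same cols       dstc: both shifted +1
; These are the lowres planes used by lookahead motion estimation.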
%macro FRAME_INIT_LOWRES 0
cglobal frame_init_lowres_core, 6,7,(12-4*(BIT_DEPTH/9)) ; 8 for HIGH_BIT_DEPTH, 12 otherwise
%ifdef HIGH_BIT_DEPTH
    shl   dword r6m, 1
    FIX_STRIDES r5d
    shl   dword r7m, 1
%endif
%ifdef WIN64
    movsxd    r5, r5d
%endif
    ; src += 2*(height-1)*stride + 2*width
    mov      r6d, r8m
    dec      r6d
    imul     r6d, r5d
    add      r6d, r7m
    lea       r0, [r0+r6*2]
    ; dst += (height-1)*stride + width
    mov      r6d, r8m
    dec      r6d
    imul     r6d, r6m
    add      r6d, r7m
    add       r1, r6
    add       r2, r6
    add       r3, r6
    add       r4, r6
    ; gap = stride - width
    mov      r6d, r6m
    sub      r6d, r7m
    PUSH      r6
    %define dst_gap [rsp+gprsize]
    mov      r6d, r5d
    sub      r6d, r7m
    shl      r6d, 1
    PUSH      r6
    %define src_gap [rsp]
%ifdef HIGH_BIT_DEPTH
    pcmpeqw   m7, m7
    psrld     m7, 16
.vloop:
    mov      r6d, r7m
%ifnidn cpuname, mmx2
    mova      m0, [r0]
    mova      m1, [r0+r5]
    pavgw     m0, m1
    pavgw     m1, [r0+r5*2]
%endif
.hloop:
    sub       r0, mmsize*2
    sub       r1, mmsize
    sub       r2, mmsize
    sub       r3, mmsize
    sub       r4, mmsize
%ifidn cpuname, mmx2
    FILT8xU r1, r2, 0
    FILT8xU r3, r4, r5
%else
    FILT8xA m0, r1, r2, 0
    FILT8xA m1, r3, r4, r5
%endif
    sub      r6d, mmsize
    jg .hloop
%else ; !HIGH_BIT_DEPTH
%if mmsize == 16
    ; adjust for the odd end case
    mov      r6d, r7m
    and      r6d, 8
    sub       r1, r6
    sub       r2, r6
    sub       r3, r6
    sub       r4, r6
    add  dst_gap, r6d
%endif ; mmsize
    pcmpeqb   m7, m7
    psrlw     m7, 8
.vloop:
    mov      r6d, r7m
%ifnidn cpuname, mmx2
    mova      m0, [r0]
    mova      m1, [r0+r5]
    pavgb     m0, m1
    pavgb     m1, [r0+r5*2]
%endif
%if mmsize == 16
    test     r6d, 8
    jz .hloop
    sub       r0, 16
    FILT8x4   m0, m1, m2, m3, m4, m5, 0
    packuswb  m0, m4
    packuswb  m1, m5
    movq    [r1], m0
    movhps  [r2], m0
    movq    [r3], m1
    movhps  [r4], m1
    mova      m0, m2
    mova      m1, m3
    sub      r6d, 8
    jz .skip
%endif ; mmsize
.hloop:
    sub       r0, mmsize*2
    sub       r1, mmsize
    sub       r2, mmsize
    sub       r3, mmsize
    sub       r4, mmsize
%ifdef m8
    FILT8x4   m0, m1, m2, m3, m10, m11, mmsize
    mova      m8, m0
    mova      m9, m1
    FILT8x4   m2, m3, m0, m1, m4, m5, 0
    packuswb  m2, m8
    packuswb  m3, m9
    packuswb  m4, m10
    packuswb  m5, m11
    mova    [r1], m2
    mova    [r2], m4
    mova    [r3], m3
    mova    [r4], m5
%elifidn cpuname, mmx2
    FILT8x2U  r1, r2, 0
    FILT8x2U  r3, r4, r5
%else
    FILT16x2  m0, r1, r2, 0
    FILT16x2  m1, r3, r4, r5
%endif
    sub      r6d, mmsize
    jg .hloop
%endif ; HIGH_BIT_DEPTH
.skip:
    mov       r6, dst_gap
    sub       r0, src_gap
    sub       r1, r6
    sub       r2, r6
    sub       r3, r6
    sub       r4, r6
    dec    dword r8m
    jg .vloop
    ADD      rsp, 2*gprsize
    emms
    RET
%endmacro ; FRAME_INIT_LOWRES

INIT_MMX mmx2
FRAME_INIT_LOWRES
%ifndef ARCH_X86_64
INIT_MMX cache32, mmx2
FRAME_INIT_LOWRES
%endif
INIT_XMM sse2
FRAME_INIT_LOWRES
INIT_XMM ssse3
FRAME_INIT_LOWRES

;-----------------------------------------------------------------------------
; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
;                             uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len )
;-----------------------------------------------------------------------------
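; A rough C model of the arithmetic below (a sketch, not the exact C
; fallback elsewhere in the tree):
;   float fps = *fps_factor / 256.f;
;   for( int i = 0; i < len; i++ )
;   {
;       int intra = intra_costs[i];
;       int inter = inter_costs[i] & 0x3fff;
;       float num = propagate_in[i] + intra * inv_qscales[i] * fps;
;       dst[i] = (int)( num * (intra - inter) / intra );
;   }
; The SIMD code approximates 1/intra with rcpps plus one Newton-Raphson
; refinement: x1 = x0*(2 - intra*x0).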
%macro MBTREE 0
cglobal mbtree_propagate_cost, 7,7,7
    add        r6d, r6d
    lea         r0, [r0+r6*2]
    add         r1, r6
    add         r2, r6
    add         r3, r6
    add         r4, r6
    neg         r6
    pxor      xmm4, xmm4
    movss     xmm6, [r5]
    shufps    xmm6, xmm6, 0
    mulps     xmm6, [pf_inv256]
    movdqa    xmm5, [pw_3fff]
.loop:
    movq      xmm2, [r2+r6] ; intra
    movq      xmm0, [r4+r6] ; invq
    movq      xmm3, [r3+r6] ; inter
    movq      xmm1, [r1+r6] ; prop
    punpcklwd xmm2, xmm4
    punpcklwd xmm0, xmm4
    pmaddwd   xmm0, xmm2
    pand      xmm3, xmm5
    punpcklwd xmm1, xmm4
    punpcklwd xmm3, xmm4
%if cpuflag(fma4)
    cvtdq2ps  xmm0, xmm0
    cvtdq2ps  xmm1, xmm1
    vfmaddps  xmm0, xmm0, xmm6, xmm1
    cvtdq2ps  xmm1, xmm2
    psubd     xmm2, xmm3
    cvtdq2ps  xmm2, xmm2
    rcpps     xmm3, xmm1
    mulps     xmm1, xmm3
    mulps     xmm0, xmm2
    addps     xmm2, xmm3, xmm3
    vfnmaddps xmm3, xmm1, xmm3, xmm2
    mulps     xmm0, xmm3
%else
    cvtdq2ps  xmm0, xmm0
    mulps     xmm0, xmm6    ; intra*invq*fps_factor>>8
    cvtdq2ps  xmm1, xmm1    ; prop
    addps     xmm0, xmm1    ; prop + (intra*invq*fps_factor>>8)
    cvtdq2ps  xmm1, xmm2    ; intra
    psubd     xmm2, xmm3    ; intra - inter
    cvtdq2ps  xmm2, xmm2    ; intra - inter
    rcpps     xmm3, xmm1    ; 1 / intra 1st approximation
    mulps     xmm1, xmm3    ; intra * (1/intra 1st approx)
    mulps     xmm1, xmm3    ; intra * (1/intra 1st approx)^2
    mulps     xmm0, xmm2    ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
    addps     xmm3, xmm3    ; 2 * (1/intra 1st approx)
    subps     xmm3, xmm1    ; 2nd approximation for 1/intra
    mulps     xmm0, xmm3    ; / intra
%endif
    cvtps2dq  xmm0, xmm0
    movdqa [r0+r6*2], xmm0
    add         r6, 8
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
MBTREE
; Bulldozer only has a 128-bit float unit, so the AVX version of this function is actually slower.
INIT_XMM fma4
MBTREE

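; Widens 8 packed uint16s to 8 packed floats: zero-extend both halves
; against xmm7, merge them into one ymm, then convert with vcvtdq2ps.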
%macro INT16_TO_FLOAT 1
    vpunpckhwd   xmm4, xmm%1, xmm7
    vpunpcklwd  xmm%1, xmm7
    vinsertf128 ymm%1, ymm%1, xmm4, 1
    vcvtdq2ps   ymm%1, ymm%1
%endmacro

; FIXME: align loads/stores to 16 bytes
INIT_YMM avx
cglobal mbtree_propagate_cost, 7,7,8
    add           r6d, r6d
    lea            r0, [r0+r6*2]
    add            r1, r6
    add            r2, r6
    add            r3, r6
    add            r4, r6
    neg            r6
    vmovdqa      xmm5, [pw_3fff]
    vbroadcastss ymm6, [r5]
    vmulps       ymm6, ymm6, [pf_inv256]
    vpxor        xmm7, xmm7
.loop:
    vmovdqu      xmm0, [r2+r6]       ; intra
    vmovdqu      xmm1, [r4+r6]       ; invq
    vmovdqu      xmm2, [r1+r6]       ; prop
    vpand        xmm3, xmm5, [r3+r6] ; inter
    INT16_TO_FLOAT 0
    INT16_TO_FLOAT 1
    INT16_TO_FLOAT 2
    INT16_TO_FLOAT 3
    vmulps       ymm1, ymm1, ymm0
    vsubps       ymm4, ymm0, ymm3
    vmulps       ymm1, ymm1, ymm6    ; intra*invq*fps_factor>>8
    vaddps       ymm1, ymm1, ymm2    ; prop + (intra*invq*fps_factor>>8)
    vrcpps       ymm3, ymm0          ; 1 / intra 1st approximation
    vmulps       ymm2, ymm0, ymm3    ; intra * (1/intra 1st approx)
    vmulps       ymm2, ymm2, ymm3    ; intra * (1/intra 1st approx)^2
    vmulps       ymm1, ymm1, ymm4    ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
    vaddps       ymm3, ymm3, ymm3    ; 2 * (1/intra 1st approx)
    vsubps       ymm3, ymm3, ymm2    ; 2nd approximation for 1/intra
    vmulps       ymm1, ymm1, ymm3    ; / intra
    vcvtps2dq    ymm1, ymm1
    vmovdqu [r0+r6*2], ymm1
    add            r6, 16
    jl .loop
    vzeroupper
    RET
x264-snapshot-20120103-2245-stable/common/x86/mc-a.asm
;*****************************************************************************
;* mc-a.asm: x86 motion compensation
;*****************************************************************************
;* Copyright (C) 2003-2011 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Jason Garrett-Glaser <darkshikari@gmail.com>
;*          Laurent Aimar <fenrir@via.ecp.fr>
;*          Dylan Yudaken <dyudaken@gmail.com>
;*          Holger Lubitz <holger@lubitz.org>
;*          Min Chen <chenm001@163.com>
;*          Oskar Arvidsson <oskar@irock.se>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA 32

ch_shuf: db 0,2,2,4,4,6,6,8,1,3,3,5,5,7,7,9
ch_shuf_adj: times 8 db 0
             times 8 db 2
             times 8 db 4
             times 8 db 6
sq_1: times 1 dq 1

SECTION .text

cextern pb_0
cextern pw_1
cextern pw_4
cextern pw_8
cextern pw_32
cextern pw_64
cextern pw_00ff
cextern pw_pixel_max
cextern sw_64
cextern pd_32

;=============================================================================
; implicit weighted biprediction
;=============================================================================
; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64
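; i.e. per pixel, roughly: dst = ( src1*w + src2*(64-w) + 32 ) >> 6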
%ifdef ARCH_X86_64
    DECLARE_REG_TMP 0,1,2,3,4,5,10,11
    %macro AVG_START 0-1 0
        PROLOGUE 6,7,%1
%ifdef WIN64
        movsxd r5, r5d
%endif
    %endmacro
%else
    DECLARE_REG_TMP 1,2,3,4,5,6,1,2
    %macro AVG_START 0-1 0
        PROLOGUE 0,7,%1
        mov t0, r0m
        mov t1, r1m
        mov t2, r2m
        mov t3, r3m
        mov t4, r4m
        mov t5, r5m
    %endmacro
%endif

%macro AVG_END 0
    lea  t4, [t4+t5*2*SIZEOF_PIXEL]
    lea  t2, [t2+t3*2*SIZEOF_PIXEL]
    lea  t0, [t0+t1*2*SIZEOF_PIXEL]
    sub eax, 2
    jg .height_loop
    REP_RET
%endmacro

%ifdef HIGH_BIT_DEPTH

%macro BIWEIGHT_MMX 2
    movh      m0, %1
    movh      m1, %2
    punpcklwd m0, m1
    pmaddwd   m0, m3
    paddd     m0, m4
    psrad     m0, 6
%endmacro

%macro BIWEIGHT_START_MMX 0
    movzx  t6d, word r6m
    mov    t7d, 64
    sub    t7d, t6d
    shl    t7d, 16
    add    t6d, t7d
    movd    m3, t6d
    SPLATD  m3, m3
    mova    m4, [pd_32]
    pxor    m5, m5
%endmacro

%else ;!HIGH_BIT_DEPTH
%macro BIWEIGHT_MMX 2
    movh      m0, %1
    movh      m1, %2
    punpcklbw m0, m5
    punpcklbw m1, m5
    pmullw    m0, m2
    pmullw    m1, m3
    paddw     m0, m1
    paddw     m0, m4
    psraw     m0, 6
%endmacro

%macro BIWEIGHT_START_MMX 0
    movd    m2, r6m
    SPLATW  m2, m2   ; weight_dst
    mova    m3, [pw_64]
    psubw   m3, m2   ; weight_src
    mova    m4, [pw_32] ; rounding
    pxor    m5, m5
%endmacro
%endif ;HIGH_BIT_DEPTH

%macro BIWEIGHT_SSSE3 2
    movh      m0, %1
    movh      m1, %2
    punpcklbw m0, m1
    pmaddubsw m0, m3
    paddw     m0, m4
    psraw     m0, 6
%endmacro

%macro BIWEIGHT_START_SSSE3 0
    movzx  t6d, byte r6m ; FIXME x86_64
    mov    t7d, 64
    sub    t7d, t6d
    shl    t7d, 8
    add    t6d, t7d
    movd    m3, t6d
    mova    m4, [pw_32]
    SPLATW  m3, m3   ; weight_dst,src
%endmacro

%ifdef HIGH_BIT_DEPTH
%macro BIWEIGHT_ROW 4
    BIWEIGHT   [%2], [%3]
%if %4==mmsize/4
    packssdw     m0, m0
    CLIPW        m0, m5, m7
    movh       [%1], m0
%else
    SWAP 0, 6
    BIWEIGHT   [%2+mmsize/2], [%3+mmsize/2]
    packssdw     m6, m0
    CLIPW        m6, m5, m7
    mova       [%1], m6
%endif
%endmacro

%else ;!HIGH_BIT_DEPTH
%macro BIWEIGHT_ROW 4
    BIWEIGHT [%2], [%3]
%if %4==mmsize/2
    packuswb   m0, m0
    movh     [%1], m0
%else
    SWAP 0, 6
    BIWEIGHT [%2+mmsize/2], [%3+mmsize/2]
    packuswb   m6, m0
    mova     [%1], m6
%endif
%endmacro

%endif ;HIGH_BIT_DEPTH

;-----------------------------------------------------------------------------
; int pixel_avg_weight_w16( pixel *dst, int, pixel *src1, int, pixel *src2, int, int i_weight )
;-----------------------------------------------------------------------------
%macro AVG_WEIGHT 1-2 0
cglobal pixel_avg_weight_w%1
    BIWEIGHT_START
    AVG_START %2
%ifdef HIGH_BIT_DEPTH
    mova    m7, [pw_pixel_max]
%endif
.height_loop:
%if mmsize==16 && %1==mmsize/(2*SIZEOF_PIXEL)
    BIWEIGHT [t2], [t4]
    SWAP 0, 6
    BIWEIGHT [t2+SIZEOF_PIXEL*t3], [t4+SIZEOF_PIXEL*t5]
%ifdef HIGH_BIT_DEPTH
    packssdw m6, m0
    CLIPW    m6, m5, m7
%else ;!HIGH_BIT_DEPTH
    packuswb m6, m0
%endif ;HIGH_BIT_DEPTH
    movlps   [t0], m6
    movhps   [t0+SIZEOF_PIXEL*t1], m6
%else
%assign x 0
%rep (%1*SIZEOF_PIXEL+mmsize-1)/mmsize
    BIWEIGHT_ROW   t0+x,                   t2+x,                   t4+x,                 %1
    BIWEIGHT_ROW   t0+x+SIZEOF_PIXEL*t1,   t2+x+SIZEOF_PIXEL*t3,   t4+x+SIZEOF_PIXEL*t5, %1
%assign x x+mmsize
%endrep
%endif
    AVG_END
%endmacro

%define BIWEIGHT BIWEIGHT_MMX
%define BIWEIGHT_START BIWEIGHT_START_MMX
INIT_MMX mmx2
AVG_WEIGHT 4
AVG_WEIGHT 8
AVG_WEIGHT 16
%ifdef HIGH_BIT_DEPTH
INIT_XMM sse2
AVG_WEIGHT 4,  8
AVG_WEIGHT 8,  8
AVG_WEIGHT 16, 8
%else ;!HIGH_BIT_DEPTH
INIT_XMM sse2
AVG_WEIGHT 8,  7
AVG_WEIGHT 16, 7
%define BIWEIGHT BIWEIGHT_SSSE3
%define BIWEIGHT_START BIWEIGHT_START_SSSE3
INIT_MMX ssse3
AVG_WEIGHT 4
INIT_XMM ssse3
AVG_WEIGHT 8,  7
AVG_WEIGHT 16, 7
%endif ;HIGH_BIT_DEPTH

;=============================================================================
; P frame explicit weighted prediction
;=============================================================================
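; Per pixel, roughly:
;   dst = clip( (src*scale + (1<<(denom-1)) + (offset<<denom)) >> denom )
; with scale, the rounding constant and denom prepared by the caller in
; the weight_t struct passed in r4.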

%ifdef HIGH_BIT_DEPTH
%macro WEIGHT_START 1 ; (width)
    mova        m0, [r4+ 0]         ; 1<<denom
    mova        m3, [r4+16]
    movd        m2, [r4+32]         ; denom
    mova        m4, [pw_pixel_max]
    paddw       m2, [sq_1]          ; denom+1
%endmacro

%macro WEIGHT 2 ; (src1, src2)
    movh        m5, [%1]
    movh        m6, [%2]
    punpcklwd   m5, m0
    punpcklwd   m6, m0
    pmaddwd     m5, m3
    pmaddwd     m6, m3
    psrad       m5, m2
    psrad       m6, m2
    packssdw    m5, m6
%endmacro

%macro WEIGHT_TWO_ROW 3 ; (src, dst, width)
    %assign x 0
%rep (%3+mmsize/2-1)/(mmsize/2)
%if %3-x/2 <= 4 && mmsize == 16
    WEIGHT      %1+x, %1+r3+x
    CLIPW         m5, [pb_0], m4
    movh      [%2+x], m5
    movhps [%2+r1+x], m5
%else
    WEIGHT      %1+x, %1+x+mmsize/2
    SWAP           5,  7
    WEIGHT   %1+r3+x, %1+r3+x+mmsize/2
    CLIPW         m5, [pb_0], m4
    CLIPW         m7, [pb_0], m4
    mova      [%2+x], m7
    mova   [%2+r1+x], m5
%endif
    %assign x x+mmsize
%endrep
%endmacro

%else ; !HIGH_BIT_DEPTH

%macro WEIGHT_START 1
    mova     m3, [r4]
    mova     m6, [r4+16]
    movd     m5, [r4+32]
    pxor     m2, m2
%if (%1 == 20 || %1 == 12) && mmsize == 16
    movdq2q mm3, xmm3
    movdq2q mm4, xmm4
    movdq2q mm5, xmm5
    movdq2q mm6, xmm6
    pxor    mm2, mm2
%endif
%endmacro

%macro WEIGHT_START_SSSE3 1
    mova     m3, [r4]
    mova     m4, [r4+16]
    pxor     m2, m2
%if %1 == 20 || %1 == 12
    movdq2q mm3, xmm3
    movdq2q mm4, xmm4
    pxor    mm2, mm2
%endif
%endmacro

;; macro to weight mmsize bytes taking half from %1 and half from %2
%macro WEIGHT 2             ; (src1,src2)
    movh      m0, [%1]
    movh      m1, [%2]
    punpcklbw m0, m2        ;setup
    punpcklbw m1, m2        ;setup
    pmullw    m0, m3        ;scale
    pmullw    m1, m3        ;scale
    paddsw    m0, m6        ;1<<(denom-1)+(offset<<denom)
    paddsw    m1, m6        ;1<<(denom-1)+(offset<<denom)
    psraw     m0, m5        ;denom
    psraw     m1, m5        ;denom
%endmacro

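; The SSSE3 variant folds the shift into the multiply: pmulhrsw computes
; (a*b + 0x4000) >> 15, so with the pixel pre-shifted left by 7 it yields
; a rounded (pix*w) >> 8, and only the offset (m4) is added afterwards
; (this assumes the weight table at [r4] was pre-scaled accordingly).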
%macro WEIGHT_SSSE3 2
    movh      m0, [%1]
    movh      m1, [%2]
    punpcklbw m0, m2
    punpcklbw m1, m2
    psllw     m0, 7
    psllw     m1, 7
    pmulhrsw  m0, m3
    pmulhrsw  m1, m3
    paddw     m0, m4
    paddw     m1, m4
%endmacro

%macro WEIGHT_SAVE_ROW 3        ;(src,dst,width)
%if %3 == 16
    mova     [%2], %1
%elif %3 == 8
    movq     [%2], %1
%else
    movd     [%2], %1       ; width 2 can write garbage for last 2 bytes
%endif
%endmacro

%macro WEIGHT_ROW 3         ; (src,dst,width)
    ;; load weights
    WEIGHT           %1, (%1+(mmsize/2))
    packuswb         m0, m1        ;put bytes into m0
    WEIGHT_SAVE_ROW  m0, %2, %3
%endmacro

%macro WEIGHT_SAVE_COL 2        ;(dst,size)
%if %2 == 8
    packuswb     m0, m1
    movq       [%1], m0
    movhps  [%1+r1], m0
%else
    packuswb     m0, m0
    packuswb     m1, m1
    movd       [%1], m0    ; width 2 can write garbage for last 2 bytes
    movd    [%1+r1], m1
%endif
%endmacro

%macro WEIGHT_COL 3     ; (src,dst,width)
%if %3 <= 4 && mmsize == 16
    INIT_MMX
    ;; load weights
    WEIGHT           %1, (%1+r3)
    WEIGHT_SAVE_COL  %2, %3
    INIT_XMM
%else
    WEIGHT           %1, (%1+r3)
    WEIGHT_SAVE_COL  %2, %3
%endif

%endmacro

%macro WEIGHT_TWO_ROW 3 ; (src,dst,width)
%assign x 0
%rep %3
%if (%3-x) >= mmsize
    WEIGHT_ROW    (%1+x),    (%2+x), mmsize     ; weight 1 mmsize
    WEIGHT_ROW (%1+r3+x), (%2+r1+x), mmsize     ; weight 1 mmsize
    %assign x (x+mmsize)
%else
    WEIGHT_COL (%1+x),(%2+x),(%3-x)
    %exitrep
%endif
%if x >= %3
    %exitrep
%endif
%endrep
%endmacro

%endif ; HIGH_BIT_DEPTH

;-----------------------------------------------------------------------------
;void mc_weight_wX( pixel *dst, int i_dst_stride, pixel *src, int i_src_stride, weight_t *weight, int h )
;-----------------------------------------------------------------------------

%ifdef ARCH_X86_64
%define NUMREGS 6
%define LOAD_HEIGHT
%define HEIGHT_REG r5d
%define TMP_REG r6d
%else
%define NUMREGS 5
%define TMP_REG r5d
%define LOAD_HEIGHT mov r4d, r5m
%define HEIGHT_REG r4d
%endif

%assign XMMREGS 7
%ifdef HIGH_BIT_DEPTH
%assign NUMREGS NUMREGS+1
%assign XMMREGS 8
%endif

%macro WEIGHTER 1
    cglobal mc_weight_w%1, NUMREGS, NUMREGS, XMMREGS
    FIX_STRIDES r1, r3
    WEIGHT_START %1
    LOAD_HEIGHT
.loop:
    WEIGHT_TWO_ROW r2, r0, %1
    lea  r0, [r0+r1*2]
    lea  r2, [r2+r3*2]
    sub HEIGHT_REG, 2
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmx2
WEIGHTER  4
WEIGHTER  8
WEIGHTER 12
WEIGHTER 16
WEIGHTER 20
INIT_XMM sse2
WEIGHTER  8
WEIGHTER 16
WEIGHTER 20
%ifdef HIGH_BIT_DEPTH
WEIGHTER 12
INIT_XMM avx
WEIGHTER  8
WEIGHTER 12
WEIGHTER 16
WEIGHTER 20
%else
%define WEIGHT WEIGHT_SSSE3
%define WEIGHT_START WEIGHT_START_SSSE3
INIT_MMX ssse3
WEIGHTER  4
INIT_XMM ssse3
WEIGHTER  8
WEIGHTER 16
WEIGHTER 20
INIT_XMM avx
WEIGHTER  8
WEIGHTER 16
WEIGHTER 20
%endif

%macro OFFSET_OP 7
    mov%6        m0, [%1]
    mov%6        m1, [%2]
%ifdef HIGH_BIT_DEPTH
    p%5usw       m0, m2
    p%5usw       m1, m2
%ifidn %5,add
    pminsw       m0, m3
    pminsw       m1, m3
%endif
%else
    p%5usb       m0, m2
    p%5usb       m1, m2
%endif
    mov%7      [%3], m0
    mov%7      [%4], m1
%endmacro

%macro OFFSET_TWO_ROW 4
%assign x 0
%rep %3
%if (%3*SIZEOF_PIXEL-x) >= mmsize
    OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, u, a
    %assign x (x+mmsize)
%else
%ifdef HIGH_BIT_DEPTH
    OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, h, h
%else
    OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, d, d
%endif
    %exitrep
%endif
%if x >= %3*SIZEOF_PIXEL
    %exitrep
%endif
%endrep
%endmacro

;-----------------------------------------------------------------------------
;void mc_offset_wX( pixel *src, int i_src_stride, pixel *dst, int i_dst_stride, weight_t *w, int h )
;-----------------------------------------------------------------------------
%macro OFFSET 2
    cglobal mc_offset%2_w%1, NUMREGS, NUMREGS
    FIX_STRIDES r1, r3
    mova m2, [r4]
%ifdef HIGH_BIT_DEPTH
%ifidn %2,add
    mova m3, [pw_pixel_max]
%endif
%endif
    LOAD_HEIGHT
.loop:
    OFFSET_TWO_ROW r2, r0, %1, %2
    lea  r0, [r0+r1*2]
    lea  r2, [r2+r3*2]
    sub HEIGHT_REG, 2
    jg .loop
    REP_RET
%endmacro

%macro OFFSETPN 1
       OFFSET %1, add
       OFFSET %1, sub
%endmacro
INIT_MMX mmx2
OFFSETPN  4
OFFSETPN  8
OFFSETPN 12
OFFSETPN 16
OFFSETPN 20
INIT_XMM sse2
OFFSETPN 12
OFFSETPN 16
OFFSETPN 20
INIT_XMM avx
OFFSETPN 12
OFFSETPN 16
OFFSETPN 20
%ifdef HIGH_BIT_DEPTH
INIT_XMM sse2
OFFSETPN  8
INIT_XMM avx
OFFSETPN  8
%endif
%undef LOAD_HEIGHT
%undef HEIGHT_REG
%undef NUMREGS



;=============================================================================
; pixel avg
;=============================================================================

;-----------------------------------------------------------------------------
; void pixel_avg_4x4( pixel *dst, int dst_stride,
;                     pixel *src1, int src1_stride, pixel *src2, int src2_stride, int weight );
;-----------------------------------------------------------------------------
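; AVGH dispatches per call: weight 32 (the implicit bipred default, since
; weight1+weight2 = 64) takes the plain pavg path, anything else branches
; to the pixel_avg_weight_wN functions above.  Aligned 16-wide blocks are
; additionally routed to the SSE2 version.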
%macro AVGH 2
cglobal pixel_avg_%1x%2
    mov eax, %2
    cmp dword r6m, 32
    jne pixel_avg_weight_w%1 %+ SUFFIX
%if mmsize == 16 && %1 == 16
    test dword r4m, 15
    jz pixel_avg_w%1_sse2
%endif
    jmp pixel_avg_w%1_mmx2
%endmacro

;-----------------------------------------------------------------------------
; void pixel_avg_w4( pixel *dst, int dst_stride,
;                    pixel *src1, int src1_stride, pixel *src2, int src2_stride,
;                    int height, int weight );
;-----------------------------------------------------------------------------

%macro AVG_FUNC 3
cglobal pixel_avg_w%1
    AVG_START
.height_loop:
%assign x 0
%rep (%1*SIZEOF_PIXEL+mmsize-1)/mmsize
    %2     m0, [t2+x]
    %2     m1, [t2+x+SIZEOF_PIXEL*t3]
%ifdef HIGH_BIT_DEPTH
    pavgw  m0, [t4+x]
    pavgw  m1, [t4+x+SIZEOF_PIXEL*t5]
%else ;!HIGH_BIT_DEPTH
    pavgb  m0, [t4+x]
    pavgb  m1, [t4+x+SIZEOF_PIXEL*t5]
%endif
    %3     [t0+x], m0
    %3     [t0+x+SIZEOF_PIXEL*t1], m1
%assign x x+mmsize
%endrep
    AVG_END
%endmacro

%ifdef HIGH_BIT_DEPTH

INIT_MMX mmx2
AVG_FUNC 4, movq, movq
AVGH 4, 16
AVGH 4, 8
AVGH 4, 4
AVGH 4, 2

AVG_FUNC 8, movq, movq
AVGH 8, 16
AVGH 8,  8
AVGH 8,  4

AVG_FUNC 16, movq, movq
AVGH 16, 16
AVGH 16,  8

INIT_XMM sse2
AVG_FUNC 4, movq, movq
AVGH  4, 16
AVGH  4, 8
AVGH  4, 4
AVGH  4, 2

AVG_FUNC 8, movdqu, movdqa
AVGH  8, 16
AVGH  8,  8
AVGH  8,  4

AVG_FUNC 16, movdqu, movdqa
AVGH  16, 16
AVGH  16,  8

%else ;!HIGH_BIT_DEPTH

INIT_MMX mmx2
AVG_FUNC 4, movd, movd
AVGH 4, 16
AVGH 4, 8
AVGH 4, 4
AVGH 4, 2

AVG_FUNC 8, movq, movq
AVGH 8, 16
AVGH 8,  8
AVGH 8,  4

AVG_FUNC 16, movq, movq
AVGH 16, 16
AVGH 16, 8

INIT_XMM sse2
AVG_FUNC 16, movdqu, movdqa
AVGH 16, 16
AVGH 16,  8
AVGH  8, 16
AVGH  8,  8
AVGH  8,  4
INIT_XMM ssse3
AVGH 16, 16
AVGH 16,  8
AVGH  8, 16
AVGH  8,  8
AVGH  8,  4
INIT_MMX ssse3
AVGH  4, 16
AVGH  4,  8
AVGH  4,  4
AVGH  4,  2

%endif ;HIGH_BIT_DEPTH



;=============================================================================
; pixel avg2
;=============================================================================

%ifdef HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void pixel_avg2_wN( uint16_t *dst,  int dst_stride,
;                     uint16_t *src1, int src_stride,
;                     uint16_t *src2, int height );
;-----------------------------------------------------------------------------
%macro AVG2_W_ONE 1
cglobal pixel_avg2_w%1, 6,7,4
    sub     r4, r2
    lea     r6, [r4+r3*2]
.height_loop:
    movu    m0, [r2]
    movu    m1, [r2+r3*2]
%if mmsize == 8
    pavgw   m0, [r2+r4]
    pavgw   m1, [r2+r6]
%else
    movu    m2, [r2+r4]
    movu    m3, [r2+r6]
    pavgw   m0, m2
    pavgw   m1, m3
%endif
    mova   [r0], m0
    mova   [r0+r1*2], m1
    lea     r2, [r2+r3*4]
    lea     r0, [r0+r1*4]
    sub    r5d, 2
    jg .height_loop
    REP_RET
%endmacro

%macro AVG2_W_TWO 3
cglobal pixel_avg2_w%1, 6,7,8
    sub     r4, r2
    lea     r6, [r4+r3*2]
.height_loop:
    movu    m0, [r2]
    %2      m1, [r2+mmsize]
    movu    m2, [r2+r3*2]
    %2      m3, [r2+r3*2+mmsize]
%if mmsize == 8
    pavgw   m0, [r2+r4]
    pavgw   m1, [r2+r4+mmsize]
    pavgw   m2, [r2+r6]
    pavgw   m3, [r2+r6+mmsize]
%else
    movu    m4, [r2+r4]
    %2      m5, [r2+r4+mmsize]
    movu    m6, [r2+r6]
    %2      m7, [r2+r6+mmsize]
    pavgw   m0, m4
    pavgw   m1, m5
    pavgw   m2, m6
    pavgw   m3, m7
%endif
    mova   [r0], m0
    %3     [r0+mmsize], m1
    mova   [r0+r1*2], m2
    %3     [r0+r1*2+mmsize], m3
    lea     r2, [r2+r3*4]
    lea     r0, [r0+r1*4]
    sub    r5d, 2
    jg .height_loop
    REP_RET
%endmacro

INIT_MMX mmx2
AVG2_W_ONE  4
AVG2_W_TWO  8, movu, mova
INIT_XMM sse2
AVG2_W_ONE  8
AVG2_W_TWO 10, movd, movd
AVG2_W_TWO 16, movu, mova

INIT_MMX
cglobal pixel_avg2_w10_mmx2, 6,7
    sub     r4, r2
    lea     r6, [r4+r3*2]
.height_loop:
    movu    m0, [r2+ 0]
    movu    m1, [r2+ 8]
    movh    m2, [r2+16]
    movu    m3, [r2+r3*2+ 0]
    movu    m4, [r2+r3*2+ 8]
    movh    m5, [r2+r3*2+16]
    pavgw   m0, [r2+r4+ 0]
    pavgw   m1, [r2+r4+ 8]
    pavgw   m2, [r2+r4+16]
    pavgw   m3, [r2+r6+ 0]
    pavgw   m4, [r2+r6+ 8]
    pavgw   m5, [r2+r6+16]
    mova   [r0+ 0], m0
    mova   [r0+ 8], m1
    movh   [r0+16], m2
    mova   [r0+r1*2+ 0], m3
    mova   [r0+r1*2+ 8], m4
    movh   [r0+r1*2+16], m5
    lea     r2, [r2+r3*2*2]
    lea     r0, [r0+r1*2*2]
    sub    r5d, 2
    jg .height_loop
    REP_RET

cglobal pixel_avg2_w16_mmx2, 6,7
    sub     r4, r2
    lea     r6, [r4+r3*2]
.height_loop:
    movu    m0, [r2+ 0]
    movu    m1, [r2+ 8]
    movu    m2, [r2+16]
    movu    m3, [r2+24]
    movu    m4, [r2+r3*2+ 0]
    movu    m5, [r2+r3*2+ 8]
    movu    m6, [r2+r3*2+16]
    movu    m7, [r2+r3*2+24]
    pavgw   m0, [r2+r4+ 0]
    pavgw   m1, [r2+r4+ 8]
    pavgw   m2, [r2+r4+16]
    pavgw   m3, [r2+r4+24]
    pavgw   m4, [r2+r6+ 0]
    pavgw   m5, [r2+r6+ 8]
    pavgw   m6, [r2+r6+16]
    pavgw   m7, [r2+r6+24]
    mova   [r0+ 0], m0
    mova   [r0+ 8], m1
    mova   [r0+16], m2
    mova   [r0+24], m3
    mova   [r0+r1*2+ 0], m4
    mova   [r0+r1*2+ 8], m5
    mova   [r0+r1*2+16], m6
    mova   [r0+r1*2+24], m7
    lea     r2, [r2+r3*2*2]
    lea     r0, [r0+r1*2*2]
    sub    r5d, 2
    jg .height_loop
    REP_RET

cglobal pixel_avg2_w18_mmx2, 6,7
    sub     r4, r2
.height_loop:
    movu    m0, [r2+ 0]
    movu    m1, [r2+ 8]
    movu    m2, [r2+16]
    movu    m3, [r2+24]
    movh    m4, [r2+32]
    pavgw   m0, [r2+r4+ 0]
    pavgw   m1, [r2+r4+ 8]
    pavgw   m2, [r2+r4+16]
    pavgw   m3, [r2+r4+24]
    pavgw   m4, [r2+r4+32]
    mova   [r0+ 0], m0
    mova   [r0+ 8], m1
    mova   [r0+16], m2
    mova   [r0+24], m3
    movh   [r0+32], m4
    lea     r2, [r2+r3*2]
    lea     r0, [r0+r1*2]
    dec    r5d
    jg .height_loop
    REP_RET

INIT_XMM
cglobal pixel_avg2_w18_sse2, 6,7,6
    sub     r4, r2
.height_loop:
    movu    m0, [r2+ 0]
    movu    m1, [r2+16]
    movh    m2, [r2+32]
    movu    m3, [r2+r4+ 0]
    movu    m4, [r2+r4+16]
    movh    m5, [r2+r4+32]
    pavgw   m0, m3
    pavgw   m1, m4
    pavgw   m2, m5
    mova   [r0+ 0], m0
    mova   [r0+16], m1
    movh   [r0+32], m2
    lea     r2, [r2+r3*2]
    lea     r0, [r0+r1*2]
    dec    r5d
    jg .height_loop
    REP_RET
%endif ; HIGH_BIT_DEPTH

%ifndef HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void pixel_avg2_w4( uint8_t *dst, int dst_stride,
;                     uint8_t *src1, int src_stride,
;                     uint8_t *src2, int height );
;-----------------------------------------------------------------------------
%macro AVG2_W8 2
cglobal pixel_avg2_w%1_mmx2, 6,7
    sub    r4, r2
    lea    r6, [r4+r3]
.height_loop:
    %2     mm0, [r2]
    %2     mm1, [r2+r3]
    pavgb  mm0, [r2+r4]
    pavgb  mm1, [r2+r6]
    lea    r2, [r2+r3*2]
    %2     [r0], mm0
    %2     [r0+r1], mm1
    lea    r0, [r0+r1*2]
    sub    r5d, 2
    jg     .height_loop
    REP_RET
%endmacro

INIT_MMX
AVG2_W8 4, movd
AVG2_W8 8, movq

%macro AVG2_W16 2
cglobal pixel_avg2_w%1_mmx2, 6,7
    sub    r2, r4
    lea    r6, [r2+r3]
.height_loop:
    movq   mm0, [r4]
    %2     mm1, [r4+8]
    movq   mm2, [r4+r3]
    %2     mm3, [r4+r3+8]
    pavgb  mm0, [r4+r2]
    pavgb  mm1, [r4+r2+8]
    pavgb  mm2, [r4+r6]
    pavgb  mm3, [r4+r6+8]
    lea    r4, [r4+r3*2]
    movq   [r0], mm0
    %2     [r0+8], mm1
    movq   [r0+r1], mm2
    %2     [r0+r1+8], mm3
    lea    r0, [r0+r1*2]
    sub    r5d, 2
    jg     .height_loop
    REP_RET
%endmacro

AVG2_W16 12, movd
AVG2_W16 16, movq

cglobal pixel_avg2_w20_mmx2, 6,7
    sub    r2, r4
    lea    r6, [r2+r3]
.height_loop:
    movq   mm0, [r4]
    movq   mm1, [r4+8]
    movd   mm2, [r4+16]
    movq   mm3, [r4+r3]
    movq   mm4, [r4+r3+8]
    movd   mm5, [r4+r3+16]
    pavgb  mm0, [r4+r2]
    pavgb  mm1, [r4+r2+8]
    pavgb  mm2, [r4+r2+16]
    pavgb  mm3, [r4+r6]
    pavgb  mm4, [r4+r6+8]
    pavgb  mm5, [r4+r6+16]
    lea    r4, [r4+r3*2]
    movq   [r0], mm0
    movq   [r0+8], mm1
    movd   [r0+16], mm2
    movq   [r0+r1], mm3
    movq   [r0+r1+8], mm4
    movd   [r0+r1+16], mm5
    lea    r0, [r0+r1*2]
    sub    r5d, 2
    jg     .height_loop
    REP_RET

cglobal pixel_avg2_w16_sse2, 6,7
    sub    r4, r2
    lea    r6, [r4+r3]
.height_loop:
    movdqu xmm0, [r2]
    movdqu xmm2, [r2+r3]
    movdqu xmm1, [r2+r4]
    movdqu xmm3, [r2+r6]
    lea    r2, [r2+r3*2]
    pavgb  xmm0, xmm1
    pavgb  xmm2, xmm3
    movdqa [r0], xmm0
    movdqa [r0+r1], xmm2
    lea    r0, [r0+r1*2]
    sub    r5d, 2
    jg     .height_loop
    REP_RET

%macro AVG2_W20 1
cglobal pixel_avg2_w20_%1, 6,7
    sub    r2, r4
    lea    r6, [r2+r3]
.height_loop:
    movdqu xmm0, [r4]
    movdqu xmm2, [r4+r3]
%ifidn %1, sse2_misalign
    movd   mm4,  [r4+16]
    movd   mm5,  [r4+r3+16]
    pavgb  xmm0, [r4+r2]
    pavgb  xmm2, [r4+r6]
%else
    movdqu xmm1, [r4+r2]
    movdqu xmm3, [r4+r6]
    movd   mm4,  [r4+16]
    movd   mm5,  [r4+r3+16]
    pavgb  xmm0, xmm1
    pavgb  xmm2, xmm3
%endif
    pavgb  mm4,  [r4+r2+16]
    pavgb  mm5,  [r4+r6+16]
    lea    r4, [r4+r3*2]
    movdqa [r0], xmm0
    movd   [r0+16], mm4
    movdqa [r0+r1], xmm2
    movd   [r0+r1+16], mm5
    lea    r0, [r0+r1*2]
    sub    r5d, 2
    jg     .height_loop
    REP_RET
%endmacro

AVG2_W20 sse2
AVG2_W20 sse2_misalign

; Cacheline split code for processors with high latencies for loads
; split over cache lines.  See sad-a.asm for a more detailed explanation.
; This particular instance is complicated by the fact that src1 and src2
; can have different alignments.  For simplicity and code size, only the
; MMX cacheline workaround is used.  As a result, in the case of SSE2
; pixel_avg, the cacheline check function calls the SSE2 version if there
; is no cacheline split, and the MMX workaround if there is.
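; The check itself is roughly: split if (src & (cacheline-1)) > cacheline -
; width.  The MMX fallback emulates each unaligned load with two aligned
; qword loads merged as (lo >> s) | (hi << (64-s)), where s = 8*(src&7).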

%macro INIT_SHIFT 2
    and    eax, 7
    shl    eax, 3
    movd   %1, [sw_64]
    movd   %2, eax
    psubw  %1, %2
%endmacro

%macro AVG_CACHELINE_START 0
    %assign stack_offset 0
    INIT_SHIFT mm6, mm7
    mov    eax, r4m
    INIT_SHIFT mm4, mm5
    PROLOGUE 6,6
    and    r2, ~7
    and    r4, ~7
    sub    r4, r2
.height_loop:
%endmacro

%macro AVG_CACHELINE_LOOP 2
    movq   mm1, [r2+%1]
    movq   mm0, [r2+8+%1]
    movq   mm3, [r2+r4+%1]
    movq   mm2, [r2+r4+8+%1]
    psrlq  mm1, mm7
    psllq  mm0, mm6
    psrlq  mm3, mm5
    psllq  mm2, mm4
    por    mm0, mm1
    por    mm2, mm3
    pavgb  mm2, mm0
    %2 [r0+%1], mm2
%endmacro

%macro AVG_CACHELINE_FUNC 2
pixel_avg2_w%1_cache_mmx2:
    AVG_CACHELINE_START
    AVG_CACHELINE_LOOP 0, movq
%if %1>8
    AVG_CACHELINE_LOOP 8, movq
%if %1>16
    AVG_CACHELINE_LOOP 16, movd
%endif
%endif
    add    r2, r3
    add    r0, r1
    dec    r5d
    jg .height_loop
    REP_RET
%endmacro

%macro AVG_CACHELINE_CHECK 3 ; width, cacheline, instruction set
%if %1 == 12
; w12 isn't needed because w16 is just as fast if there's no cacheline split
%define cachesplit pixel_avg2_w16_cache_mmx2
%else
%define cachesplit pixel_avg2_w%1_cache_mmx2
%endif
cglobal pixel_avg2_w%1_cache%2_%3
    mov    eax, r2m
    and    eax, %2-1
    cmp    eax, (%2-%1-(%1 % 8))
%if %1==12||%1==20
    jbe pixel_avg2_w%1_%3
%else
    jb pixel_avg2_w%1_%3
%endif
%if 0 ; or %1==8 - but the extra branch seems too expensive
    ja cachesplit
%ifdef ARCH_X86_64
    test      r4b, 1
%else
    test byte r4m, 1
%endif
    jz pixel_avg2_w%1_%3
%else
    or     eax, r4m
    and    eax, 7
    jz pixel_avg2_w%1_%3
    mov    eax, r2m
%endif
%if mmsize==16 || (%1==8 && %2==64)
    AVG_CACHELINE_FUNC %1, %2
%else
    jmp cachesplit
%endif
%endmacro

INIT_MMX
AVG_CACHELINE_CHECK  8, 64, mmx2
AVG_CACHELINE_CHECK 12, 64, mmx2
%ifndef ARCH_X86_64
AVG_CACHELINE_CHECK 16, 64, mmx2
AVG_CACHELINE_CHECK 20, 64, mmx2
AVG_CACHELINE_CHECK  8, 32, mmx2
AVG_CACHELINE_CHECK 12, 32, mmx2
AVG_CACHELINE_CHECK 16, 32, mmx2
AVG_CACHELINE_CHECK 20, 32, mmx2
%endif
INIT_XMM
AVG_CACHELINE_CHECK 16, 64, sse2
AVG_CACHELINE_CHECK 20, 64, sse2

; computed jump assumes this loop is exactly 48 bytes
%macro AVG16_CACHELINE_LOOP_SSSE3 2 ; alignment
ALIGN 16
avg_w16_align%1_%2_ssse3:
%if %1==0 && %2==0
    movdqa  xmm1, [r2]
    pavgb   xmm1, [r2+r4]
    add    r2, r3
%elif %1==0
    movdqa  xmm1, [r2+r4+16]
    palignr xmm1, [r2+r4], %2
    pavgb   xmm1, [r2]
    add    r2, r3
%elif %2&15==0
    movdqa  xmm1, [r2+16]
    palignr xmm1, [r2], %1
    pavgb   xmm1, [r2+r4]
    add    r2, r3
%else
    movdqa  xmm1, [r2+16]
    movdqa  xmm2, [r2+r4+16]
    palignr xmm1, [r2], %1
    palignr xmm2, [r2+r4], %2&15
    add    r2, r3
    pavgb   xmm1, xmm2
%endif
    movdqa  [r0], xmm1
    add    r0, r1
    dec    r5d
    jg     avg_w16_align%1_%2_ssse3
    ret
%if %1==0
    times 13 db 0x90 ; make sure the first ones don't end up short
%endif
%endmacro

cglobal pixel_avg2_w16_cache64_ssse3
%if 0 ; seems both tests aren't worth it if src1%16==0 is optimized
    mov   eax, r2m
    and   eax, 0x3f
    cmp   eax, 0x30
    jb x264_pixel_avg2_w16_sse2
    or    eax, r4m
    and   eax, 7
    jz x264_pixel_avg2_w16_sse2
%endif
    PROLOGUE 6, 7
    lea    r6, [r4+r2]
    and    r4, ~0xf
    and    r6, 0x1f
    and    r2, ~0xf
    lea    r6, [r6*3]    ;(offset + align*2)*3
    sub    r4, r2
    shl    r6, 4         ;jump = (offset + align*2)*48
%define avg_w16_addr avg_w16_align1_1_ssse3-(avg_w16_align2_2_ssse3-avg_w16_align1_1_ssse3)
%ifdef PIC
    lea   r11, [avg_w16_addr]
    add    r6, r11
%else
    lea    r6, [avg_w16_addr + r6]
%endif
%ifdef UNIX64
    jmp    r6
%else
    call   r6
    RET
%endif

%assign j 0
%assign k 1
%rep 16
AVG16_CACHELINE_LOOP_SSSE3 j, j
AVG16_CACHELINE_LOOP_SSSE3 j, k
%assign j j+1
%assign k k+1
%endrep
%endif ; !HIGH_BIT_DEPTH

;=============================================================================
; pixel copy
;=============================================================================

%macro COPY1 2
    movu  m0, [r2]
    movu  m1, [r2+r3]
    movu  m2, [r2+r3*2]
    movu  m3, [r2+%2]
    mova  [r0],      m0
    mova  [r0+r1],   m1
    mova  [r0+r1*2], m2
    mova  [r0+%1],   m3
%endmacro

%macro COPY2 2-4 0, 1
    movu  m0, [r2+%3*mmsize]
    movu  m1, [r2+%4*mmsize]
    movu  m2, [r2+r3+%3*mmsize]
    movu  m3, [r2+r3+%4*mmsize]
    movu  m4, [r2+r3*2+%3*mmsize]
    movu  m5, [r2+r3*2+%4*mmsize]
    movu  m6, [r2+%2+%3*mmsize]
    movu  m7, [r2+%2+%4*mmsize]
    mova  [r0+%3*mmsize],      m0
    mova  [r0+%4*mmsize],      m1
    mova  [r0+r1+%3*mmsize],   m2
    mova  [r0+r1+%4*mmsize],   m3
    mova  [r0+r1*2+%3*mmsize], m4
    mova  [r0+r1*2+%4*mmsize], m5
    mova  [r0+%1+%3*mmsize],   m6
    mova  [r0+%1+%4*mmsize],   m7
%endmacro

%macro COPY4 2
    COPY2 %1, %2, 0, 1
    COPY2 %1, %2, 2, 3
%endmacro

;-----------------------------------------------------------------------------
; void mc_copy_w4( uint8_t *dst, int i_dst_stride,
;                  uint8_t *src, int i_src_stride, int i_height )
;-----------------------------------------------------------------------------
INIT_MMX
cglobal mc_copy_w4_mmx, 4,6
    FIX_STRIDES r1, r3
    cmp dword r4m, 4
    lea     r5, [r3*3]
    lea     r4, [r1*3]
    je .end
%ifndef HIGH_BIT_DEPTH
    %define mova movd
    %define movu movd
%endif
    COPY1   r4, r5
    lea     r2, [r2+r3*4]
    lea     r0, [r0+r1*4]
.end:
    COPY1   r4, r5
    RET

%macro MC_COPY 1
%assign %%w %1*SIZEOF_PIXEL/mmsize
%if %%w > 0
cglobal mc_copy_w%1, 5,7,8*(%%w/2)
    FIX_STRIDES r1, r3
    lea     r6, [r3*3]
    lea     r5, [r1*3]
.height_loop:
    COPY %+ %%w r5, r6
    lea     r2, [r2+r3*4]
    lea     r0, [r0+r1*4]
    sub    r4d, 4
    jg .height_loop
    REP_RET
%endif
%endmacro

INIT_MMX mmx
MC_COPY  8
MC_COPY 16
INIT_XMM sse2
MC_COPY  8
MC_COPY 16
INIT_XMM aligned, sse2
MC_COPY 16



;=============================================================================
; prefetch
;=============================================================================
; assumes 64 byte cachelines
; FIXME doesn't cover all pixels in high depth and/or 4:4:4

;-----------------------------------------------------------------------------
; void prefetch_fenc( pixel *pix_y, int stride_y,
;                     pixel *pix_uv, int stride_uv, int mb_x )
;-----------------------------------------------------------------------------

%macro PREFETCH_FENC 1
%ifdef ARCH_X86_64
cglobal prefetch_fenc_%1, 5,5
    FIX_STRIDES r1d, r3d
    and    r4d, 3
    mov    eax, r4d
    imul   r4d, r1d
    lea    r0,  [r0+r4*4+64*SIZEOF_PIXEL]
    prefetcht0  [r0]
    prefetcht0  [r0+r1]
    lea    r0,  [r0+r1*2]
    prefetcht0  [r0]
    prefetcht0  [r0+r1]

    imul   eax, r3d
    lea    r2,  [r2+rax*2+64*SIZEOF_PIXEL]
    prefetcht0  [r2]
    prefetcht0  [r2+r3]
%ifidn %1, 422
    lea    r2,  [r2+r3*2]
    prefetcht0  [r2]
    prefetcht0  [r2+r3]
%endif
    RET

%else
cglobal prefetch_fenc_%1, 0,3
    mov    r2, r4m
    mov    r1, r1m
    mov    r0, r0m
    FIX_STRIDES r1
    and    r2, 3
    imul   r2, r1
    lea    r0, [r0+r2*4+64*SIZEOF_PIXEL]
    prefetcht0 [r0]
    prefetcht0 [r0+r1]
    lea    r0, [r0+r1*2]
    prefetcht0 [r0]
    prefetcht0 [r0+r1]

    mov    r2, r4m
    mov    r1, r3m
    mov    r0, r2m
    FIX_STRIDES r1
    and    r2, 3
    imul   r2, r1
    lea    r0, [r0+r2*2+64*SIZEOF_PIXEL]
    prefetcht0 [r0]
    prefetcht0 [r0+r1]
%ifidn %1, 422
    lea    r0,  [r0+r1*2]
    prefetcht0  [r0]
    prefetcht0  [r0+r1]
%endif
    ret
%endif ; ARCH_X86_64
%endmacro

INIT_MMX mmx2
PREFETCH_FENC 420
PREFETCH_FENC 422

;-----------------------------------------------------------------------------
; void prefetch_ref( pixel *pix, int stride, int parity )
;-----------------------------------------------------------------------------
INIT_MMX mmx2
cglobal prefetch_ref, 3,3
    FIX_STRIDES r1d
    dec    r2d
    and    r2d, r1d
    lea    r0,  [r0+r2*8+64*SIZEOF_PIXEL]
    lea    r2,  [r1*3]
    prefetcht0  [r0]
    prefetcht0  [r0+r1]
    prefetcht0  [r0+r1*2]
    prefetcht0  [r0+r2]
    lea    r0,  [r0+r1*4]
    prefetcht0  [r0]
    prefetcht0  [r0+r1]
    prefetcht0  [r0+r1*2]
    prefetcht0  [r0+r2]
    RET



;=============================================================================
; chroma MC
;=============================================================================

%ifdef ARCH_X86_64
    DECLARE_REG_TMP 10,11,6
%else
    DECLARE_REG_TMP 0,1,2
%endif

%macro MC_CHROMA_START 0
    movifnidn r3,  r3mp
    movifnidn r4d, r4m
    movifnidn r5d, r5m
    movifnidn t2d, r6m
    mov       t0d, t2d
    mov       t1d, r5d
    sar       t0d, 3
    sar       t1d, 3
    imul      t0d, r4d
    lea       t0d, [t0+t1*2]
    FIX_STRIDES t0d
    movsxdifnidn t0, t0d
    add       r3,  t0            ; src += (dx>>3) + (dy>>3) * src_stride
%endmacro

%ifdef HIGH_BIT_DEPTH
%macro UNPACK_UNALIGNED 4
    movu       %1, [%4+0]
    movu       %2, [%4+4]
    punpckhwd  %3, %1, %2
    punpcklwd  %1, %2
%if mmsize == 8
    mova       %2, %1
    punpcklwd  %1, %3
    punpckhwd  %2, %3
%else
    shufps     %2, %1, %3, q3131
    shufps     %1, %3, q2020
%endif
%endmacro
%else ; !HIGH_BIT_DEPTH
%macro UNPACK_UNALIGNED 3
%if mmsize == 8 || cpuflag(misalign)
    punpcklwd  %1, %3
%else
    movh       %2, %3
    punpcklwd  %1, %2
%endif
%endmacro
%endif ; HIGH_BIT_DEPTH

;-----------------------------------------------------------------------------
; void mc_chroma( uint8_t *dstu, uint8_t *dstv, int dst_stride,
;                 uint8_t *src, int src_stride,
;                 int dx, int dy,
;                 int width, int height )
;-----------------------------------------------------------------------------
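; Standard H.264 1/8-pel bilinear chroma interpolation; per pixel, roughly:
;   d = ( (8-dx)*(8-dy)*A + dx*(8-dy)*B + (8-dx)*dy*C + dx*dy*D + 32 ) >> 6
; where A..D are the four neighbouring source samples and dx,dy are the
; low 3 bits of the MV.  The packed constant built below carries dx, 8-dx,
; dy and 8-dy so the products can be formed with pmullw/pmaddwd.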
%macro MC_CHROMA 0
cglobal mc_chroma, 0,6
    MC_CHROMA_START
    FIX_STRIDES r4
    and       r5d, 7
%ifdef ARCH_X86_64
    jz .mc1dy
%endif
    and       t2d, 7
%ifdef ARCH_X86_64
    jz .mc1dx
%endif
    shl       r5d, 16
    add       t2d, r5d
    mov       t0d, t2d
    shl       t2d, 8
    sub       t2d, t0d
    add       t2d, 0x80008 ; (x<<24) + ((8-x)<<16) + (y<<8) + (8-y)
    cmp dword r7m, 4
%if mmsize==8
.skip_prologue:
%else
    jl mc_chroma_mmx2 %+ .skip_prologue
    WIN64_SPILL_XMM 9
%endif
    movd       m5, t2d
    movifnidn  r0, r0mp
    movifnidn  r1, r1mp
    movifnidn r2d, r2m
    movifnidn r5d, r8m
    pxor       m6, m6
    punpcklbw  m5, m6
%if mmsize==8
    pshufw     m7, m5, q3232
    pshufw     m6, m5, q0000
    pshufw     m5, m5, q1111
    jge .width4
%else
%ifdef WIN64
    cmp dword r7m, 4 ; flags were clobbered by WIN64_SPILL_XMM
%endif
    pshufd     m7, m5, q1111
    punpcklwd  m5, m5
    pshufd     m6, m5, q0000
    pshufd     m5, m5, q1111
    jg .width8
%endif
%ifdef HIGH_BIT_DEPTH
    add        r2, r2
    UNPACK_UNALIGNED m0, m1, m2, r3
%else
    movu       m0, [r3]
    UNPACK_UNALIGNED m0, m1, [r3+2]
    mova       m1, m0
    pand       m0, [pw_00ff]
    psrlw      m1, 8
%endif ; HIGH_BIT_DEPTH
    pmaddwd    m0, m7
    pmaddwd    m1, m7
    packssdw   m0, m1
    SWAP        3, 0
ALIGN 4
.loop2:
%ifdef HIGH_BIT_DEPTH
    UNPACK_UNALIGNED m0, m1, m2, r3+r4
    pmullw     m3, m6
%else ; !HIGH_BIT_DEPTH
    movu       m0, [r3+r4]
    UNPACK_UNALIGNED m0, m1, [r3+r4+2]
    pmullw     m3, m6
    mova       m1, m0
    pand       m0, [pw_00ff]
    psrlw      m1, 8
%endif ; HIGH_BIT_DEPTH
    pmaddwd    m0, m7
    pmaddwd    m1, m7
    mova       m2, [pw_32]
    packssdw   m0, m1
    paddw      m2, m3
    mova       m3, m0
    pmullw     m0, m5
    paddw      m0, m2
    psrlw      m0, 6
%ifdef HIGH_BIT_DEPTH
    movh     [r0], m0
%if mmsize == 8
    psrlq      m0, 32
    movh     [r1], m0
%else
    movhps   [r1], m0
%endif
%else ; !HIGH_BIT_DEPTH
    packuswb   m0, m0
    movd     [r0], m0
%if mmsize==8
    psrlq      m0, 16
%else
    psrldq     m0, 4
%endif
    movd     [r1], m0
%endif ; HIGH_BIT_DEPTH
    add        r3, r4
    add        r0, r2
    add        r1, r2
    dec       r5d
    jg .loop2
    REP_RET

%if mmsize==8
.width4:
%ifdef ARCH_X86_64
    mov        t0, r0
    mov        t1, r1
    mov        t2, r3
    %define multy0 [rsp-8]
    mova    multy0, m5
%else
    mov       r3m, r3
    %define multy0 r4m
    mova    multy0, m5
%endif
%else
.width8:
%ifdef ARCH_X86_64
    %define multy0 m8
    SWAP        8, 5
%else
    %define multy0 r0m
    mova    multy0, m5
%endif
%endif
    FIX_STRIDES r2
.loopx:
%ifdef HIGH_BIT_DEPTH
    UNPACK_UNALIGNED m0, m2, m4, r3
    UNPACK_UNALIGNED m1, m3, m5, r3+mmsize
%else
    movu       m0, [r3]
    movu       m1, [r3+mmsize/2]
    UNPACK_UNALIGNED m0, m2, [r3+2]
    UNPACK_UNALIGNED m1, m3, [r3+2+mmsize/2]
    psrlw      m2, m0, 8
    psrlw      m3, m1, 8
    pand       m0, [pw_00ff]
    pand       m1, [pw_00ff]
%endif
    pmaddwd    m0, m7
    pmaddwd    m2, m7
    pmaddwd    m1, m7
    pmaddwd    m3, m7
    packssdw   m0, m2
    packssdw   m1, m3
    SWAP        4, 0
    SWAP        5, 1
    add        r3, r4
ALIGN 4
.loop4:
%ifdef HIGH_BIT_DEPTH
    UNPACK_UNALIGNED m0, m1, m2, r3
    pmaddwd    m0, m7
    pmaddwd    m1, m7
    packssdw   m0, m1
    UNPACK_UNALIGNED m1, m2, m3, r3+mmsize
    pmaddwd    m1, m7
    pmaddwd    m2, m7
    packssdw   m1, m2
%else ; !HIGH_BIT_DEPTH
    movu       m0, [r3]
    movu       m1, [r3+mmsize/2]
    UNPACK_UNALIGNED m0, m2, [r3+2]
    UNPACK_UNALIGNED m1, m3, [r3+2+mmsize/2]
    psrlw      m2, m0, 8
    psrlw      m3, m1, 8
    pand       m0, [pw_00ff]
    pand       m1, [pw_00ff]
    pmaddwd    m0, m7
    pmaddwd    m2, m7
    pmaddwd    m1, m7
    pmaddwd    m3, m7
    packssdw   m0, m2
    packssdw   m1, m3
%endif ; HIGH_BIT_DEPTH
    pmullw     m4, m6
    pmullw     m5, m6
    mova       m2, [pw_32]
    paddw      m3, m2, m5
    paddw      m2, m4
    mova       m4, m0
    mova       m5, m1
    pmullw     m0, multy0
    pmullw     m1, multy0
    paddw      m0, m2
    paddw      m1, m3
    psrlw      m0, 6
    psrlw      m1, 6
%ifdef HIGH_BIT_DEPTH
    movh     [r0], m0
    movh     [r0+mmsize/2], m1
%if mmsize==8
    psrlq      m0, 32
    psrlq      m1, 32
    movh     [r1], m0
    movh     [r1+mmsize/2], m1
%else
    movhps   [r1], m0
    movhps   [r1+mmsize/2], m1
%endif
%else ; !HIGH_BIT_DEPTH
    packuswb   m0, m1
%if mmsize==8
    pshufw     m1, m0, q0020
    pshufw     m0, m0, q0031
    movd     [r0], m1
    movd     [r1], m0
%else
    pshufd     m0, m0, q3120
    movq     [r0], m0
    movhps   [r1], m0
%endif
%endif ; HIGH_BIT_DEPTH
    add        r3, r4
    add        r0, r2
    add        r1, r2
    dec       r5d
    jg .loop4
%if mmsize!=8
    REP_RET
%else
    sub dword r7m, 4
    jg .width8
    REP_RET
.width8:
%ifdef ARCH_X86_64
    lea        r3, [t2+8*SIZEOF_PIXEL]
    lea        r0, [t0+4*SIZEOF_PIXEL]
    lea        r1, [t1+4*SIZEOF_PIXEL]
%else
    mov        r3, r3m
    mov        r0, r0m
    mov        r1, r1m
    add        r3, 8*SIZEOF_PIXEL
    add        r0, 4*SIZEOF_PIXEL
    add        r1, 4*SIZEOF_PIXEL
%endif
    mov       r5d, r8m
    jmp .loopx
%endif

%ifdef ARCH_X86_64 ; too many regs for x86_32
    RESET_MM_PERMUTATION
%ifdef WIN64
%if xmm_regs_used > 6
    %assign stack_offset stack_offset-(xmm_regs_used-6)*16-16
    %assign xmm_regs_used 6
%endif
%endif
.mc1dy:
    and       t2d, 7
    movd       m5, t2d
    mov       r6d, r4d ; pel_offset = dx ? 2 : src_stride
    jmp .mc1d
.mc1dx:
    movd       m5, r5d
    mov       r6d, 2*SIZEOF_PIXEL
.mc1d:
%ifdef HIGH_BIT_DEPTH
%if mmsize == 16
    WIN64_SPILL_XMM 8
%endif
%endif
    mova       m4, [pw_8]
    SPLATW     m5, m5
    psubw      m4, m5
    movifnidn  r0, r0mp
    movifnidn  r1, r1mp
    movifnidn r2d, r2m
    FIX_STRIDES r2
    movifnidn r5d, r8m
    cmp dword r7m, 4
    jg .mc1d_w8
    mov       r10, r2
    mov       r11, r4
%if mmsize!=8
    shr       r5d, 1
%endif
.loop1d_w4:
%ifdef HIGH_BIT_DEPTH
%if mmsize == 8
    movq       m0, [r3+0]
    movq       m2, [r3+8]
    movq       m1, [r3+r6+0]
    movq       m3, [r3+r6+8]
%else
    movu       m0, [r3]
    movu       m1, [r3+r6]
    add        r3, r11
    movu       m2, [r3]
    movu       m3, [r3+r6]
%endif
    SBUTTERFLY wd, 0, 2, 6
    SBUTTERFLY wd, 1, 3, 7
    SBUTTERFLY wd, 0, 2, 6
    SBUTTERFLY wd, 1, 3, 7
%if mmsize == 16
    SBUTTERFLY wd, 0, 2, 6
    SBUTTERFLY wd, 1, 3, 7
%endif
%else ; !HIGH_BIT_DEPTH
    movq       m0, [r3]
    movq       m1, [r3+r6]
%if mmsize!=8
    add        r3, r11
    movhps     m0, [r3]
    movhps     m1, [r3+r6]
%endif
    psrlw      m2, m0, 8
    psrlw      m3, m1, 8
    pand       m0, [pw_00ff]
    pand       m1, [pw_00ff]
%endif ; HIGH_BIT_DEPTH
    pmullw     m0, m4
    pmullw     m1, m5
    pmullw     m2, m4
    pmullw     m3, m5
    paddw      m0, [pw_4]
    paddw      m2, [pw_4]
    paddw      m0, m1
    paddw      m2, m3
    psrlw      m0, 3
    psrlw      m2, 3
%ifdef HIGH_BIT_DEPTH
%if mmsize == 8
    xchg       r4, r11
    xchg       r2, r10
%endif
    movq     [r0], m0
    movq     [r1], m2
%if mmsize == 16
    add        r0, r10
    add        r1, r10
    movhps   [r0], m0
    movhps   [r1], m2
%endif
%else ; !HIGH_BIT_DEPTH
    packuswb   m0, m2
%if mmsize==8
    xchg       r4, r11
    xchg       r2, r10
    movd     [r0], m0
    psrlq      m0, 32
    movd     [r1], m0
%else
    movhlps    m1, m0
    movd     [r0], m0
    movd     [r1], m1
    add        r0, r10
    add        r1, r10
    psrldq     m0, 4
    psrldq     m1, 4
    movd     [r0], m0
    movd     [r1], m1
%endif
%endif ; HIGH_BIT_DEPTH
    add        r3, r4
    add        r0, r2
    add        r1, r2
    dec       r5d
    jg .loop1d_w4
    REP_RET
.mc1d_w8:
    sub       r2, 4*SIZEOF_PIXEL
    sub       r4, 8*SIZEOF_PIXEL
    mov      r10, 4*SIZEOF_PIXEL
    mov      r11, 8*SIZEOF_PIXEL
%if mmsize==8
    shl       r5d, 1
%endif
    jmp .loop1d_w4
%endif ; ARCH_X86_64
%endmacro ; MC_CHROMA


%macro MC_CHROMA_SSSE3 0
cglobal mc_chroma, 0,6,9
    MC_CHROMA_START
    and       r5d, 7
    and       t2d, 7
    mov       t0d, r5d
    shl       t0d, 8
    sub       t0d, r5d
    mov       r5d, 8
    add       t0d, 8
    sub       r5d, t2d
    imul      t2d, t0d ; (x*255+8)*y
    imul      r5d, t0d ; (x*255+8)*(8-y)
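    ; x*255+8 == (x<<8) + (8-x), so each product above packs the byte pair
    ; ( (8-x)*y, x*y ) into one word: exactly the two weights pmaddubsw
    ; needs for each (pix0,pix1) pair interleaved by ch_shuf.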
    movd       m6, t2d
    movd       m7, r5d
%if cpuflag(cache64)
    mov       t0d, r3d
    and       t0d, 7
%ifdef PIC
    lea        t1, [ch_shuf_adj]
    movddup    m5, [t1 + t0*4]
%else
    movddup    m5, [ch_shuf_adj + t0*4]
%endif
    paddb      m5, [ch_shuf]
    and        r3, ~7
%else
    mova       m5, [ch_shuf]
%endif
    movifnidn  r0, r0mp
    movifnidn  r1, r1mp
    movifnidn r2d, r2m
    movifnidn r5d, r8m
    SPLATW     m6, m6
    SPLATW     m7, m7
    cmp dword r7m, 4
    jg .width8
    movu       m0, [r3]
    pshufb     m0, m5
.loop4:
    movu       m1, [r3+r4]
    pshufb     m1, m5
    movu       m3, [r3+r4*2]
    pshufb     m3, m5
    mova       m4, m3
    pmaddubsw  m0, m7
    pmaddubsw  m2, m1, m7
    pmaddubsw  m1, m6
    pmaddubsw  m3, m6
    paddw      m0, [pw_32]
    paddw      m2, [pw_32]
    paddw      m1, m0
    paddw      m3, m2
    mova       m0, m4
    psrlw      m1, 6
    psrlw      m3, 6
    packuswb   m1, m3
    movhlps    m3, m1
    movd     [r0], m1
    movd  [r0+r2], m3
    psrldq     m1, 4
    psrldq     m3, 4
    movd     [r1], m1
    movd  [r1+r2], m3
    lea        r3, [r3+r4*2]
    lea        r0, [r0+r2*2]
    lea        r1, [r1+r2*2]
    sub       r5d, 2
    jg .loop4
    REP_RET

.width8:
    movu       m0, [r3]
    pshufb     m0, m5
    movu       m1, [r3+8]
    pshufb     m1, m5
%ifdef ARCH_X86_64
    SWAP        8, 6
    %define  mult1 m8
%else
    mova      r0m, m6
    %define  mult1 r0m
%endif
.loop8:
    movu       m2, [r3+r4]
    pshufb     m2, m5
    movu       m3, [r3+r4+8]
    pshufb     m3, m5
    mova       m4, m2
    mova       m6, m3
    pmaddubsw  m0, m7
    pmaddubsw  m1, m7
    pmaddubsw  m2, mult1
    pmaddubsw  m3, mult1
    paddw      m0, [pw_32]
    paddw      m1, [pw_32]
    paddw      m0, m2
    paddw      m1, m3
    psrlw      m0, 6
    psrlw      m1, 6
    packuswb   m0, m1
    pshufd     m0, m0, q3120
    movq     [r0], m0
    movhps   [r1], m0

    movu       m2, [r3+r4*2]
    pshufb     m2, m5
    movu       m3, [r3+r4*2+8]
    pshufb     m3, m5
    mova       m0, m2
    mova       m1, m3
    pmaddubsw  m4, m7
    pmaddubsw  m6, m7
    pmaddubsw  m2, mult1
    pmaddubsw  m3, mult1
    paddw      m4, [pw_32]
    paddw      m6, [pw_32]
    paddw      m2, m4
    paddw      m3, m6
    psrlw      m2, 6
    psrlw      m3, 6
    packuswb   m2, m3
    pshufd     m2, m2, q3120
    movq   [r0+r2], m2
    movhps [r1+r2], m2
    lea        r3, [r3+r4*2]
    lea        r0, [r0+r2*2]
    lea        r1, [r1+r2*2]
    sub       r5d, 2
    jg .loop8
    REP_RET
%endmacro
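; Note on the cache64 variant: ch_shuf_adj biases the pshufb indices by the
; source address' low 3 bits so that r3 can be rounded down to 8-byte
; alignment (and r3, ~7), trading a shuffle tweak for fewer cacheline-split
; loads on CPUs that report CPU_CACHELINE_64.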

%ifdef HIGH_BIT_DEPTH
INIT_MMX mmx2
MC_CHROMA
INIT_XMM sse2
MC_CHROMA
INIT_XMM avx
MC_CHROMA
%else ; !HIGH_BIT_DEPTH
INIT_MMX mmx2
MC_CHROMA
INIT_XMM sse2, misalign
MC_CHROMA
INIT_XMM sse2
MC_CHROMA
INIT_XMM ssse3
MC_CHROMA_SSSE3
INIT_XMM ssse3, cache64
MC_CHROMA_SSSE3
INIT_XMM avx
MC_CHROMA_SSSE3 ; No known AVX CPU will trigger CPU_CACHELINE_64
%endif ; HIGH_BIT_DEPTH
x264-snapshot-20120103-2245-stable/common/x86/deblock-a.asm
;*****************************************************************************
;* deblock-a.asm: x86 deblocking
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Jason Garrett-Glaser <darkshikari@gmail.com>
;*          Oskar Arvidsson <oskar@irock.se>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA

transpose_shuf: db 0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15

SECTION .text

cextern pb_0
cextern pb_1
cextern pb_3
cextern pb_a1
cextern pw_2
cextern pw_4
cextern pw_00ff
cextern pw_pixel_max

%ifdef HIGH_BIT_DEPTH
; out: %4 = |%1-%2|-%3
; clobbers: %5
%macro ABS_SUB 5
    psubusw %5, %2, %1
    psubusw %4, %1, %2
    por     %4, %5
    psubw   %4, %3
%endmacro

; out: %4 = |%1-%2|<%3
%macro DIFF_LT   5
    psubusw %4, %2, %1
    psubusw %5, %1, %2
    por     %5, %4 ; |%1-%2|
    pxor    %4, %4
    psubw   %5, %3 ; |%1-%2|-%3
    pcmpgtw %4, %5 ; 0 > |%1-%2|-%3
%endmacro

%macro LOAD_AB 4
    movd       %1, %3
    movd       %2, %4
    SPLATW     %1, %1
    SPLATW     %2, %2
%endmacro

; in:  %2=tc reg
; out: %1=splatted tc
%macro LOAD_TC 2
%if mmsize == 8
    pshufw      %1, [%2-1], 0
%else
    movd        %1, [%2]
    punpcklbw   %1, %1
    pshuflw     %1, %1, q1100
    pshufd      %1, %1, q1100
%endif
    psraw       %1, 8
%endmacro
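; (each tc0 byte covers 4 pixels: the punpck+pshuf pattern replicates tc0[i]
; into 4 adjacent words, and psraw 8 sign-extends the bytes so that negative
; tc0 values -- "don't filter" -- survive into the word mask)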

; in: %1=p1, %2=p0, %3=q0, %4=q1
;     %5=alpha, %6=beta, %7-%9=tmp
; out: %7=mask
%macro LOAD_MASK 9
    ABS_SUB     %2, %3, %5, %8, %7 ; |p0-q0| - alpha
    ABS_SUB     %1, %2, %6, %9, %7 ; |p1-p0| - beta
    pand        %8, %9
    ABS_SUB     %3, %4, %6, %9, %7 ; |q1-q0| - beta
    pxor        %7, %7
    pand        %8, %9
    pcmpgtw     %7, %8
%endmacro

; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
; out: %1=p0', %2=q0'
%macro DEBLOCK_P0_Q0 7
    psubw   %3, %4
    pxor    %7, %7
    paddw   %3, [pw_4]
    psubw   %7, %5
    psubw   %6, %2, %1
    psllw   %6, 2
    paddw   %3, %6
    psraw   %3, 3
    mova    %6, [pw_pixel_max]
    CLIPW   %3, %7, %5
    pxor    %7, %7
    paddw   %1, %3
    psubw   %2, %3
    CLIPW   %1, %7, %6
    CLIPW   %2, %7, %6
%endmacro
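; Net per-lane effect (the H.264 normal-filter delta; %5 holds tc, already
; zeroed where the edge is masked off, so those lanes come out unchanged):
;   delta = clip3( -tc, tc, ((q0-p0)*4 + (p1-q1) + 4) >> 3 )
;   p0'   = clip( p0 + delta, 0, pixel_max )
;   q0'   = clip( q0 - delta, 0, pixel_max )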

; in: %1=x2, %2=x1, %3=p0, %4=q0, %5=mask&tc, %6=tmp
%macro LUMA_Q1 6
    pavgw       %6, %3, %4      ; (p0+q0+1)>>1
    paddw       %1, %6
    pxor        %6, %6
    psraw       %1, 1
    psubw       %6, %5
    psubw       %1, %2
    CLIPW       %1, %6, %5
    paddw       %1, %2
%endmacro
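; net: %1 = clip3( %2-tc, %2+tc, (x2 + ((p0+q0+1)>>1)) >> 1 ) with tc = %5;
; tc is zero in masked-off lanes, so those come out equal to %2.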

%macro LUMA_DEBLOCK_ONE 3
    DIFF_LT     m5, %1, bm, m4, m6
    pxor        m6, m6
    mova        %3, m4
    pcmpgtw     m6, tcm
    pand        m4, tcm
    pandn       m6, m7
    pand        m4, m6
    LUMA_Q1 m5, %2, m1, m2, m4, m6
%endmacro

%macro LUMA_H_STORE 2
%if mmsize == 8
    movq        [r0-4], m0
    movq        [r0+r1-4], m1
    movq        [r0+r1*2-4], m2
    movq        [r0+%2-4], m3
%else
    movq        [r0-4], m0
    movhps      [r0+r1-4], m0
    movq        [r0+r1*2-4], m1
    movhps      [%1-4], m1
    movq        [%1+r1-4], m2
    movhps      [%1+r1*2-4], m2
    movq        [%1+%2-4], m3
    movhps      [%1+r1*4-4], m3
%endif
%endmacro

%macro DEBLOCK_LUMA 0
;-----------------------------------------------------------------------------
; void deblock_v_luma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_v_luma, 5,5,8
    %assign pad 5*mmsize+12-(stack_offset&15)
    %define tcm [rsp]
    %define ms1 [rsp+mmsize]
    %define ms2 [rsp+mmsize*2]
    %define am  [rsp+mmsize*3]
    %define bm  [rsp+mmsize*4]
    SUB        rsp, pad
    add         r1, r1
    LOAD_AB     m4, m5, r2, r3
    mov         r3, 32/mmsize
    mov         r2, r0
    sub         r0, r1
    mova        am, m4
    sub         r0, r1
    mova        bm, m5
    sub         r0, r1
.loop:
    mova        m0, [r0+r1]
    mova        m1, [r0+r1*2]
    mova        m2, [r2]
    mova        m3, [r2+r1]

    LOAD_MASK   m0, m1, m2, m3, am, bm, m7, m4, m6
    LOAD_TC     m6, r4
    mova       tcm, m6

    mova        m5, [r0]
    LUMA_DEBLOCK_ONE m1, m0, ms1
    mova   [r0+r1], m5

    mova        m5, [r2+r1*2]
    LUMA_DEBLOCK_ONE m2, m3, ms2
    mova   [r2+r1], m5

    pxor        m5, m5
    mova        m6, tcm
    pcmpgtw     m5, tcm
    psubw       m6, ms1
    pandn       m5, m7
    psubw       m6, ms2
    pand        m5, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6
    mova [r0+r1*2], m1
    mova      [r2], m2

    add         r0, mmsize
    add         r2, mmsize
    add         r4, mmsize/8
    dec         r3
    jg .loop
    ADD         rsp, pad
    RET

cglobal deblock_h_luma, 5,6,8
    %assign pad 7*mmsize+12-(stack_offset&15)
    %define tcm [rsp]
    %define ms1 [rsp+mmsize]
    %define ms2 [rsp+mmsize*2]
    %define p1m [rsp+mmsize*3]
    %define p2m [rsp+mmsize*4]
    %define am  [rsp+mmsize*5]
    %define bm  [rsp+mmsize*6]
    SUB        rsp, pad
    add         r1, r1
    LOAD_AB     m4, m5, r2, r3
    mov         r3, r1
    mova        am, m4
    add         r3, r1
    mov         r5, 32/mmsize
    mova        bm, m5
    add         r3, r1
%if mmsize == 16
    mov         r2, r0
    add         r2, r3
%endif
.loop:
%if mmsize == 8
    movq        m2, [r0-8]     ; y q2 q1 q0
    movq        m7, [r0+0]
    movq        m5, [r0+r1-8]
    movq        m3, [r0+r1+0]
    movq        m0, [r0+r1*2-8]
    movq        m6, [r0+r1*2+0]
    movq        m1, [r0+r3-8]
    TRANSPOSE4x4W 2, 5, 0, 1, 4
    SWAP         2, 7
    movq        m7, [r0+r3]
    TRANSPOSE4x4W 2, 3, 6, 7, 4
%else
    movu        m5, [r0-8]     ; y q2 q1 q0 p0 p1 p2 x
    movu        m0, [r0+r1-8]
    movu        m2, [r0+r1*2-8]
    movu        m3, [r2-8]
    TRANSPOSE4x4W 5, 0, 2, 3, 6
    mova       tcm, m3

    movu        m4, [r2+r1-8]
    movu        m1, [r2+r1*2-8]
    movu        m3, [r2+r3-8]
    movu        m7, [r2+r1*4-8]
    TRANSPOSE4x4W 4, 1, 3, 7, 6

    mova        m6, tcm
    punpcklqdq  m6, m7
    punpckhqdq  m5, m4
    SBUTTERFLY qdq, 0, 1, 7
    SBUTTERFLY qdq, 2, 3, 7
%endif

    mova       p2m, m6
    LOAD_MASK   m0, m1, m2, m3, am, bm, m7, m4, m6
    LOAD_TC     m6, r4
    mova       tcm, m6

    LUMA_DEBLOCK_ONE m1, m0, ms1
    mova       p1m, m5

    mova        m5, p2m
    LUMA_DEBLOCK_ONE m2, m3, ms2
    mova       p2m, m5

    pxor        m5, m5
    mova        m6, tcm
    pcmpgtw     m5, tcm
    psubw       m6, ms1
    pandn       m5, m7
    psubw       m6, ms2
    pand        m5, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6
    mova        m0, p1m
    mova        m3, p2m
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    LUMA_H_STORE r2, r3

    add         r4, mmsize/8
    lea         r0, [r0+r1*(mmsize/2)]
    lea         r2, [r2+r1*(mmsize/2)]
    dec         r5
    jg .loop
    ADD        rsp, pad
    RET
%endmacro

%ifdef ARCH_X86_64
; in:  m0=p1, m1=p0, m2=q0, m3=q1, m8=p2, m9=q2
;      m12=alpha, m13=beta
; out: m0=p1', m3=q1', m1=p0', m2=q0'
; clobbers: m4, m5, m6, m7, m10, m11, m14
%macro DEBLOCK_LUMA_INTER_SSE2 0
    LOAD_MASK   m0, m1, m2, m3, m12, m13, m7, m4, m6
    LOAD_TC     m6, r4
    DIFF_LT     m8, m1, m13, m10, m4
    DIFF_LT     m9, m2, m13, m11, m4
    pand        m6, m7

    mova       m14, m6
    pxor        m4, m4
    pcmpgtw     m6, m4
    pand        m6, m14

    mova        m5, m10
    pand        m5, m6
    LUMA_Q1 m8, m0, m1, m2, m5, m4

    mova        m5, m11
    pand        m5, m6
    LUMA_Q1 m9, m3, m1, m2, m5, m4

    pxor        m4, m4
    psubw       m6, m10
    pcmpgtw     m4, m14
    pandn       m4, m7
    psubw       m6, m11
    pand        m4, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m4, m5, m6

    SWAP         0, 8
    SWAP         3, 9
%endmacro

%macro DEBLOCK_LUMA_64 0
cglobal deblock_v_luma, 5,5,15
    %define p2 m8
    %define p1 m0
    %define p0 m1
    %define q0 m2
    %define q1 m3
    %define q2 m9
    %define mask0 m7
    %define mask1 m10
    %define mask2 m11
    add         r1, r1
    LOAD_AB    m12, m13, r2, r3
    mov         r2, r0
    sub         r0, r1
    sub         r0, r1
    sub         r0, r1
    mov         r3, 2
.loop:
    mova        p2, [r0]
    mova        p1, [r0+r1]
    mova        p0, [r0+r1*2]
    mova        q0, [r2]
    mova        q1, [r2+r1]
    mova        q2, [r2+r1*2]
    DEBLOCK_LUMA_INTER_SSE2
    mova   [r0+r1], p1
    mova [r0+r1*2], p0
    mova      [r2], q0
    mova   [r2+r1], q1
    add         r0, mmsize
    add         r2, mmsize
    add         r4, 2
    dec         r3
    jg .loop
    REP_RET

cglobal deblock_h_luma, 5,7,15
    add         r1, r1
    LOAD_AB    m12, m13, r2, r3
    mov         r2, r1
    add         r2, r1
    add         r2, r1
    mov         r5, r0
    add         r5, r2
    mov         r6, 2
.loop:
    movu        m8, [r0-8]     ; y q2 q1 q0 p0 p1 p2 x
    movu        m0, [r0+r1-8]
    movu        m2, [r0+r1*2-8]
    movu        m9, [r5-8]
    movu        m5, [r5+r1-8]
    movu        m1, [r5+r1*2-8]
    movu        m3, [r5+r2-8]
    movu        m7, [r5+r1*4-8]

    TRANSPOSE4x4W 8, 0, 2, 9, 10
    TRANSPOSE4x4W 5, 1, 3, 7, 10

    punpckhqdq  m8, m5
    SBUTTERFLY qdq, 0, 1, 10
    SBUTTERFLY qdq, 2, 3, 10
    punpcklqdq  m9, m7

    DEBLOCK_LUMA_INTER_SSE2

    TRANSPOSE4x4W 0, 1, 2, 3, 4
    LUMA_H_STORE r5, r2
    add         r4, 2
    lea         r0, [r0+r1*8]
    lea         r5, [r5+r1*8]
    dec         r6
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
DEBLOCK_LUMA_64
INIT_XMM avx
DEBLOCK_LUMA_64
%endif

%macro SWAPMOVA 2
%ifid %1
    SWAP %1, %2
%else
    mova %1, %2
%endif
%endmacro

; in: t0-t2: tmp registers
;     %1=p0 %2=p1 %3=p2 %4=p3 %5=q0 %6=q1 %7=mask0
;     %8=mask1p %9=2 %10=p0' %11=p1' %12=p2'
%macro LUMA_INTRA_P012 12 ; p0..p3 in memory
%ifdef ARCH_X86_64
    paddw     t0, %3, %2
    mova      t2, %4
    paddw     t2, %3
%else
    mova      t0, %3
    mova      t2, %4
    paddw     t0, %2
    paddw     t2, %3
%endif
    paddw     t0, %1
    paddw     t2, t2
    paddw     t0, %5
    paddw     t2, %9
    paddw     t0, %9    ; (p2 + p1 + p0 + q0 + 2)
    paddw     t2, t0    ; (2*p3 + 3*p2 + p1 + p0 + q0 + 4)

    psrlw     t2, 3
    psrlw     t1, t0, 2
    psubw     t2, %3
    psubw     t1, %2
    pand      t2, %8
    pand      t1, %8
    paddw     t2, %3
    paddw     t1, %2
    SWAPMOVA %11, t1

    psubw     t1, t0, %3
    paddw     t0, t0
    psubw     t1, %5
    psubw     t0, %3
    paddw     t1, %6
    paddw     t1, %2
    paddw     t0, %6
    psrlw     t1, 2     ; (2*p1 + p0 + q1 + 2)/4
    psrlw     t0, 3     ; (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4)>>3

    pxor      t0, t1
    pxor      t1, %1
    pand      t0, %8
    pand      t1, %7
    pxor      t0, t1
    pxor      t0, %1
    SWAPMOVA %10, t0
    SWAPMOVA %12, t2
%endmacro
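; Net effect where mask1p (%8) is set -- the H.264 strong intra filter:
;   p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
;   p1' = (p2 + p1 + p0 + q0 + 2) >> 2
;   p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
; where only mask0 (%7) is set, p0' falls back to (2*p1 + p0 + q1 + 2) >> 2.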

%macro LUMA_INTRA_INIT 1
    %xdefine pad %1*mmsize+((gprsize*3) % mmsize)-(stack_offset&15)
    %define t0 m4
    %define t1 m5
    %define t2 m6
    %define t3 m7
    %assign i 4
%rep %1
    CAT_XDEFINE t, i, [rsp+mmsize*(i-4)]
    %assign i i+1
%endrep
    SUB    rsp, pad
    add     r1, r1
%endmacro

; in: %1-%3=tmp, %4=p2, %5=q2
%macro LUMA_INTRA_INTER 5
    LOAD_AB t0, t1, r2d, r3d
    mova    %1, t0
    LOAD_MASK m0, m1, m2, m3, %1, t1, t0, t2, t3
%ifdef ARCH_X86_64
    mova    %2, t0        ; mask0
    psrlw   t3, %1, 2
%else
    mova    t3, %1
    mova    %2, t0        ; mask0
    psrlw   t3, 2
%endif
    paddw   t3, [pw_2]    ; alpha/4+2
    DIFF_LT m1, m2, t3, t2, t0 ; t2 = |p0-q0| < alpha/4+2
    pand    t2, %2
    mova    t3, %5        ; q2
    mova    %1, t2        ; mask1
    DIFF_LT t3, m2, t1, t2, t0 ; t2 = |q2-q0| < beta
    pand    t2, %1
    mova    t3, %4        ; p2
    mova    %3, t2        ; mask1q
    DIFF_LT t3, m1, t1, t2, t0 ; t2 = |p2-p0| < beta
    pand    t2, %1
    mova    %1, t2        ; mask1p
%endmacro

%macro LUMA_H_INTRA_LOAD 0
%if mmsize == 8
    movu    t0, [r0-8]
    movu    t1, [r0+r1-8]
    movu    m0, [r0+r1*2-8]
    movu    m1, [r0+r4-8]
    TRANSPOSE4x4W 4, 5, 0, 1, 2
    mova    t4, t0        ; p3
    mova    t5, t1        ; p2

    movu    m2, [r0]
    movu    m3, [r0+r1]
    movu    t0, [r0+r1*2]
    movu    t1, [r0+r4]
    TRANSPOSE4x4W 2, 3, 4, 5, 6
    mova    t6, t0        ; q2
    mova    t7, t1        ; q3
%else
    movu    t0, [r0-8]
    movu    t1, [r0+r1-8]
    movu    m0, [r0+r1*2-8]
    movu    m1, [r0+r5-8]
    movu    m2, [r4-8]
    movu    m3, [r4+r1-8]
    movu    t2, [r4+r1*2-8]
    movu    t3, [r4+r5-8]
    TRANSPOSE8x8W 4, 5, 0, 1, 2, 3, 6, 7, t4, t5
    mova    t4, t0        ; p3
    mova    t5, t1        ; p2
    mova    t6, t2        ; q2
    mova    t7, t3        ; q3
%endif
%endmacro

; in: %1=q3 %2=q2' %3=q1' %4=q0' %5=p0' %6=p1' %7=p2' %8=p3 %9=tmp
%macro LUMA_H_INTRA_STORE 9
%if mmsize == 8
    TRANSPOSE4x4W %1, %2, %3, %4, %9
    movq       [r0-8], m%1
    movq       [r0+r1-8], m%2
    movq       [r0+r1*2-8], m%3
    movq       [r0+r4-8], m%4
    movq       m%1, %8
    TRANSPOSE4x4W %5, %6, %7, %1, %9
    movq       [r0], m%5
    movq       [r0+r1], m%6
    movq       [r0+r1*2], m%7
    movq       [r0+r4], m%1
%else
    TRANSPOSE2x4x4W %1, %2, %3, %4, %9
    movq       [r0-8], m%1
    movq       [r0+r1-8], m%2
    movq       [r0+r1*2-8], m%3
    movq       [r0+r5-8], m%4
    movhps     [r4-8], m%1
    movhps     [r4+r1-8], m%2
    movhps     [r4+r1*2-8], m%3
    movhps     [r4+r5-8], m%4
%ifnum %8
    SWAP       %1, %8
%else
    mova       m%1, %8
%endif
    TRANSPOSE2x4x4W %5, %6, %7, %1, %9
    movq       [r0], m%5
    movq       [r0+r1], m%6
    movq       [r0+r1*2], m%7
    movq       [r0+r5], m%1
    movhps     [r4], m%5
    movhps     [r4+r1], m%6
    movhps     [r4+r1*2], m%7
    movhps     [r4+r5], m%1
%endif
%endmacro

%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
%macro DEBLOCK_LUMA_INTRA_64 0
cglobal deblock_v_luma_intra, 4,7,16
    %define t0 m1
    %define t1 m2
    %define t2 m4
    %define p2 m8
    %define p1 m9
    %define p0 m10
    %define q0 m11
    %define q1 m12
    %define q2 m13
    %define aa m5
    %define bb m14
    add     r1, r1
    lea     r4, [r1*4]
    lea     r5, [r1*3] ; 3*stride
    neg     r4
    add     r4, r0     ; pix-4*stride
    mov     r6, 2
    mova    m0, [pw_2]
    LOAD_AB aa, bb, r2d, r3d
.loop:
    mova    p2, [r4+r1]
    mova    p1, [r4+2*r1]
    mova    p0, [r4+r5]
    mova    q0, [r0]
    mova    q1, [r0+r1]
    mova    q2, [r0+2*r1]

    LOAD_MASK p1, p0, q0, q1, aa, bb, m3, t0, t1
    mova    t2, aa
    psrlw   t2, 2
    paddw   t2, m0 ; alpha/4+2
    DIFF_LT p0, q0, t2, m6, t0 ; m6 = |p0-q0| < alpha/4+2
    DIFF_LT p2, p0, bb, t1, t0 ; t1 = |p2-p0| < beta
    DIFF_LT q2, q0, bb, m7, t0 ; m7 = |q2-q0| < beta
    pand    m6, m3
    pand    m7, m6
    pand    m6, t1
    LUMA_INTRA_P012 p0, p1, p2, [r4], q0, q1, m3, m6, m0, [r4+r5], [r4+2*r1], [r4+r1]
    LUMA_INTRA_P012 q0, q1, q2, [r0+r5], p0, p1, m3, m7, m0, [r0], [r0+r1], [r0+2*r1]
    add     r0, mmsize
    add     r4, mmsize
    dec     r6
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra, 4,7,16
    %define t0 m15
    %define t1 m14
    %define t2 m2
    %define q3 m5
    %define q2 m8
    %define q1 m9
    %define q0 m10
    %define p0 m11
    %define p1 m12
    %define p2 m13
    %define p3 m4
    %define spill [rsp]
    %assign pad 24-(stack_offset&15)
    SUB     rsp, pad
    add     r1, r1
    lea     r4, [r1*4]
    lea     r5, [r1*3] ; 3*stride
    add     r4, r0     ; pix+4*stride
    mov     r6, 2
    mova    m0, [pw_2]
.loop:
    movu    q3, [r0-8]
    movu    q2, [r0+r1-8]
    movu    q1, [r0+r1*2-8]
    movu    q0, [r0+r5-8]
    movu    p0, [r4-8]
    movu    p1, [r4+r1-8]
    movu    p2, [r4+r1*2-8]
    movu    p3, [r4+r5-8]
    TRANSPOSE8x8W 5, 8, 9, 10, 11, 12, 13, 4, 1

    LOAD_AB m1, m2, r2d, r3d
    LOAD_MASK q1, q0, p0, p1, m1, m2, m3, t0, t1
    psrlw   m1, 2
    paddw   m1, m0 ; alpha/4+2
    DIFF_LT p0, q0, m1, m6, t0 ; m6 = |p0-q0| < alpha/4+2
    DIFF_LT q2, q0, m2, t1, t0 ; t1 = |q2-q0| < beta
    DIFF_LT p0, p2, m2, m7, t0 ; m7 = |p2-p0| < beta
    pand    m6, m3
    pand    m7, m6
    pand    m6, t1

    mova spill, q3
    LUMA_INTRA_P012 q0, q1, q2, q3, p0, p1, m3, m6, m0, m5, m1, q2
    LUMA_INTRA_P012 p0, p1, p2, p3, q0, q1, m3, m7, m0, p0, m6, p2
    mova    m7, spill

    LUMA_H_INTRA_STORE 7, 8, 1, 5, 11, 6, 13, 4, 14

    lea     r0, [r0+r1*8]
    lea     r4, [r4+r1*8]
    dec     r6
    jg .loop
    ADD    rsp, pad
    RET
%endmacro

INIT_XMM sse2
DEBLOCK_LUMA_INTRA_64
INIT_XMM avx
DEBLOCK_LUMA_INTRA_64

%endif

%macro DEBLOCK_LUMA_INTRA 0
;-----------------------------------------------------------------------------
; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_v_luma_intra, 4,7,8
    LUMA_INTRA_INIT 3
    lea     r4, [r1*4]
    lea     r5, [r1*3]
    neg     r4
    add     r4, r0
    mov     r6, 32/mmsize
.loop:
    mova    m0, [r4+r1*2] ; p1
    mova    m1, [r4+r5]   ; p0
    mova    m2, [r0]      ; q0
    mova    m3, [r0+r1]   ; q1
    LUMA_INTRA_INTER t4, t5, t6, [r4+r1], [r0+r1*2]
    LUMA_INTRA_P012 m1, m0, t3, [r4], m2, m3, t5, t4, [pw_2], [r4+r5], [r4+2*r1], [r4+r1]
    mova    t3, [r0+r1*2] ; q2
    LUMA_INTRA_P012 m2, m3, t3, [r0+r5], m1, m0, t5, t6, [pw_2], [r0], [r0+r1], [r0+2*r1]
    add     r0, mmsize
    add     r4, mmsize
    dec     r6
    jg .loop
    ADD    rsp, pad
    RET

;-----------------------------------------------------------------------------
; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra, 4,7,8
    LUMA_INTRA_INIT 8
%if mmsize == 8
    lea     r4, [r1*3]
    mov     r5, 32/mmsize
%else
    lea     r4, [r1*4]
    lea     r5, [r1*3] ; 3*stride
    add     r4, r0     ; pix+4*stride
    mov     r6, 32/mmsize
%endif
.loop:
    LUMA_H_INTRA_LOAD
    LUMA_INTRA_INTER t8, t9, t10, t5, t6

    LUMA_INTRA_P012 m1, m0, t3, t4, m2, m3, t9, t8, [pw_2], t8, t5, t11
    mova    t3, t6     ; q2
    LUMA_INTRA_P012 m2, m3, t3, t7, m1, m0, t9, t10, [pw_2], m4, t6, m5

    mova    m2, t4
    mova    m0, t11
    mova    m1, t5
    mova    m3, t8
    mova    m6, t6

    LUMA_H_INTRA_STORE 2, 0, 1, 3, 4, 6, 5, t7, 7

    lea     r0, [r0+r1*(mmsize/2)]
%if mmsize == 8
    dec     r5
%else
    lea     r4, [r4+r1*(mmsize/2)]
    dec     r6
%endif
    jg .loop
    ADD    rsp, pad
    RET
%endmacro

%ifndef ARCH_X86_64
INIT_MMX mmx2
DEBLOCK_LUMA
DEBLOCK_LUMA_INTRA
INIT_XMM sse2
DEBLOCK_LUMA
DEBLOCK_LUMA_INTRA
INIT_XMM avx
DEBLOCK_LUMA
DEBLOCK_LUMA_INTRA
%endif
%endif ; HIGH_BIT_DEPTH

%ifndef HIGH_BIT_DEPTH
; expands to [base],...,[base+7*stride]
%define PASS8ROWS(base, base3, stride, stride3) \
    [base], [base+stride], [base+stride*2], [base3], \
    [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]

%define PASS8ROWS(base, base3, stride, stride3, offset) \
    PASS8ROWS(base+offset, base3+offset, stride, stride3)

; in: 8 rows of 4 bytes in %4..%11
; out: 4 rows of 8 bytes in m0..m3
%macro TRANSPOSE4x8_LOAD 11
    movh       m0, %4
    movh       m2, %5
    movh       m1, %6
    movh       m3, %7
    punpckl%1  m0, m2
    punpckl%1  m1, m3
    mova       m2, m0
    punpckl%2  m0, m1
    punpckh%2  m2, m1

    movh       m4, %8
    movh       m6, %9
    movh       m5, %10
    movh       m7, %11
    punpckl%1  m4, m6
    punpckl%1  m5, m7
    mova       m6, m4
    punpckl%2  m4, m5
    punpckh%2  m6, m5

    punpckh%3  m1, m0, m4
    punpckh%3  m3, m2, m6
    punpckl%3  m0, m4
    punpckl%3  m2, m6
%endmacro

; in: 4 rows of 8 bytes in m0..m3
; out: 8 rows of 4 bytes in %1..%8
%macro TRANSPOSE8x4B_STORE 8
    punpckhdq  m4, m0, m0
    punpckhdq  m5, m1, m1
    punpckhdq  m6, m2, m2

    punpcklbw  m0, m1
    punpcklbw  m2, m3
    punpcklwd  m1, m0, m2
    punpckhwd  m0, m2
    movh       %1, m1
    punpckhdq  m1, m1
    movh       %2, m1
    movh       %3, m0
    punpckhdq  m0, m0
    movh       %4, m0

    punpckhdq  m3, m3
    punpcklbw  m4, m5
    punpcklbw  m6, m3
    punpcklwd  m5, m4, m6
    punpckhwd  m4, m6
    movh       %5, m5
    punpckhdq  m5, m5
    movh       %6, m5
    movh       %7, m4
    punpckhdq  m4, m4
    movh       %8, m4
%endmacro

%macro TRANSPOSE4x8B_LOAD 8
    TRANSPOSE4x8_LOAD bw, wd, dq, %1, %2, %3, %4, %5, %6, %7, %8
%endmacro

%macro TRANSPOSE4x8W_LOAD 8
%if mmsize==16
    TRANSPOSE4x8_LOAD wd, dq, qdq, %1, %2, %3, %4, %5, %6, %7, %8
%else
    SWAP  1, 4, 2, 3
    mova  m0, [t5]
    mova  m1, [t5+r1]
    mova  m2, [t5+r1*2]
    mova  m3, [t5+t6]
    TRANSPOSE4x4W 0, 1, 2, 3, 4
%endif
%endmacro

%macro TRANSPOSE8x2W_STORE 8
    punpckhwd  m0, m1, m2
    punpcklwd  m1, m2
%if mmsize==8
    movd       %3, m0
    movd       %1, m1
    psrlq      m1, 32
    psrlq      m0, 32
    movd       %2, m1
    movd       %4, m0
%else
    movd       %5, m0
    movd       %1, m1
    psrldq     m1, 4
    psrldq     m0, 4
    movd       %2, m1
    movd       %6, m0
    psrldq     m1, 4
    psrldq     m0, 4
    movd       %3, m1
    movd       %7, m0
    psrldq     m1, 4
    psrldq     m0, 4
    movd       %4, m1
    movd       %8, m0
%endif
%endmacro

%macro SBUTTERFLY3 4
    punpckh%1  %4, %2, %3
    punpckl%1  %2, %3
%endmacro

; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
%macro TRANSPOSE6x8_MEM 9
    RESET_MM_PERMUTATION
    movq  m0, %1
    movq  m1, %2
    movq  m2, %3
    movq  m3, %4
    movq  m4, %5
    movq  m5, %6
    movq  m6, %7
    SBUTTERFLY bw, 0, 1, 7
    SBUTTERFLY bw, 2, 3, 7
    SBUTTERFLY bw, 4, 5, 7
    movq  [%9+0x10], m3
    SBUTTERFLY3 bw, m6, %8, m7
    SBUTTERFLY wd, 0, 2, 3
    SBUTTERFLY wd, 4, 6, 3
    punpckhdq m0, m4
    movq  [%9+0x00], m0
    SBUTTERFLY3 wd, m1, [%9+0x10], m3
    SBUTTERFLY wd, 5, 7, 0
    SBUTTERFLY dq, 1, 5, 0
    SBUTTERFLY dq, 2, 6, 0
    punpckldq m3, m7
    movq  [%9+0x10], m2
    movq  [%9+0x20], m6
    movq  [%9+0x30], m1
    movq  [%9+0x40], m5
    movq  [%9+0x50], m3
    RESET_MM_PERMUTATION
%endmacro

; in: 8 rows of 8 in %1..%8
; out: 8 rows of 8 in %9..%16
%macro TRANSPOSE8x8_MEM 16
    RESET_MM_PERMUTATION
    movq  m0, %1
    movq  m1, %2
    movq  m2, %3
    movq  m3, %4
    movq  m4, %5
    movq  m5, %6
    movq  m6, %7
    SBUTTERFLY bw, 0, 1, 7
    SBUTTERFLY bw, 2, 3, 7
    SBUTTERFLY bw, 4, 5, 7
    SBUTTERFLY3 bw, m6, %8, m7
    movq  %9,  m5
    SBUTTERFLY wd, 0, 2, 5
    SBUTTERFLY wd, 4, 6, 5
    SBUTTERFLY wd, 1, 3, 5
    movq  %11, m6
    movq  m6,  %9
    SBUTTERFLY wd, 6, 7, 5
    SBUTTERFLY dq, 0, 4, 5
    SBUTTERFLY dq, 1, 6, 5
    movq  %9,  m0
    movq  %10, m4
    movq  %13, m1
    movq  %14, m6
    SBUTTERFLY3 dq, m2, %11, m0
    SBUTTERFLY dq, 3, 7, 4
    movq  %11, m2
    movq  %12, m0
    movq  %15, m3
    movq  %16, m7
    RESET_MM_PERMUTATION
%endmacro

; out: %4 = |%1-%2|>%3
; clobbers: %5
%macro DIFF_GT 5
%if avx_enabled == 0
    mova    %5, %2
    mova    %4, %1
    psubusb %5, %1
    psubusb %4, %2
%else
    psubusb %5, %2, %1
    psubusb %4, %1, %2
%endif
    por     %4, %5
    psubusb %4, %3
%endmacro

; out: %4 = 0xff where |%1-%2| <= %3 (i.e. the inverse of |%1-%2| > %3;
;      callers' inline comments name the comparison tested, not the mask sense)
; clobbers: %5
%macro DIFF_GT2 5
%ifdef ARCH_X86_64
    psubusb %5, %2, %1
    psubusb %4, %1, %2
%else
    mova    %5, %2
    mova    %4, %1
    psubusb %5, %1
    psubusb %4, %2
%endif
    psubusb %5, %3
    psubusb %4, %3
    pcmpeqb %4, %5
%endmacro

; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1
; out: m5=beta-1, m7=mask, %3=alpha-1
; clobbers: m4,m6
%macro LOAD_MASK 2-3
    movd     m4, %1
    movd     m5, %2
    SPLATW   m4, m4
    SPLATW   m5, m5
    packuswb m4, m4  ; 16x alpha-1
    packuswb m5, m5  ; 16x beta-1
%if %0>2
    mova     %3, m4
%endif
    DIFF_GT  m1, m2, m4, m7, m6 ; |p0-q0| > alpha-1
    DIFF_GT  m0, m1, m5, m4, m6 ; |p1-p0| > beta-1
    por      m7, m4
    DIFF_GT  m3, m2, m5, m4, m6 ; |q1-q0| > beta-1
    por      m7, m4
    pxor     m6, m6
    pcmpeqb  m7, m6
%endmacro
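; Per-lane result of the mask (all-ones byte = filter this edge), as a sketch:
;   mask = (abs(p0-q0) <= alpha-1 && abs(p1-p0) <= beta-1
;        && abs(q1-q0) <= beta-1) ? 0xff : 0;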

; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask)
; out: m1=p0' m2=q0'
; clobbers: m0,3-6
%macro DEBLOCK_P0_Q0 0
    pxor    m5, m1, m2   ; p0^q0
    pand    m5, [pb_1]   ; (p0^q0)&1
    pcmpeqb m4, m4
    pxor    m3, m4
    pavgb   m3, m0       ; (p1 - q1 + 256)>>1
    pavgb   m3, [pb_3]   ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
    pxor    m4, m1
    pavgb   m4, m2       ; (q0 - p0 + 256)>>1
    pavgb   m3, m5
    paddusb m3, m4       ; d+128+33
    mova    m6, [pb_a1]
    psubusb m6, m3
    psubusb m3, [pb_a1]
    pminub  m6, m7
    pminub  m3, m7
    psubusb m1, m6
    psubusb m2, m3
    paddusb m1, m3
    paddusb m2, m6
%endmacro
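; Intended per-lane result (a sketch; the biased pavgb chain above computes
; it without widening to 16 bits):
;   delta = clip3( -tc, tc, ((q0-p0)*4 + (p1-q1) + 4) >> 3 )   with tc = m7
;   p0'   = clip_uint8( p0 + delta )
;   q0'   = clip_uint8( q0 - delta )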

; in: m1=p0 m2=q0
;     %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp
; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
; clobbers: q2, tmp, tc0
%macro LUMA_Q1 6
    pavgb   %6, m1, m2
    pavgb   %2, %6       ; avg(p2,avg(p0,q0))
    pxor    %6, %3
    pand    %6, [pb_1]   ; (p2^avg(p0,q0))&1
    psubusb %2, %6       ; (p2+((p0+q0+1)>>1))>>1
    psubusb %6, %1, %5
    paddusb %5, %1
    pmaxub  %2, %6
    pminub  %2, %5
    mova    %4, %2
%endmacro
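; The clip is branchless: psubusb/paddusb build %1-tc0 and %1+tc0 with
; unsigned saturation, then pmaxub/pminub clamp the candidate into range.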

%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
; void deblock_v_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
%macro DEBLOCK_LUMA 0
cglobal deblock_v_luma, 5,5,10
    movd    m8, [r4] ; tc0
    lea     r4, [r1*3]
    dec     r2d        ; alpha-1
    neg     r4
    dec     r3d        ; beta-1
    add     r4, r0     ; pix-3*stride

    mova    m0, [r4+r1]   ; p1
    mova    m1, [r4+2*r1] ; p0
    mova    m2, [r0]      ; q0
    mova    m3, [r0+r1]   ; q1
    LOAD_MASK r2d, r3d

    punpcklbw m8, m8
    punpcklbw m8, m8 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
    pcmpeqb m9, m9
    pcmpeqb m9, m8
    pandn   m9, m7
    pand    m8, m9

    movdqa  m3, [r4] ; p2
    DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
    pand    m6, m9
    psubb   m7, m8, m6
    pand    m6, m8
    LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4

    movdqa  m4, [r0+2*r1] ; q2
    DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
    pand    m6, m9
    pand    m8, m6
    psubb   m7, m6
    mova    m3, [r0+r1]
    LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6

    DEBLOCK_P0_Q0
    mova    [r4+2*r1], m1
    mova    [r0], m2
    RET

;-----------------------------------------------------------------------------
; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_MMX cpuname
cglobal deblock_h_luma, 5,7
    movsxd r10, r1d
    lea    r11, [r10+r10*2]
    lea    r6,  [r0-4]
    lea    r5,  [r0-4+r11]
%ifdef WIN64
    sub    rsp, 0x98
    %define pix_tmp rsp+0x30
%else
    sub    rsp, 0x68
    %define pix_tmp rsp
%endif

    ; transpose 6x16 -> tmp space
    TRANSPOSE6x8_MEM  PASS8ROWS(r6, r5, r10, r11), pix_tmp
    lea    r6, [r6+r10*8]
    lea    r5, [r5+r10*8]
    TRANSPOSE6x8_MEM  PASS8ROWS(r6, r5, r10, r11), pix_tmp+8

    ; vertical filter
    ; alpha, beta, tc0 are still in r2d, r3d, r4
    ; don't backup r6, r5, r10, r11 because deblock_v_luma_sse2 doesn't use them
    lea    r0, [pix_tmp+0x30]
    mov    r1d, 0x10
%ifdef WIN64
    mov    [rsp+0x20], r4
%endif
    call   deblock_v_luma

    ; transpose 16x4 -> original space  (only the middle 4 rows were changed by the filter)
    add    r6, 2
    add    r5, 2
    movq   m0, [pix_tmp+0x18]
    movq   m1, [pix_tmp+0x28]
    movq   m2, [pix_tmp+0x38]
    movq   m3, [pix_tmp+0x48]
    TRANSPOSE8x4B_STORE  PASS8ROWS(r6, r5, r10, r11)

    shl    r10, 3
    sub    r6,  r10
    sub    r5,  r10
    shr    r10, 3
    movq   m0, [pix_tmp+0x10]
    movq   m1, [pix_tmp+0x20]
    movq   m2, [pix_tmp+0x30]
    movq   m3, [pix_tmp+0x40]
    TRANSPOSE8x4B_STORE  PASS8ROWS(r6, r5, r10, r11)

%ifdef WIN64
    add    rsp, 0x98
%else
    add    rsp, 0x68
%endif
    RET
%endmacro

INIT_XMM sse2
DEBLOCK_LUMA
INIT_XMM avx
DEBLOCK_LUMA

%else

%macro DEBLOCK_LUMA 2
;-----------------------------------------------------------------------------
; void deblock_v8_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_%1_luma, 5,5
    lea     r4, [r1*3]
    dec     r2     ; alpha-1
    neg     r4
    dec     r3     ; beta-1
    add     r4, r0 ; pix-3*stride
    %assign pad 2*%2+12-(stack_offset&15)
    SUB     esp, pad

    mova    m0, [r4+r1]   ; p1
    mova    m1, [r4+2*r1] ; p0
    mova    m2, [r0]      ; q0
    mova    m3, [r0+r1]   ; q1
    LOAD_MASK r2, r3

    mov     r3, r4mp
    movd    m4, [r3] ; tc0
    punpcklbw m4, m4
    punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
    mova   [esp+%2], m4 ; tc
    pcmpeqb m3, m3
    pcmpgtb m4, m3
    pand    m4, m7
    mova   [esp], m4 ; mask

    mova    m3, [r4] ; p2
    DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
    pand    m6, m4
    pand    m4, [esp+%2] ; tc
    psubb   m7, m4, m6
    pand    m6, m4
    LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4

    mova    m4, [r0+2*r1] ; q2
    DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
    mova    m5, [esp] ; mask
    pand    m6, m5
    mova    m5, [esp+%2] ; tc
    pand    m5, m6
    psubb   m7, m6
    mova    m3, [r0+r1]
    LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6

    DEBLOCK_P0_Q0
    mova    [r4+2*r1], m1
    mova    [r0], m2
    ADD     esp, pad
    RET

;-----------------------------------------------------------------------------
; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_MMX cpuname
cglobal deblock_h_luma, 0,5
    mov    r0, r0mp
    mov    r3, r1m
    lea    r4, [r3*3]
    sub    r0, 4
    lea    r1, [r0+r4]
    %assign pad 0x78-(stack_offset&15)
    SUB    esp, pad
%define pix_tmp esp+12

    ; transpose 6x16 -> tmp space
    TRANSPOSE6x8_MEM  PASS8ROWS(r0, r1, r3, r4), pix_tmp
    lea    r0, [r0+r3*8]
    lea    r1, [r1+r3*8]
    TRANSPOSE6x8_MEM  PASS8ROWS(r0, r1, r3, r4), pix_tmp+8

    ; vertical filter
    lea    r0, [pix_tmp+0x30]
    PUSH   dword r4m
    PUSH   dword r3m
    PUSH   dword r2m
    PUSH   dword 16
    PUSH   dword r0
    call   deblock_%1_luma
%ifidn %1, v8
    add    dword [esp   ], 8 ; pix_tmp+0x38
    add    dword [esp+16], 2 ; tc0+2
    call   deblock_%1_luma
%endif
    ADD    esp, 20

    ; transpose 16x4 -> original space  (only the middle 4 rows were changed by the filter)
    mov    r0, r0mp
    sub    r0, 2
    lea    r1, [r0+r4]

    movq   m0, [pix_tmp+0x10]
    movq   m1, [pix_tmp+0x20]
    movq   m2, [pix_tmp+0x30]
    movq   m3, [pix_tmp+0x40]
    TRANSPOSE8x4B_STORE  PASS8ROWS(r0, r1, r3, r4)

    lea    r0, [r0+r3*8]
    lea    r1, [r1+r3*8]
    movq   m0, [pix_tmp+0x18]
    movq   m1, [pix_tmp+0x28]
    movq   m2, [pix_tmp+0x38]
    movq   m3, [pix_tmp+0x48]
    TRANSPOSE8x4B_STORE  PASS8ROWS(r0, r1, r3, r4)

    ADD    esp, pad
    RET
%endmacro ; DEBLOCK_LUMA

INIT_MMX mmx2
DEBLOCK_LUMA v8, 8
INIT_XMM sse2
DEBLOCK_LUMA v, 16
INIT_XMM avx
DEBLOCK_LUMA v, 16

%endif ; ARCH



%macro LUMA_INTRA_P012 4 ; p0..p3 in memory
%ifdef ARCH_X86_64
    pavgb t0, p2, p1
    pavgb t1, p0, q0
%else
    mova  t0, p2
    mova  t1, p0
    pavgb t0, p1
    pavgb t1, q0
%endif
    pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2
    mova  t5, t1
%ifdef ARCH_X86_64
    paddb t2, p2, p1
    paddb t3, p0, q0
%else
    mova  t2, p2
    mova  t3, p0
    paddb t2, p1
    paddb t3, q0
%endif
    paddb t2, t3
    mova  t3, t2
    mova  t4, t2
    psrlw t2, 1
    pavgb t2, mpb_0
    pxor  t2, t0
    pand  t2, mpb_1
    psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4;

%ifdef ARCH_X86_64
    pavgb t1, p2, q1
    psubb t2, p2, q1
%else
    mova  t1, p2
    mova  t2, p2
    pavgb t1, q1
    psubb t2, q1
%endif
    paddb t3, t3
    psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1
    pand  t2, mpb_1
    psubb t1, t2
    pavgb t1, p1
    pavgb t1, t5 ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2
    psrlw t3, 2
    pavgb t3, mpb_0
    pxor  t3, t1
    pand  t3, mpb_1
    psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8

    pxor  t3, p0, q1
    pavgb t2, p0, q1
    pand  t3, mpb_1
    psubb t2, t3
    pavgb t2, p1 ; p0'b = (2*p1+p0+q0+2)/4

    pxor  t1, t2
    pxor  t2, p0
    pand  t1, mask1p
    pand  t2, mask0
    pxor  t1, t2
    pxor  t1, p0
    mova  %1, t1 ; store p0

    mova  t1, %4 ; p3
    paddb t2, t1, p2
    pavgb t1, p2
    pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4
    paddb t2, t2
    paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0
    psrlw t2, 2
    pavgb t2, mpb_0
    pxor  t2, t1
    pand  t2, mpb_1
    psubb t1, t2 ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8

    pxor  t0, p1
    pxor  t1, p2
    pand  t0, mask1p
    pand  t1, mask1p
    pxor  t0, p1
    pxor  t1, p2
    mova  %2, t0 ; store p1
    mova  %3, t1 ; store p2
%endmacro
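; The paddb sums above wrap mod 256; they are used only for their low bits
; (pand with mpb_1), which flag when a pavgb in the chain rounded one too
; high.  Subtracting that parity bit yields exactly the rounded expressions
; in the inline comments without widening to 16 bits.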

%macro LUMA_INTRA_SWAP_PQ 0
    %define q1 m0
    %define q0 m1
    %define p0 m2
    %define p1 m3
    %define p2 q2
    %define mask1p mask1q
%endmacro

%macro DEBLOCK_LUMA_INTRA 1
    %define p1 m0
    %define p0 m1
    %define q0 m2
    %define q1 m3
    %define t0 m4
    %define t1 m5
    %define t2 m6
    %define t3 m7
%ifdef ARCH_X86_64
    %define p2 m8
    %define q2 m9
    %define t4 m10
    %define t5 m11
    %define mask0 m12
    %define mask1p m13
    %define mask1q [rsp-24]
    %define mpb_0 m14
    %define mpb_1 m15
%else
    %define spill(x) [esp+16*x+((stack_offset+4)&15)]
    %define p2 [r4+r1]
    %define q2 [r0+2*r1]
    %define t4 spill(0)
    %define t5 spill(1)
    %define mask0 spill(2)
    %define mask1p spill(3)
    %define mask1q spill(4)
    %define mpb_0 [pb_0]
    %define mpb_1 [pb_1]
%endif

;-----------------------------------------------------------------------------
; void deblock_v_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_%1_luma_intra, 4,6,16
%ifndef ARCH_X86_64
    sub     esp, 0x60
%endif
    lea     r4, [r1*4]
    lea     r5, [r1*3] ; 3*stride
    dec     r2d        ; alpha-1
    jl .end
    neg     r4
    dec     r3d        ; beta-1
    jl .end
    add     r4, r0     ; pix-4*stride
    mova    p1, [r4+2*r1]
    mova    p0, [r4+r5]
    mova    q0, [r0]
    mova    q1, [r0+r1]
%ifdef ARCH_X86_64
    pxor    mpb_0, mpb_0
    mova    mpb_1, [pb_1]
    LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
    SWAP    7, 12 ; m12=mask0
    pavgb   t5, mpb_0
    pavgb   t5, mpb_1 ; alpha/4+1
    movdqa  p2, [r4+r1]
    movdqa  q2, [r0+2*r1]
    DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1
    DIFF_GT2 p0, p2, m5, t2, t5 ; mask1 = |p2-p0| > beta-1
    DIFF_GT2 q0, q2, m5, t4, t5 ; t4 = |q2-q0| > beta-1
    pand    t0, mask0
    pand    t4, t0
    pand    t2, t0
    mova    mask1q, t4
    mova    mask1p, t2
%else
    LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
    mova    m4, t5
    mova    mask0, m7
    pavgb   m4, [pb_0]
    pavgb   m4, [pb_1] ; alpha/4+1
    DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1
    pand    m6, mask0
    DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1
    pand    m4, m6
    mova    mask1p, m4
    DIFF_GT2 q0, q2, m5, m4, m7 ; m4 = |q2-q0| > beta-1
    pand    m4, m6
    mova    mask1q, m4
%endif
    LUMA_INTRA_P012 [r4+r5], [r4+2*r1], [r4+r1], [r4]
    LUMA_INTRA_SWAP_PQ
    LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
.end:
%ifndef ARCH_X86_64
    add     esp, 0x60
%endif
    RET

INIT_MMX cpuname
%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra, 4,7
    movsxd r10, r1d
    lea    r11, [r10*3]
    lea    r6,  [r0-4]
    lea    r5,  [r0-4+r11]
    sub    rsp, 0x88
    %define pix_tmp rsp

    ; transpose 8x16 -> tmp space
    TRANSPOSE8x8_MEM  PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
    lea    r6, [r6+r10*8]
    lea    r5, [r5+r10*8]
    TRANSPOSE8x8_MEM  PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

    lea    r0,  [pix_tmp+0x40]
    mov    r1,  0x10
    call   deblock_v_luma_intra

    ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
    lea    r5, [r6+r11]
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
    shl    r10, 3
    sub    r6,  r10
    sub    r5,  r10
    shr    r10, 3
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
    add    rsp, 0x88
    RET
%else
cglobal deblock_h_luma_intra, 2,4
    lea    r3,  [r1*3]
    sub    r0,  4
    lea    r2,  [r0+r3]
%assign pad 0x8c-(stack_offset&15)
    SUB    rsp, pad
    %define pix_tmp rsp

    ; transpose 8x16 -> tmp space
    TRANSPOSE8x8_MEM  PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
    lea    r0,  [r0+r1*8]
    lea    r2,  [r2+r1*8]
    TRANSPOSE8x8_MEM  PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

    lea    r0,  [pix_tmp+0x40]
    PUSH   dword r3m
    PUSH   dword r2m
    PUSH   dword 16
    PUSH   r0
    call   deblock_%1_luma_intra
%ifidn %1, v8
    add    dword [rsp], 8 ; pix_tmp+8
    call   deblock_%1_luma_intra
%endif
    ADD    esp, 16

    mov    r1,  r1m
    mov    r0,  r0mp
    lea    r3,  [r1*3]
    sub    r0,  4
    lea    r2,  [r0+r3]
    ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
    lea    r0,  [r0+r1*8]
    lea    r2,  [r2+r1*8]
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
    ADD    rsp, pad
    RET
%endif ; ARCH_X86_64
%endmacro ; DEBLOCK_LUMA_INTRA

INIT_XMM sse2
DEBLOCK_LUMA_INTRA v
INIT_XMM avx
DEBLOCK_LUMA_INTRA v
%ifndef ARCH_X86_64
INIT_MMX mmx2
DEBLOCK_LUMA_INTRA v8
%endif
%endif ; !HIGH_BIT_DEPTH

%ifdef HIGH_BIT_DEPTH
; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
; out: %1=p0', %2=q0'
%macro CHROMA_DEBLOCK_P0_Q0_INTRA 7
    mova    %6, [pw_2]
    paddw   %6, %3
    paddw   %6, %4
    paddw   %7, %6, %2
    paddw   %6, %1
    paddw   %6, %3
    paddw   %7, %4
    psraw   %6, 2
    psraw   %7, 2
    psubw   %6, %1
    psubw   %7, %2
    pand    %6, %5
    pand    %7, %5
    paddw   %1, %6
    paddw   %2, %7
%endmacro
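; i.e. the H.264 chroma intra filter, applied in lanes where %5 is set:
;   p0' = (2*p1 + p0 + q1 + 2) >> 2
;   q0' = (2*q1 + q0 + p1 + 2) >> 2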

; out: m0-m3
; clobbers: m4-m7
%macro CHROMA_H_LOAD 0-1
    movq        m0, [r0-8] ; p1 p1 p0 p0
    movq        m2, [r0]   ; q0 q0 q1 q1
    movq        m5, [r0+r1-8]
    movq        m7, [r0+r1]
%if mmsize == 8
    mova        m1, m0
    mova        m3, m2
    punpckldq   m0, m5 ; p1
    punpckhdq   m1, m5 ; p0
    punpckldq   m2, m7 ; q0
    punpckhdq   m3, m7 ; q1
%else
    movq        m4, [r0+r1*2-8]
    movq        m6, [r0+r1*2]
    movq        m1, [r0+%1-8]
    movq        m3, [r0+%1]
    punpckldq   m0, m5 ; p1 ... p0 ...
    punpckldq   m2, m7 ; q0 ... q1 ...
    punpckldq   m4, m1
    punpckldq   m6, m3
    punpckhqdq  m1, m0, m4 ; p0
    punpcklqdq  m0, m4 ; p1
    punpckhqdq  m3, m2, m6 ; q1
    punpcklqdq  m2, m6 ; q0
%endif
%endmacro

%macro CHROMA_V_LOAD 1
    mova        m0, [r0]    ; p1
    mova        m1, [r0+r1] ; p0
    mova        m2, [%1]    ; q0
    mova        m3, [%1+r1] ; q1
%endmacro

; clobbers: m1, m2, m3
%macro CHROMA_H_STORE 0-1
    SBUTTERFLY dq, 1, 2, 3
%if mmsize == 8
    movq      [r0-4], m1
    movq   [r0+r1-4], m2
%else
    movq      [r0-4], m1
    movq [r0+r1*2-4], m2
    movhps [r0+r1-4], m1
    movhps [r0+%1-4], m2
%endif
%endmacro

%macro CHROMA_V_STORE 0
    mova [r0+1*r1], m1
    mova [r0+2*r1], m2
%endmacro

%macro DEBLOCK_CHROMA 0
cglobal deblock_inter_body
    RESET_MM_PERMUTATION
    LOAD_AB     m4, m5, r2, r3
    LOAD_MASK   m0, m1, m2, m3, m4, m5, m7, m6, m4
    pxor        m4, m4
    LOAD_TC     m6, r4
    pmaxsw      m6, m4
    pand        m7, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6
    ret

;-----------------------------------------------------------------------------
; void deblock_v_chroma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma, 7,7,8
    FIX_STRIDES r1
    mov         r5, r0
    sub         r0, r1
    sub         r0, r1
    mov         r6, 32/mmsize
.loop:
    CHROMA_V_LOAD r5
    call        deblock_inter_body
    CHROMA_V_STORE
    add         r0, mmsize
    add         r5, mmsize
    add         r4, mmsize/8
    dec         r6
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void deblock_h_chroma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma, 5,7,8
    add         r1, r1
    mov         r5, 32/mmsize
%if mmsize == 16
    lea         r6, [r1*3]
%endif
.loop:
    CHROMA_H_LOAD r6
    call        deblock_inter_body
    CHROMA_H_STORE r6
    lea         r0, [r0+r1*(mmsize/4)]
    add         r4, mmsize/8
    dec         r5
    jg .loop
    REP_RET


cglobal deblock_intra_body
    RESET_MM_PERMUTATION
    LOAD_AB     m4, m5, r2, r3
    LOAD_MASK   m0, m1, m2, m3, m4, m5, m7, m6, m4
    CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6
    ret

;-----------------------------------------------------------------------------
; void deblock_v_chroma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_intra, 4,6,8
    add         r1, r1
    mov         r5, 32/mmsize
    movd        m5, r3
    mov         r4, r0
    sub         r0, r1
    sub         r0, r1
    SPLATW      m5, m5
.loop:
    CHROMA_V_LOAD r4
    call        deblock_intra_body
    CHROMA_V_STORE
    add         r0, mmsize
    add         r4, mmsize
    dec         r5
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void deblock_h_chroma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_intra, 4,6,8
    add         r1, r1
    mov         r4, 32/mmsize
%if mmsize == 16
    lea         r5, [r1*3]
%endif
.loop:
    CHROMA_H_LOAD r5
    call        deblock_intra_body
    CHROMA_H_STORE r5
    lea         r0, [r0+r1*(mmsize/4)]
    dec         r4
    jg .loop
    REP_RET
%endmacro

%ifndef ARCH_X86_64
INIT_MMX mmx2
DEBLOCK_CHROMA
%endif
INIT_XMM sse2
DEBLOCK_CHROMA
INIT_XMM avx
DEBLOCK_CHROMA
%endif ; HIGH_BIT_DEPTH

%ifndef HIGH_BIT_DEPTH
%macro CHROMA_V_START 0
    dec    r2d      ; alpha-1
    dec    r3d      ; beta-1
    mov    t5, r0
    sub    t5, r1
    sub    t5, r1
%if mmsize==8
    mov   dword r0m, 2
.skip_prologue:
%endif
%endmacro

%macro CHROMA_H_START 0
    dec    r2d
    dec    r3d
    sub    r0, 4
    lea    t6, [r1*3]
    mov    t5, r0
    add    r0, t6
%if mmsize==8
    mov   dword r0m, 2
.skip_prologue:
%endif
%endmacro

%macro CHROMA_V_LOOP 1
%if mmsize==8
    add   r0, 8
    add   t5, 8
%if %1
    add   r4, 2
%endif
    dec   dword r0m
    jg .skip_prologue
%endif
%endmacro

%macro CHROMA_H_LOOP 1
%if mmsize==8
    lea   r0, [r0+r1*4]
    lea   t5, [t5+r1*4]
%if %1
    add   r4, 2
%endif
    dec   dword r0m
    jg .skip_prologue
%endif
%endmacro

%define t5 r5
%define t6 r6

%macro DEBLOCK_CHROMA 0
cglobal chroma_inter_body
    LOAD_MASK  r2d, r3d
    movd       m6, [r4] ; tc0
    punpcklbw  m6, m6
    punpcklbw  m6, m6
    pand       m7, m6
    DEBLOCK_P0_Q0
    ret

;-----------------------------------------------------------------------------
; void deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma, 5,6,8
    CHROMA_V_START
    mova  m0, [t5]
    mova  m1, [t5+r1]
    mova  m2, [r0]
    mova  m3, [r0+r1]
    call chroma_inter_body
    mova  [t5+r1], m1
    mova  [r0], m2
    CHROMA_V_LOOP 1
    RET

;-----------------------------------------------------------------------------
; void deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma, 5,7,8
    CHROMA_H_START
    TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6)
    call chroma_inter_body
    TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2)
    CHROMA_H_LOOP 1
    RET
%endmacro ; DEBLOCK_CHROMA

INIT_XMM sse2
DEBLOCK_CHROMA
INIT_XMM avx
DEBLOCK_CHROMA
%ifndef ARCH_X86_64
INIT_MMX mmx2
DEBLOCK_CHROMA
%endif

%macro DEBLOCK_H_CHROMA_422 0
cglobal deblock_h_chroma_422, 5,7,8
%ifdef ARCH_X86_64
    %define cntr r11
%else
    %define cntr dword r0m
%endif
    dec    r2d
    dec    r3d
    sub    r0, 4
    lea    t6, [r1*3]
    mov    t5, r0
    add    r0, t6
    mov  cntr, 32/mmsize
.skip_prologue:
    TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6)
    LOAD_MASK  r2d, r3d
    movd       m6, [r4] ; tc0
    punpcklbw  m6, m6
%if mmsize == 16
    punpcklbw  m6, m6
    punpcklbw  m6, m6
%else
    pshufw     m6, m6, q0000
%endif
    pand       m7, m6
    DEBLOCK_P0_Q0
    TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2)
    lea   r0, [r0+r1*(mmsize/2)]
    lea   t5, [t5+r1*(mmsize/2)]
    add   r4, mmsize/8
    dec   cntr
    jg .skip_prologue
    REP_RET
%endmacro

INIT_MMX mmx2
DEBLOCK_H_CHROMA_422
INIT_XMM sse2
DEBLOCK_H_CHROMA_422
INIT_XMM avx
DEBLOCK_H_CHROMA_422

; in: %1=p0 %2=p1 %3=q1
; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2
%macro CHROMA_INTRA_P0 3
    pxor    m4, %1, %3
    pand    m4, [pb_1] ; m4 = (p0^q1)&1
    pavgb   %1, %3
    psubusb %1, m4
    pavgb   %1, %2     ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
%endmacro
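; pavgb rounds up, so avg(%2, avg(%1,%3)) alone would overshoot by one when
; %1+%3 is odd; subtracting (%1^%3)&1 first makes the chain land exactly on
; (%1 + %3 + 2*%2 + 2) >> 2.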

%define t5 r4
%define t6 r5

%macro DEBLOCK_CHROMA_INTRA 0
cglobal chroma_intra_body
    LOAD_MASK r2d, r3d
    mova   m5, m1
    mova   m6, m2
    CHROMA_INTRA_P0  m1, m0, m3
    CHROMA_INTRA_P0  m2, m3, m0
    psubb  m1, m5
    psubb  m2, m6
    pand   m1, m7
    pand   m2, m7
    paddb  m1, m5
    paddb  m2, m6
    ret

;-----------------------------------------------------------------------------
; void deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_intra, 4,5,8
    CHROMA_V_START
    mova  m0, [t5]
    mova  m1, [t5+r1]
    mova  m2, [r0]
    mova  m3, [r0+r1]
    call chroma_intra_body
    mova  [t5+r1], m1
    mova  [r0], m2
    CHROMA_V_LOOP 0
    RET

;-----------------------------------------------------------------------------
; void deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_intra, 4,6,8
    CHROMA_H_START
    TRANSPOSE4x8W_LOAD  PASS8ROWS(t5, r0, r1, t6)
    call chroma_intra_body
    TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2)
    CHROMA_H_LOOP 0
    RET
%endmacro ; DEBLOCK_CHROMA_INTRA

INIT_XMM sse2
DEBLOCK_CHROMA_INTRA
INIT_XMM avx
DEBLOCK_CHROMA_INTRA
%ifndef ARCH_X86_64
INIT_MMX mmx2
DEBLOCK_CHROMA_INTRA
%endif
%endif ; !HIGH_BIT_DEPTH



;-----------------------------------------------------------------------------
; static void deblock_strength( uint8_t nnz[48], int8_t ref[2][40], int16_t mv[2][40][2],
;                               uint8_t bs[2][4][4], int mvy_limit, int bframe )
;-----------------------------------------------------------------------------
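;
; Per-edge rules, as a sketch (not quoted from the C reference implementation):
;   bs = 2  if either neighboring block has nonzero coefficients (nnz);
;   bs = 1  else if their refs differ, or |mvx_a - mvx_b| >= 4 (quarter-pel
;           units), or |mvy_a - mvy_b| >= mvy_limit;
;   bs = 0  otherwise.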

%define scan8start (4+1*8)
%define nnz r0+scan8start
%define ref r1+scan8start
%define mv  r2+scan8start*4
%define bs0 r3
%define bs1 r3+32

%macro LOAD_BYTES_MMX 1
    movd      m2, [%1+8*0-1]
    movd      m0, [%1+8*0]
    movd      m3, [%1+8*2-1]
    movd      m1, [%1+8*2]
    punpckldq m2, [%1+8*1-1]
    punpckldq m0, [%1+8*1]
    punpckldq m3, [%1+8*3-1]
    punpckldq m1, [%1+8*3]
%endmacro

%macro DEBLOCK_STRENGTH_REFS_MMX 0
    LOAD_BYTES_MMX ref
    pxor      m2, m0
    pxor      m3, m1
    por       m2, [bs0+0]
    por       m3, [bs0+8]
    movq [bs0+0], m2
    movq [bs0+8], m3

    movd      m2, [ref-8*1]
    movd      m3, [ref+8*1]
    punpckldq m2, m0  ; row -1, row 0
    punpckldq m3, m1  ; row  1, row 2
    pxor      m0, m2
    pxor      m1, m3
    por       m0, [bs1+0]
    por       m1, [bs1+8]
    movq [bs1+0], m0
    movq [bs1+8], m1
%endmacro

%macro DEBLOCK_STRENGTH_MVS_MMX 2
    mova      m0, [mv-%2]
    mova      m1, [mv-%2+8]
    psubw     m0, [mv]
    psubw     m1, [mv+8]
    packsswb  m0, m1
    ABSB      m0, m1
    psubusb   m0, m7
    packsswb  m0, m0
    por       m0, [%1]
    movd    [%1], m0
%endmacro

%macro DEBLOCK_STRENGTH_NNZ_MMX 1
    por       m2, m0
    por       m3, m1
    mova      m4, [%1]
    mova      m5, [%1+8]
    pminub    m2, m6
    pminub    m3, m6
    pminub    m4, m6 ; mv ? 1 : 0
    pminub    m5, m6
    paddb     m2, m2 ; nnz ? 2 : 0
    paddb     m3, m3
    pmaxub    m2, m4
    pmaxub    m3, m5
%endmacro

%macro LOAD_BYTES_XMM 1
    movu      m0, [%1-4] ; FIXME could be aligned if we changed nnz's allocation
    movu      m1, [%1+12]
    mova      m2, m0
    pslldq    m0, 1
    shufps    m2, m1, q3131 ; cur nnz, all rows
    pslldq    m1, 1
    shufps    m0, m1, q3131 ; left neighbors
    pslldq    m1, m2, 4
    movd      m3, [%1-8] ; could be palignr if nnz was aligned
    por       m1, m3 ; top neighbors
%endmacro

INIT_MMX mmx2
cglobal deblock_strength, 6,6
    ; Prepare mv comparison register
    shl      r4d, 8
    add      r4d, 3 - (1<<8)
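    ; r4d now packs both byte thresholds per 16-bit lane: low byte 3, so
    ; |mvd_x| >= 4 quarter-pels flags bs=1; high byte mvy_limit-1, so
    ; |mvd_y| >= mvy_limit flags it.  SPLATW broadcasts the pair for the
    ; psubusb comparisons below.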
    movd      m7, r4d
    SPLATW    m7, m7
    mova      m6, [pb_1]
    pxor      m0, m0
    mova [bs0+0], m0
    mova [bs0+8], m0
    mova [bs1+0], m0
    mova [bs1+8], m0

.lists:
    DEBLOCK_STRENGTH_REFS_MMX
    mov      r4d, 4
.mvs:
    DEBLOCK_STRENGTH_MVS_MMX bs0, 4
    DEBLOCK_STRENGTH_MVS_MMX bs1, 4*8
    add       r2, 4*8
    add       r3, 4
    dec      r4d
    jg .mvs
    add       r1, 40
    add       r2, 4*8
    sub       r3, 16
    dec      r5d
    jge .lists

    ; Check nnz
    LOAD_BYTES_MMX nnz
    DEBLOCK_STRENGTH_NNZ_MMX bs0
    ; Transpose column output
    SBUTTERFLY bw, 2, 3, 4
    SBUTTERFLY bw, 2, 3, 4
    mova [bs0+0], m2
    mova [bs0+8], m3
    movd      m2, [nnz-8*1]
    movd      m3, [nnz+8*1]
    punpckldq m2, m0  ; row -1, row 0
    punpckldq m3, m1  ; row  1, row 2
    DEBLOCK_STRENGTH_NNZ_MMX bs1
    mova [bs1+0], m2
    mova [bs1+8], m3
    RET

%macro DEBLOCK_STRENGTH_XMM 0
cglobal deblock_strength, 6,6,8
    ; Prepare mv comparison register
    shl      r4d, 8
    add      r4d, 3 - (1<<8)
    movd      m6, r4d
    SPLATW    m6, m6
    pxor      m4, m4 ; bs0
    pxor      m5, m5 ; bs1

.lists:
    ; Check refs
    LOAD_BYTES_XMM ref
    pxor      m0, m2
    pxor      m1, m2
    por       m4, m0
    por       m5, m1

    ; Check mvs
%if cpuflag(ssse3)
    mova      m0, [mv+4*8*0]
    mova      m1, [mv+4*8*1]
    palignr   m3, m0, [mv+4*8*0-16], 12
    palignr   m2, m1, [mv+4*8*1-16], 12
    psubw     m0, m3
    psubw     m1, m2
    packsswb  m0, m1

    mova      m2, [mv+4*8*2]
    mova      m1, [mv+4*8*3]
    palignr   m3, m2, [mv+4*8*2-16], 12
    palignr   m7, m1, [mv+4*8*3-16], 12
    psubw     m2, m3
    psubw     m1, m7
    packsswb  m2, m1
%else
    movu      m0, [mv-4+4*8*0]
    movu      m1, [mv-4+4*8*1]
    movu      m2, [mv-4+4*8*2]
    movu      m3, [mv-4+4*8*3]
    psubw     m0, [mv+4*8*0]
    psubw     m1, [mv+4*8*1]
    psubw     m2, [mv+4*8*2]
    psubw     m3, [mv+4*8*3]
    packsswb  m0, m1
    packsswb  m2, m3
%endif
    ABSB      m0, m1
    ABSB      m2, m3
    psubusb   m0, m6
    psubusb   m2, m6
    packsswb  m0, m2
    por       m4, m0

    mova      m0, [mv+4*8*-1]
    mova      m1, [mv+4*8* 0]
    mova      m2, [mv+4*8* 1]
    mova      m3, [mv+4*8* 2]
    psubw     m0, m1
    psubw     m1, m2
    psubw     m2, m3
    psubw     m3, [mv+4*8* 3]
    packsswb  m0, m1
    packsswb  m2, m3
    ABSB      m0, m1
    ABSB      m2, m3
    psubusb   m0, m6
    psubusb   m2, m6
    packsswb  m0, m2
    por       m5, m0
    add       r1, 40
    add       r2, 4*8*5
    dec      r5d
    jge .lists

    ; Check nnz
    LOAD_BYTES_XMM nnz
    por       m0, m2
    por       m1, m2
    mova      m6, [pb_1]
    pminub    m0, m6
    pminub    m1, m6
    pminub    m4, m6 ; mv ? 1 : 0
    pminub    m5, m6
    paddb     m0, m0 ; nnz ? 2 : 0
    paddb     m1, m1
    pmaxub    m4, m0
    pmaxub    m5, m1
%if cpuflag(ssse3)
    pshufb    m4, [transpose_shuf]
%else
    movhlps   m3, m4
    punpcklbw m4, m3
    movhlps   m3, m4
    punpcklbw m4, m3
%endif
    mova   [bs1], m5
    mova   [bs0], m4
    RET
%endmacro

INIT_XMM sse2
DEBLOCK_STRENGTH_XMM
INIT_XMM ssse3
DEBLOCK_STRENGTH_XMM
INIT_XMM avx
DEBLOCK_STRENGTH_XMM
File: x264-snapshot-20120103-2245-stable/common/x86/dct.h
/*****************************************************************************
 * dct.h: x86 transform and zigzag
 *****************************************************************************
 * Copyright (C) 2003-2011 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *          Laurent Aimar <fenrir@via.ecp.fr>
 *          Jason Garrett-Glaser <darkshikari@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#ifndef X264_I386_DCT_H
#define X264_I386_DCT_H

void x264_sub4x4_dct_mmx    ( dctcoef dct    [16], pixel   *pix1, pixel   *pix2 );
void x264_sub8x8_dct_mmx    ( dctcoef dct[ 4][16], pixel   *pix1, pixel   *pix2 );
void x264_sub16x16_dct_mmx  ( dctcoef dct[16][16], pixel   *pix1, pixel   *pix2 );
void x264_sub8x8_dct_sse2   ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct_sse2 ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub4x4_dct_ssse3  ( int16_t dct    [16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_ssse3  ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct_ssse3( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_avx    ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct_avx  ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_dc_mmx2( int16_t dct    [ 4], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_dc_sse2( int16_t dct    [ 4], uint8_t *pix1, uint8_t *pix2 );

void x264_add4x4_idct_mmx       ( uint8_t *p_dst, int16_t dct    [16] );
void x264_add4x4_idct_sse2     ( uint16_t *p_dst, int32_t dct    [16] );
void x264_add4x4_idct_sse4      ( uint8_t *p_dst, int16_t dct    [16] );
void x264_add4x4_idct_avx       ( pixel   *p_dst, dctcoef dct    [16] );
void x264_add8x8_idct_mmx       ( uint8_t *p_dst, int16_t dct[ 4][16] );
void x264_add8x8_idct_dc_mmx    ( uint8_t *p_dst, int16_t dct    [ 4] );
void x264_add16x16_idct_mmx     ( uint8_t *p_dst, int16_t dct[16][16] );
void x264_add16x16_idct_dc_mmx  ( uint8_t *p_dst, int16_t dct    [16] );
void x264_add8x8_idct_sse2      ( pixel   *p_dst, dctcoef dct[ 4][16] );
void x264_add8x8_idct_avx       ( pixel   *p_dst, dctcoef dct[ 4][16] );
void x264_add16x16_idct_sse2    ( pixel   *p_dst, dctcoef dct[16][16] );
void x264_add16x16_idct_avx     ( pixel   *p_dst, dctcoef dct[16][16] );
void x264_add8x8_idct_dc_sse2   ( pixel   *p_dst, dctcoef dct    [ 4] );
void x264_add16x16_idct_dc_sse2 ( pixel   *p_dst, dctcoef dct    [16] );
void x264_add8x8_idct_dc_ssse3  ( uint8_t *p_dst, int16_t dct    [ 4] );
void x264_add16x16_idct_dc_ssse3( uint8_t *p_dst, int16_t dct    [16] );
void x264_add8x8_idct_dc_avx    ( pixel   *p_dst, dctcoef dct    [ 4] );
void x264_add16x16_idct_dc_avx  ( pixel   *p_dst, dctcoef dct    [16] );

void x264_dct4x4dc_mmx       ( int16_t d[16] );
void x264_dct4x4dc_sse2      ( int32_t d[16] );
void x264_dct4x4dc_avx       ( int32_t d[16] );
void x264_idct4x4dc_mmx      ( int16_t d[16] );
void x264_idct4x4dc_sse2     ( int32_t d[16] );
void x264_idct4x4dc_avx      ( int32_t d[16] );

void x264_sub8x8_dct8_mmx    ( int16_t dct   [64], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct8_mmx  ( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct8_sse2   ( int16_t dct   [64], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct8_sse2 ( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct8_ssse3  ( int16_t dct   [64], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct8_ssse3( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct8_avx    ( int16_t dct   [64], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct8_avx  ( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );


void x264_add8x8_idct8_mmx   ( uint8_t *dst, int16_t dct   [64] );
void x264_add16x16_idct8_mmx ( uint8_t *dst, int16_t dct[4][64] );
void x264_add8x8_idct8_sse2  ( uint8_t *dst, int16_t dct   [64] );
void x264_add16x16_idct8_sse2( uint8_t *dst, int16_t dct[4][64] );
void x264_add8x8_idct8_avx   ( uint8_t *dst, int16_t dct   [64] );
void x264_add16x16_idct8_avx ( uint8_t *dst, int16_t dct[4][64] );

void x264_zigzag_scan_8x8_frame_avx  ( dctcoef level[64], dctcoef dct[64] );
void x264_zigzag_scan_8x8_frame_ssse3( int16_t level[64], int16_t dct[64] );
void x264_zigzag_scan_8x8_frame_sse2 ( dctcoef level[64], dctcoef dct[64] );
void x264_zigzag_scan_8x8_frame_mmx2 ( int16_t level[64], int16_t dct[64] );
void x264_zigzag_scan_4x4_frame_xop  ( dctcoef level[16], dctcoef dct[16] );
void x264_zigzag_scan_4x4_frame_avx  ( dctcoef level[16], dctcoef dct[16] );
void x264_zigzag_scan_4x4_frame_ssse3( int16_t level[16], int16_t dct[16] );
void x264_zigzag_scan_4x4_frame_sse2 ( int32_t level[16], int32_t dct[16] );
void x264_zigzag_scan_4x4_frame_mmx  ( int16_t level[16], int16_t dct[16] );
void x264_zigzag_scan_4x4_field_sse2 ( int32_t level[16], int32_t dct[16] );
void x264_zigzag_scan_4x4_field_mmx2 ( int16_t level[16], int16_t dct[16] );
void x264_zigzag_scan_8x8_field_avx  ( int32_t level[64], int32_t dct[64] );
void x264_zigzag_scan_8x8_field_sse4 ( int32_t level[64], int32_t dct[64] );
void x264_zigzag_scan_8x8_field_mmx2 ( int16_t level[64], int16_t dct[64] );
int  x264_zigzag_sub_4x4_frame_avx    ( int16_t level[16], const uint8_t *src, uint8_t *dst );
int  x264_zigzag_sub_4x4_frame_ssse3  ( int16_t level[16], const uint8_t *src, uint8_t *dst );
int  x264_zigzag_sub_4x4ac_frame_avx  ( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc );
int  x264_zigzag_sub_4x4ac_frame_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc );
int  x264_zigzag_sub_4x4_field_avx    ( int16_t level[16], const uint8_t *src, uint8_t *dst );
int  x264_zigzag_sub_4x4_field_ssse3  ( int16_t level[16], const uint8_t *src, uint8_t *dst );
int  x264_zigzag_sub_4x4ac_field_avx  ( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc );
int  x264_zigzag_sub_4x4ac_field_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc );
void x264_zigzag_interleave_8x8_cavlc_mmx ( int16_t *dst, int16_t *src, uint8_t *nnz );
void x264_zigzag_interleave_8x8_cavlc_sse2( dctcoef *dst, dctcoef *src, uint8_t *nnz );
void x264_zigzag_interleave_8x8_cavlc_avx ( dctcoef *dst, dctcoef *src, uint8_t *nnz );

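/* The _mmx/_sse2/_ssse3/_avx suffixes above are alternative SIMD
 * implementations of the same operation; at init time the encoder fills a
 * table of function pointers with the fastest variant the host CPU
 * supports.  A minimal sketch of that dispatch pattern, assuming an 8-bit
 * build (dctcoef == int16_t, pixel == uint8_t); the struct, flag, and init
 * names here are hypothetical, not the actual x264 API:
 *
 *     typedef struct {
 *         void (*sub4x4_dct)( dctcoef dct[16], pixel *pix1, pixel *pix2 );
 *     } dct_funcs_t;
 *
 *     static void dct_funcs_init( unsigned cpu, dct_funcs_t *f )
 *     {
 *         f->sub4x4_dct = x264_sub4x4_dct_mmx;       // baseline
 *         if( cpu & CPU_FLAG_SSSE3 )                 // hypothetical flag
 *             f->sub4x4_dct = x264_sub4x4_dct_ssse3; // faster where present
 *     }
 */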
#endif
File: x264-snapshot-20120103-2245-stable/common/x86/dct-a.asm
;*****************************************************************************
;* dct-a.asm: x86 transform and zigzag
;*****************************************************************************
;* Copyright (C) 2003-2011 x264 project
;*
;* Authors: Holger Lubitz <holger@lubitz.org>
;*          Loren Merritt <lorenm@u.washington.edu>
;*          Laurent Aimar <fenrir@via.ecp.fr>
;*          Min Chen <chenm001@163.com>
;*          Jason Garrett-Glaser <darkshikari@gmail.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA
pb_sub4frame:   db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
pb_sub4field:   db 0,4,1,8,12,5,9,13,2,6,10,14,3,7,11,15
pb_subacmask:   dw 0,-1,-1,-1,-1,-1,-1,-1
pb_scan4framea: SHUFFLE_MASK_W 6,3,7,0,4,1,2,5
pb_scan4frameb: SHUFFLE_MASK_W 0,4,1,2,5,6,3,7
pb_scan4frame2a: SHUFFLE_MASK_W 0,4,1,2,5,8,12,9
pb_scan4frame2b: SHUFFLE_MASK_W 6,3,7,10,13,14,11,15
pb_idctdc_unpack: db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
pb_idctdc_unpack2: db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7

SECTION .text

cextern pw_32_0
cextern pw_32
cextern pw_8000
cextern pw_pixel_max
cextern hsub_mul
cextern pb_1
cextern pw_1
cextern pd_1
cextern pd_32

%macro WALSH4_1D 6
    SUMSUB_BADC %1, %5, %4, %3, %2, %6
    SUMSUB_BADC %1, %5, %3, %4, %2, %6
    SWAP %2, %5, %4
%endmacro
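; WALSH4_1D is one butterfly stage of the 4x4 Walsh-Hadamard transform that
; dct4x4dc/idct4x4dc below apply to the 16 DC coefficients (rows, transpose,
; then columns).  One 4-point stage in scalar form -- a sketch; the SIMD
; version may permute the outputs differently before its final SWAP:
;
;   static void walsh4_1d( int16_t d[4] )
;   {
;       int16_t s01 = d[0] + d[1], d01 = d[0] - d[1];
;       int16_t s23 = d[2] + d[3], d23 = d[2] - d[3];
;       d[0] = s01 + s23;  d[1] = s01 - s23;
;       d[2] = d01 - d23;  d[3] = d01 + d23;
;   }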

%macro SUMSUB_17BIT 4 ; a, b, tmp, 0x8000
    movq  m%3, m%4
    pxor  m%1, m%4
    psubw m%3, m%2
    pxor  m%2, m%4
    pavgw m%3, m%1
    pavgw m%2, m%1
    pxor  m%3, m%4
    pxor  m%2, m%4
    SWAP %1, %2, %3
%endmacro
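; SUMSUB_17BIT produces halved sums/differences whose full 17-bit values
; would overflow int16: xor with 0x8000 maps signed words onto unsigned
; offset-binary, pavgw then computes (x+y+1)>>1 without overflow, and a
; final xor removes the bias.  The underlying identity in scalar form
; (sketch only):
;
;   /* (x + y + 1) >> 1, exact even when x + y overflows int16_t */
;   static int16_t avg17( int16_t x, int16_t y )
;   {
;       uint16_t ux = x ^ 0x8000, uy = y ^ 0x8000; /* bias:    pxor  */
;       uint16_t a  = (ux + uy + 1) >> 1;          /* average: pavgw */
;       return (int16_t)( a ^ 0x8000 );            /* unbias:  pxor  */
;   }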

%macro DCT_UNPACK 3
    punpcklwd %3, %1
    punpckhwd %2, %1
    psrad     %3, 16
    psrad     %2, 16
    SWAP      %1, %3
%endmacro

%ifdef HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void dct4x4dc( dctcoef d[4][4] )
;-----------------------------------------------------------------------------
%macro DCT4x4_DC 0
cglobal dct4x4dc, 1,1,5
    mova   m0, [r0+ 0]
    mova   m1, [r0+16]
    mova   m2, [r0+32]
    mova   m3, [r0+48]
    WALSH4_1D  d, 0,1,2,3,4
    TRANSPOSE4x4D 0,1,2,3,4
    paddd  m0, [pd_1]
    WALSH4_1D  d, 0,1,2,3,4
    psrad  m0, 1
    psrad  m1, 1
    psrad  m2, 1
    psrad  m3, 1
    mova [r0+ 0], m0
    mova [r0+16], m1
    mova [r0+32], m2
    mova [r0+48], m3
    RET
%endmacro ; DCT4x4_DC

INIT_XMM sse2
DCT4x4_DC
INIT_XMM avx
DCT4x4_DC
%else

INIT_MMX mmx
cglobal dct4x4dc, 1,1
    movq   m3, [r0+24]
    movq   m2, [r0+16]
    movq   m1, [r0+ 8]
    movq   m0, [r0+ 0]
    movq   m7, [pw_8000] ; convert to unsigned and back, so that pavgw works
    WALSH4_1D  w, 0,1,2,3,4
    TRANSPOSE4x4W 0,1,2,3,4
    SUMSUB_BADC w, 1, 0, 3, 2, 4
    SWAP 0, 1
    SWAP 2, 3
    SUMSUB_17BIT 0,2,4,7
    SUMSUB_17BIT 1,3,5,7
    movq  [r0+0], m0
    movq  [r0+8], m2
    movq [r0+16], m3
    movq [r0+24], m1
    RET
%endif ; HIGH_BIT_DEPTH

%ifdef HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void idct4x4dc( int32_t d[4][4] )
;-----------------------------------------------------------------------------
%macro IDCT4x4DC 0
cglobal idct4x4dc, 1,1
    mova   m3, [r0+48]
    mova   m2, [r0+32]
    mova   m1, [r0+16]
    mova   m0, [r0+ 0]
    WALSH4_1D  d,0,1,2,3,4
    TRANSPOSE4x4D 0,1,2,3,4
    WALSH4_1D  d,0,1,2,3,4
    mova  [r0+ 0], m0
    mova  [r0+16], m1
    mova  [r0+32], m2
    mova  [r0+48], m3
    RET
%endmacro ; IDCT4x4DC

INIT_XMM sse2
IDCT4x4DC
INIT_XMM avx
IDCT4x4DC
%else

;-----------------------------------------------------------------------------
; void idct4x4dc( int16_t d[4][4] )
;-----------------------------------------------------------------------------
INIT_MMX mmx
cglobal idct4x4dc, 1,1
    movq   m3, [r0+24]
    movq   m2, [r0+16]
    movq   m1, [r0+ 8]
    movq   m0, [r0+ 0]
    WALSH4_1D  w,0,1,2,3,4
    TRANSPOSE4x4W 0,1,2,3,4
    WALSH4_1D  w,0,1,2,3,4
    movq  [r0+ 0], m0
    movq  [r0+ 8], m1
    movq  [r0+16], m2
    movq  [r0+24], m3
    RET
%endif ; HIGH_BIT_DEPTH

%ifdef HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void sub4x4_dct( dctcoef dct[4][4], pixel *pix1, pixel *pix2 )
;-----------------------------------------------------------------------------
INIT_MMX mmx
cglobal sub4x4_dct, 3,3
.skip_prologue:
    LOAD_DIFF  m0, m4, none, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
    LOAD_DIFF  m3, m4, none, [r1+6*FENC_STRIDE], [r2+6*FDEC_STRIDE]
    LOAD_DIFF  m1, m4, none, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
    LOAD_DIFF  m2, m4, none, [r1+4*FENC_STRIDE], [r2+4*FDEC_STRIDE]
    DCT4_1D 0,1,2,3,4
    TRANSPOSE4x4W 0,1,2,3,4

    SUMSUB_BADC w, 3, 0, 2, 1
    SUMSUB_BA   w, 2, 3, 4
    DCT_UNPACK m2, m4, m5
    DCT_UNPACK m3, m6, m7
    mova  [r0+ 0], m2 ; s03 + s12
    mova  [r0+ 8], m4
    mova  [r0+32], m3 ; s03 - s12
    mova  [r0+40], m6

    DCT_UNPACK m0, m2, m4
    DCT_UNPACK m1, m3, m5
    SUMSUB2_AB  d, 0, 1, 4
    SUMSUB2_AB  d, 2, 3, 5
    mova  [r0+16], m0 ; d03*2 + d12
    mova  [r0+24], m2
    mova  [r0+48], m4 ; d03 - 2*d12
    mova  [r0+56], m5
    RET
%else

%macro SUB_DCT4 0
cglobal sub4x4_dct, 3,3
.skip_prologue:
%if cpuflag(ssse3)
    mova m5, [hsub_mul]
%endif
    LOAD_DIFF8x4 0, 3, 1, 2, 4, 5, r1, r2
    DCT4_1D 0,1,2,3,4
    TRANSPOSE4x4W 0,1,2,3,4
    DCT4_1D 0,1,2,3,4
    movq  [r0+ 0], m0
    movq  [r0+ 8], m1
    movq  [r0+16], m2
    movq  [r0+24], m3
    RET
%endmacro

INIT_MMX mmx
SUB_DCT4
INIT_MMX ssse3
SUB_DCT4
%endif ; HIGH_BIT_DEPTH
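; Scalar reference for the 4x4 forward transform used above (one 1-D stage;
; sub4x4_dct runs it on rows of pix1-pix2, transposes, then runs it again).
; A sketch of the standard H.264 core transform:
;
;   static void dct4_1d( int16_t out[4], const int16_t x[4] )
;   {
;       int16_t a = x[0] + x[3], d = x[0] - x[3];
;       int16_t b = x[1] + x[2], c = x[1] - x[2];
;       out[0] = a + b;      out[2] = a - b;
;       out[1] = 2*d + c;    out[3] = d - 2*c;
;   }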

%ifdef HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void add4x4_idct( pixel *p_dst, dctcoef dct[4][4] )
;-----------------------------------------------------------------------------
%macro STORE_DIFFx2 6
    psrad     %1, 6
    psrad     %2, 6
    packssdw  %1, %2
    movq      %3, %5
    movhps    %3, %6
    paddsw    %1, %3
    CLIPW     %1, %4, [pw_pixel_max]
    movq      %5, %1
    movhps    %6, %1
%endmacro

%macro ADD4x4_IDCT 0
cglobal add4x4_idct, 2,2,6
    add   r0, 2*FDEC_STRIDEB
.skip_prologue:
    mova  m1, [r1+16]
    mova  m3, [r1+48]
    mova  m2, [r1+32]
    mova  m0, [r1+ 0]
    IDCT4_1D d,0,1,2,3,4,5
    TRANSPOSE4x4D 0,1,2,3,4
    paddd m0, [pd_32]
    IDCT4_1D d,0,1,2,3,4,5
    pxor  m5, m5
    STORE_DIFFx2 m0, m1, m4, m5, [r0-2*FDEC_STRIDEB], [r0-1*FDEC_STRIDEB]
    STORE_DIFFx2 m2, m3, m4, m5, [r0+0*FDEC_STRIDEB], [r0+1*FDEC_STRIDEB]
    RET
%endmacro

INIT_XMM sse2
ADD4x4_IDCT
INIT_XMM avx
ADD4x4_IDCT

%else ; !HIGH_BIT_DEPTH

INIT_MMX mmx
cglobal add4x4_idct, 2,2
    pxor m7, m7
.skip_prologue:
    movq  m1, [r1+ 8]
    movq  m3, [r1+24]
    movq  m2, [r1+16]
    movq  m0, [r1+ 0]
    IDCT4_1D w,0,1,2,3,4,5
    TRANSPOSE4x4W 0,1,2,3,4
    paddw m0, [pw_32]
    IDCT4_1D w,0,1,2,3,4,5
    STORE_DIFF  m0, m4, m7, [r0+0*FDEC_STRIDE]
    STORE_DIFF  m1, m4, m7, [r0+1*FDEC_STRIDE]
    STORE_DIFF  m2, m4, m7, [r0+2*FDEC_STRIDE]
    STORE_DIFF  m3, m4, m7, [r0+3*FDEC_STRIDE]
    RET

%macro ADD4x4 0
cglobal add4x4_idct, 2,2,6
    mova      m1, [r1+0x00]     ; row1/row0
    mova      m3, [r1+0x10]     ; row3/row2
    psraw     m0, m1, 1         ; row1>>1/...
    psraw     m2, m3, 1         ; row3>>1/...
    movsd     m0, m1            ; row1>>1/row0
    movsd     m2, m3            ; row3>>1/row2
    psubw     m0, m3            ; row1>>1-row3/row0-2
    paddw     m2, m1            ; row3>>1+row1/row0+2
    SBUTTERFLY2 wd, 0, 2, 1
    SUMSUB_BA w, 2, 0, 1
    pshuflw   m1, m2, q2301
    pshufhw   m2, m2, q2301
    punpckldq m1, m0
    punpckhdq m2, m0
    SWAP       0, 1

    mova      m1, [pw_32_0]
    paddw     m1, m0            ; row1/row0 corrected
    psraw     m0, 1             ; row1>>1/...
    psraw     m3, m2, 1         ; row3>>1/...
    movsd     m0, m1            ; row1>>1/row0
    movsd     m3, m2            ; row3>>1/row2
    psubw     m0, m2            ; row1>>1-row3/row0-2
    paddw     m3, m1            ; row3>>1+row1/row0+2
    SBUTTERFLY2 qdq, 0, 3, 1
    SUMSUB_BA w, 3, 0, 1

    movd      m4, [r0+FDEC_STRIDE*0]
    movd      m1, [r0+FDEC_STRIDE*1]
    movd      m2, [r0+FDEC_STRIDE*2]
    movd      m5, [r0+FDEC_STRIDE*3]
    punpckldq m1, m4            ; row0/row1
    pxor      m4, m4
    punpckldq m2, m5            ; row3/row2
    punpcklbw m1, m4
    psraw     m3, 6
    punpcklbw m2, m4
    psraw     m0, 6
    paddsw    m3, m1
    paddsw    m0, m2
    packuswb  m0, m3            ; row0/row1/row3/row2
    pextrd   [r0+FDEC_STRIDE*0], m0, 3
    pextrd   [r0+FDEC_STRIDE*1], m0, 2
    movd     [r0+FDEC_STRIDE*2], m0
    pextrd   [r0+FDEC_STRIDE*3], m0, 1
    RET
%endmacro ; ADD4x4

INIT_XMM sse4
ADD4x4
INIT_XMM avx
ADD4x4
%endif ; HIGH_BIT_DEPTH
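; The corresponding inverse stage used by add4x4_idct above, followed by a
; (x + 32) >> 6 rounding and a clamped add to the prediction.  Scalar
; sketch of the standard H.264 inverse core transform:
;
;   static void idct4_1d( int16_t out[4], const int16_t x[4] )
;   {
;       int16_t a = x[0] + x[2],        b = x[0] - x[2];
;       int16_t c = (x[1] >> 1) - x[3], d = x[1] + (x[3] >> 1);
;       out[0] = a + d;  out[1] = b + c;
;       out[2] = b - c;  out[3] = a - d;
;   }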

INIT_MMX
;-----------------------------------------------------------------------------
; void sub8x8_dct( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
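; SUB_NxN_DCT here (and ADD_NxN_IDCT below) synthesize a large transform
; from four calls into a smaller kernel's .skip_prologue entry point: each
; call advances the coefficient pointer by %3 bytes and steps the pixel
; pointers across the four sub-blocks.  The last call is a tail jump,
; except on WIN64 where the sub/add rsp, 8 pair keeps the stack 16-byte
; aligned across the inner calls and a real call + RET is required.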
%macro SUB_NxN_DCT 6
cglobal %1, 3,3,11
%ifndef HIGH_BIT_DEPTH
%if mmsize == 8
    pxor m7, m7
%else
    add r2, 4*FDEC_STRIDE
    mova m7, [hsub_mul]
%endif
%endif ; !HIGH_BIT_DEPTH
.skip_prologue:
%ifdef WIN64
    sub  rsp, 8
%endif
    call %2.skip_prologue
    add  r0, %3
    add  r1, %4-%5-%6*FENC_STRIDE
    add  r2, %4-%5-%6*FDEC_STRIDE
    call %2.skip_prologue
    add  r0, %3
    add  r1, (%4-%6)*FENC_STRIDE-%5-%4
    add  r2, (%4-%6)*FDEC_STRIDE-%5-%4
    call %2.skip_prologue
    add  r0, %3
    add  r1, %4-%5-%6*FENC_STRIDE
    add  r2, %4-%5-%6*FDEC_STRIDE
%ifdef WIN64
    call %2.skip_prologue
    add  rsp, 8
    RET
%else
    jmp  %2.skip_prologue
%endif
%endmacro

;-----------------------------------------------------------------------------
; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] )
;-----------------------------------------------------------------------------
%macro ADD_NxN_IDCT 6-7
%ifdef HIGH_BIT_DEPTH
cglobal %1, 2,2,6
%else
cglobal %1, 2,2,11
    pxor m7, m7
%endif
%if mmsize==16
    add  r0, 4*FDEC_STRIDE
%endif
.skip_prologue:
%ifdef WIN64
    sub  rsp, 8
%endif
    call %2.skip_prologue
    add  r0, %4-%5-%6*FDEC_STRIDE
    add  r1, %3
    call %2.skip_prologue
    add  r0, (%4-%6)*FDEC_STRIDE-%5-%4
    add  r1, %3
    call %2.skip_prologue
    add  r0, %4-%5-%6*FDEC_STRIDE
    add  r1, %3
%ifdef WIN64
    call %2.skip_prologue
    add  rsp, 8
    RET
%else
    jmp  %2.skip_prologue
%endif
%endmacro

%ifdef HIGH_BIT_DEPTH
INIT_MMX
SUB_NxN_DCT  sub8x8_dct_mmx,     sub4x4_dct_mmx,   64,  8, 0, 0
SUB_NxN_DCT  sub16x16_dct_mmx,   sub8x8_dct_mmx,   64, 16, 8, 8
INIT_XMM
ADD_NxN_IDCT add8x8_idct_sse2,   add4x4_idct_sse2, 64,  8, 0, 0
ADD_NxN_IDCT add16x16_idct_sse2, add8x8_idct_sse2, 64, 16, 8, 8
ADD_NxN_IDCT add8x8_idct_avx,    add4x4_idct_avx,  64,  8, 0, 0
ADD_NxN_IDCT add16x16_idct_avx,  add8x8_idct_avx,  64, 16, 8, 8
%else ; !HIGH_BIT_DEPTH
%ifndef ARCH_X86_64
INIT_MMX
SUB_NxN_DCT  sub8x8_dct_mmx,     sub4x4_dct_mmx,   32, 4, 0, 0
ADD_NxN_IDCT add8x8_idct_mmx,    add4x4_idct_mmx,  32, 4, 0, 0
SUB_NxN_DCT  sub16x16_dct_mmx,   sub8x8_dct_mmx,   32, 8, 4, 4
ADD_NxN_IDCT add16x16_idct_mmx,  add8x8_idct_mmx,  32, 8, 4, 4

cextern sub8x8_dct8_mmx.skip_prologue
cextern add8x8_idct8_mmx.skip_prologue
SUB_NxN_DCT  sub16x16_dct8_mmx,  sub8x8_dct8_mmx,  128, 8, 0, 0
ADD_NxN_IDCT add16x16_idct8_mmx, add8x8_idct8_mmx, 128, 8, 0, 0
%endif

INIT_XMM
cextern sub8x8_dct_sse2.skip_prologue
cextern sub8x8_dct_ssse3.skip_prologue
cextern sub8x8_dct_avx.skip_prologue
SUB_NxN_DCT  sub16x16_dct_sse2,  sub8x8_dct_sse2,  128, 8, 0, 0
SUB_NxN_DCT  sub16x16_dct_ssse3, sub8x8_dct_ssse3, 128, 8, 0, 0
SUB_NxN_DCT  sub16x16_dct_avx,   sub8x8_dct_avx,   128, 8, 0, 0

cextern add8x8_idct_sse2.skip_prologue
cextern add8x8_idct_avx.skip_prologue
ADD_NxN_IDCT add16x16_idct_sse2, add8x8_idct_sse2, 128, 8, 0, 0
ADD_NxN_IDCT add16x16_idct_avx,  add8x8_idct_avx,  128, 8, 0, 0

cextern add8x8_idct8_sse2.skip_prologue
cextern add8x8_idct8_avx.skip_prologue
ADD_NxN_IDCT add16x16_idct8_sse2, add8x8_idct8_sse2, 128, 8, 0, 0
ADD_NxN_IDCT add16x16_idct8_avx,  add8x8_idct8_avx,  128, 8, 0, 0

cextern sub8x8_dct8_sse2.skip_prologue
cextern sub8x8_dct8_ssse3.skip_prologue
cextern sub8x8_dct8_avx.skip_prologue
SUB_NxN_DCT  sub16x16_dct8_sse2,  sub8x8_dct8_sse2,  128, 8, 0, 0
SUB_NxN_DCT  sub16x16_dct8_ssse3, sub8x8_dct8_ssse3, 128, 8, 0, 0
SUB_NxN_DCT  sub16x16_dct8_avx,   sub8x8_dct8_avx,   128, 8, 0, 0
%endif ; HIGH_BIT_DEPTH

%ifdef HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void add8x8_idct_dc( pixel *p_dst, dctcoef *dct2x2 )
;-----------------------------------------------------------------------------
%macro ADD_DC 2
    mova    m0, [%1+FDEC_STRIDEB*0] ; 8 pixels
    mova    m1, [%1+FDEC_STRIDEB*1]
    mova    m2, [%1+FDEC_STRIDEB*2]
    paddsw  m0, %2
    paddsw  m1, %2
    paddsw  m2, %2
    paddsw  %2, [%1+FDEC_STRIDEB*3]
    CLIPW   m0, m5, m6
    CLIPW   m1, m5, m6
    CLIPW   m2, m5, m6
    CLIPW   %2, m5, m6
    mova    [%1+FDEC_STRIDEB*0], m0
    mova    [%1+FDEC_STRIDEB*1], m1
    mova    [%1+FDEC_STRIDEB*2], m2
    mova    [%1+FDEC_STRIDEB*3], %2
%endmacro

%macro ADD_IDCT_DC 0
cglobal add8x8_idct_dc, 2,2,7
    mova        m6, [pw_pixel_max]
    pxor        m5, m5
    mova        m3, [r1]
    paddd       m3, [pd_32]
    psrad       m3, 6         ; dc0   0 dc1   0 dc2   0 dc3   0
    pshuflw     m4, m3, q2200 ; dc0 dc0 dc1 dc1   _   _   _   _
    pshufhw     m3, m3, q2200 ;   _   _   _   _ dc2 dc2 dc3 dc3
    pshufd      m4, m4, q1100 ; dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1
    pshufd      m3, m3, q3322 ; dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3
    ADD_DC r0+FDEC_STRIDEB*0, m4
    ADD_DC r0+FDEC_STRIDEB*4, m3
    RET

cglobal add16x16_idct_dc, 2,3,8
    mov         r2, 4
    mova        m6, [pw_pixel_max]
    mova        m7, [pd_32]
    pxor        m5, m5
.loop:
    mova        m3, [r1]
    paddd       m3, m7
    psrad       m3, 6         ; dc0   0 dc1   0 dc2   0 dc3   0
    pshuflw     m4, m3, q2200 ; dc0 dc0 dc1 dc1   _   _   _   _
    pshufhw     m3, m3, q2200 ;   _   _   _   _ dc2 dc2 dc3 dc3
    pshufd      m4, m4, q1100 ; dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1
    pshufd      m3, m3, q3322 ; dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3
    ADD_DC r0+FDEC_STRIDEB*0, m4
    ADD_DC r0+SIZEOF_PIXEL*8, m3
    add         r1, 16
    add         r0, 4*FDEC_STRIDEB
    dec         r2
    jg .loop
    REP_RET
%endmacro ; ADD_IDCT_DC

INIT_XMM sse2
ADD_IDCT_DC
INIT_XMM avx
ADD_IDCT_DC

%else ;!HIGH_BIT_DEPTH
%macro ADD_DC 3
    movq      mm4, [%3+FDEC_STRIDE*0]
    movq      mm5, [%3+FDEC_STRIDE*1]
    movq      mm6, [%3+FDEC_STRIDE*2]
    paddusb   mm4, %1
    paddusb   mm5, %1
    paddusb   mm6, %1
    paddusb    %1, [%3+FDEC_STRIDE*3]
    psubusb   mm4, %2
    psubusb   mm5, %2
    psubusb   mm6, %2
    psubusb    %1, %2
    movq      [%3+FDEC_STRIDE*0], mm4
    movq      [%3+FDEC_STRIDE*1], mm5
    movq      [%3+FDEC_STRIDE*2], mm6
    movq      [%3+FDEC_STRIDE*3], %1
%endmacro
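; Splitting the DC into a non-negative part and a negated negative part
; (packuswb clamps each to [0,255]) lets paddusb/psubusb add a signed DC
; to unsigned pixels with correct clamping and no widening to 16 bits.
; Scalar equivalent (sketch):
;
;   /* pix = clamp( pix + dc, 0, 255 ) via two saturating byte ops */
;   static uint8_t add_dc_u8( uint8_t pix, int dc )
;   {
;       unsigned pos = dc > 0 ?  dc : 0;                /* mm0: max(dc,0)  */
;       unsigned neg = dc < 0 ? -dc : 0;                /* mm1: max(-dc,0) */
;       unsigned t = pix + pos;  t = t > 255 ? 255 : t; /* paddusb */
;       return t > neg ? t - neg : 0;                   /* psubusb */
;   }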

INIT_MMX
cglobal add8x8_idct_dc_mmx, 2,2
    movq      mm0, [r1]
    pxor      mm1, mm1
    add        r0, FDEC_STRIDE*4
    paddw     mm0, [pw_32]
    psraw     mm0, 6
    psubw     mm1, mm0
    packuswb  mm0, mm0
    packuswb  mm1, mm1
    punpcklbw mm0, mm0
    punpcklbw mm1, mm1
    pshufw    mm2, mm0, q3322
    pshufw    mm3, mm1, q3322
    punpcklbw mm0, mm0
    punpcklbw mm1, mm1
    ADD_DC    mm0, mm1, r0-FDEC_STRIDE*4
    ADD_DC    mm2, mm3, r0
    RET

cglobal add8x8_idct_dc_ssse3, 2,2
    movq      xmm0, [r1]
    pxor      xmm1, xmm1
    add         r0, FDEC_STRIDE*4
    paddw     xmm0, [pw_32]
    psraw     xmm0, 6
    psubw     xmm1, xmm0
    movdqa    xmm5, [pb_idctdc_unpack]
    packuswb  xmm0, xmm0
    packuswb  xmm1, xmm1
    pshufb    xmm0, xmm5
    pshufb    xmm1, xmm5
    movq      xmm2, [r0+FDEC_STRIDE*-4]
    movq      xmm3, [r0+FDEC_STRIDE*-3]
    movq      xmm4, [r0+FDEC_STRIDE*-2]
    movq      xmm5, [r0+FDEC_STRIDE*-1]
    movhps    xmm2, [r0+FDEC_STRIDE* 0]
    movhps    xmm3, [r0+FDEC_STRIDE* 1]
    movhps    xmm4, [r0+FDEC_STRIDE* 2]
    movhps    xmm5, [r0+FDEC_STRIDE* 3]
    paddusb   xmm2, xmm0
    paddusb   xmm3, xmm0
    paddusb   xmm4, xmm0
    paddusb   xmm5, xmm0
    psubusb   xmm2, xmm1
    psubusb   xmm3, xmm1
    psubusb   xmm4, xmm1
    psubusb   xmm5, xmm1
    movq      [r0+FDEC_STRIDE*-4], xmm2
    movq      [r0+FDEC_STRIDE*-3], xmm3
    movq      [r0+FDEC_STRIDE*-2], xmm4
    movq      [r0+FDEC_STRIDE*-1], xmm5
    movhps    [r0+FDEC_STRIDE* 0], xmm2
    movhps    [r0+FDEC_STRIDE* 1], xmm3
    movhps    [r0+FDEC_STRIDE* 2], xmm4
    movhps    [r0+FDEC_STRIDE* 3], xmm5
    RET

cglobal add16x16_idct_dc_mmx, 2,3
    mov       r2, 4
.loop:
    movq      mm0, [r1]
    pxor      mm1, mm1
    paddw     mm0, [pw_32]
    psraw     mm0, 6
    psubw     mm1, mm0
    packuswb  mm0, mm0
    packuswb  mm1, mm1
    punpcklbw mm0, mm0
    punpcklbw mm1, mm1
    pshufw    mm2, mm0, q3322
    pshufw    mm3, mm1, q3322
    punpcklbw mm0, mm0
    punpcklbw mm1, mm1
    ADD_DC    mm0, mm1, r0
    ADD_DC    mm2, mm3, r0+8
    add       r1, 8
    add       r0, FDEC_STRIDE*4
    dec       r2
    jg .loop
    REP_RET

%macro IDCT_DC_STORE 3
    movdqa    xmm4, [r0+%1+FDEC_STRIDE*0]
    movdqa    xmm5, [r0+%1+FDEC_STRIDE*1]
    movdqa    xmm6, [r0+%1+FDEC_STRIDE*2]
    movdqa    xmm7, [r0+%1+FDEC_STRIDE*3]
    paddusb   xmm4, %2
    paddusb   xmm5, %2
    paddusb   xmm6, %2
    paddusb   xmm7, %2
    psubusb   xmm4, %3
    psubusb   xmm5, %3
    psubusb   xmm6, %3
    psubusb   xmm7, %3
    movdqa    [r0+%1+FDEC_STRIDE*0], xmm4
    movdqa    [r0+%1+FDEC_STRIDE*1], xmm5
    movdqa    [r0+%1+FDEC_STRIDE*2], xmm6
    movdqa    [r0+%1+FDEC_STRIDE*3], xmm7
%endmacro

INIT_XMM
cglobal add16x16_idct_dc_sse2, 2,2,8
    call .loop
    add       r0, FDEC_STRIDE*4
%ifdef WIN64
    call .loop
    RET
%endif
.loop:
    add       r0, FDEC_STRIDE*4
    movq      xmm0, [r1+0]
    movq      xmm2, [r1+8]
    add       r1, 16
    punpcklwd xmm0, xmm0
    punpcklwd xmm2, xmm2
    pxor      xmm3, xmm3
    paddw     xmm0, [pw_32]
    paddw     xmm2, [pw_32]
    psraw     xmm0, 6
    psraw     xmm2, 6
    psubw     xmm1, xmm3, xmm0
    packuswb  xmm0, xmm1
    psubw     xmm3, xmm2
    punpckhbw xmm1, xmm0, xmm0
    packuswb  xmm2, xmm3
    punpckhbw xmm3, xmm2, xmm2
    punpcklbw xmm0, xmm0
    punpcklbw xmm2, xmm2
    IDCT_DC_STORE FDEC_STRIDE*-4, xmm0, xmm1
    IDCT_DC_STORE 0, xmm2, xmm3
    ret

%macro ADD16x16 0
cglobal add16x16_idct_dc, 2,2,8
    call .loop
    add       r0, FDEC_STRIDE*4
%ifdef WIN64
    call .loop
    RET
%endif
.loop:
    add       r0, FDEC_STRIDE*4
    movdqa    xmm0, [r1]
    add       r1, 16
    pxor      xmm1, xmm1
    paddw     xmm0, [pw_32]
    psraw     xmm0, 6
    psubw     xmm1, xmm0
    movdqa    xmm5, [ pb_idctdc_unpack]
    movdqa    xmm6, [pb_idctdc_unpack2]
    packuswb  xmm0, xmm0
    packuswb  xmm1, xmm1
    pshufb    xmm2, xmm0, xmm6
    pshufb    xmm0, xmm5
    pshufb    xmm3, xmm1, xmm6
    pshufb    xmm1, xmm5
    IDCT_DC_STORE FDEC_STRIDE*-4, xmm0, xmm1
    IDCT_DC_STORE 0, xmm2, xmm3
    ret
%endmacro ; ADD16x16

INIT_XMM ssse3
ADD16x16
INIT_XMM avx
ADD16x16

%endif ; HIGH_BIT_DEPTH

;-----------------------------------------------------------------------------
; void sub8x8_dct_dc( int16_t dct[2][2], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------

%macro DCTDC_2ROW_MMX 3
    movq      %1, [r1+FENC_STRIDE*(0+%3)]
    movq      m1, [r1+FENC_STRIDE*(1+%3)]
    movq      m2, [r2+FDEC_STRIDE*(0+%3)]
    movq      m3, [r2+FDEC_STRIDE*(1+%3)]
    movq      %2, %1
    punpckldq %1, m1
    punpckhdq %2, m1
    movq      m1, m2
    punpckldq m2, m3
    punpckhdq m1, m3
    pxor      m3, m3
    psadbw    %1, m3
    psadbw    %2, m3
    psadbw    m2, m3
    psadbw    m1, m3
    psubw     %1, m2
    psubw     %2, m1
%endmacro
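; psadbw against a zero register horizontally sums eight unsigned bytes, so
; each DCTDC_2ROW_* invocation yields per-4x4-block pixel sums; the DC term
; of each sub-block is then simply sum(pix1) - sum(pix2), with no need for
; a full transform.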

%macro DCT2x2 2 ; reg s1/s0 (!=m1), reg s3/s2
    pshufw    mm1, %1, q2200  ;  s1  s1  s0  s0
    pshufw    mm0, %2, q2301  ;  s3  __  s2  __
    paddw     mm1, %2         ;  s1 s13  s0 s02
    psubw     mm1, mm0        ; d13 s13 d02 s02
    pshufw    mm0, mm1, q1010 ; d02 s02 d02 s02
    psrlq     mm1, 32         ;  __  __ d13 s13
    paddw     mm0, mm1        ; d02 s02 d02+d13 s02+s13
    psllq     mm1, 32         ; d13 s13
    psubw     mm0, mm1        ; d02-d13 s02-s13 d02+d13 s02+s13
%endmacro
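; DCT2x2 is the 2x2 Hadamard transform H*S*H with H = [1 1; 1 -1], applied
; to the four 4x4 DC sums s0..s3.  Scalar equivalent, up to the coefficient
; ordering noted in the comments above (sketch):
;
;   static void dct2x2dc_ref( int16_t d[4] )  /* d = { s0, s1, s2, s3 } */
;   {
;       int16_t t0 = d[0] + d[1], t1 = d[0] - d[1];
;       int16_t t2 = d[2] + d[3], t3 = d[2] - d[3];
;       d[0] = t0 + t2;  d[1] = t1 + t3;
;       d[2] = t0 - t2;  d[3] = t1 - t3;
;   }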

%ifndef HIGH_BIT_DEPTH
INIT_MMX
cglobal sub8x8_dct_dc_mmx2, 3,3
    DCTDC_2ROW_MMX m0, m4, 0
    DCTDC_2ROW_MMX m5, m6, 2
    paddw     m0, m5
    paddw     m4, m6
    punpckldq m0, m4
    add       r1, FENC_STRIDE*4
    add       r2, FDEC_STRIDE*4
    DCTDC_2ROW_MMX m7, m4, 0
    DCTDC_2ROW_MMX m5, m6, 2
    paddw     m7, m5
    paddw     m4, m6
    punpckldq m7, m4
    DCT2x2    m0, m7
    movq    [r0], m0
    ret

INIT_XMM
%macro DCTDC_2ROW_SSE2 3
    movq      m0, [r1+FENC_STRIDE*(0+%1)]
    movq      m1, [r1+FENC_STRIDE*(1+%1)]
    movq      m2, [r2+FDEC_STRIDE*(0+%1)]
    movq      m3, [r2+FDEC_STRIDE*(1+%1)]
    punpckldq m0, m1
    punpckldq m2, m3
    psadbw    m0, m7
    psadbw    m2, m7
%if %2
    paddw     %3, m0
    paddw     m6, m2
%else
    SWAP      %3, m0
    SWAP      m6, m2
%endif
%endmacro

cglobal sub8x8_dct_dc_sse2, 3,3,8
    pxor     m7, m7
    DCTDC_2ROW_SSE2 0, 0, m4
    DCTDC_2ROW_SSE2 2, 1, m4
    add      r1, FENC_STRIDE*4
    add      r2, FDEC_STRIDE*4
    psubd    m4, m6
    DCTDC_2ROW_SSE2 0, 0, m5
    DCTDC_2ROW_SSE2 2, 1, m5
    psubd    m5, m6
    packssdw m4, m5
    movhlps  m5, m4
    movdq2q mm0, m4
    movdq2q mm7, m5
    DCT2x2  mm0, mm7
    movq   [r0], mm0
    RET
%endif ; !HIGH_BIT_DEPTH

;-----------------------------------------------------------------------------
; void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[8][8] )
;-----------------------------------------------------------------------------
%macro SCAN_8x8 0
cglobal zigzag_scan_8x8_frame, 2,2,8
    movdqa    xmm0, [r1]
    movdqa    xmm1, [r1+16]
    movdq2q    mm0, xmm0
    PALIGNR   xmm1, xmm1, 14, xmm2
    movdq2q    mm1, xmm1

    movdqa    xmm2, [r1+32]
    movdqa    xmm3, [r1+48]
    PALIGNR   xmm2, xmm2, 12, xmm4
    movdq2q    mm2, xmm2
    PALIGNR   xmm3, xmm3, 10, xmm4
    movdq2q    mm3, xmm3

    punpckhwd xmm0, xmm1
    punpckhwd xmm2, xmm3

    movq       mm4, mm1
    movq       mm5, mm1
    movq       mm6, mm2
    movq       mm7, mm3
    punpckhwd  mm1, mm0
    psllq      mm0, 16
    psrlq      mm3, 16
    punpckhdq  mm1, mm1
    punpckhdq  mm2, mm0
    punpcklwd  mm0, mm4
    punpckhwd  mm4, mm3
    punpcklwd  mm4, mm2
    punpckhdq  mm0, mm2
    punpcklwd  mm6, mm3
    punpcklwd  mm5, mm7
    punpcklwd  mm5, mm6

    movdqa    xmm4, [r1+64]
    movdqa    xmm5, [r1+80]
    movdqa    xmm6, [r1+96]
    movdqa    xmm7, [r1+112]

    movq [r0+2*00], mm0
    movq [r0+2*04], mm4
    movd [r0+2*08], mm1
    movq [r0+2*36], mm5
    movq [r0+2*46], mm6

    PALIGNR   xmm4, xmm4, 14, xmm3
    movdq2q    mm4, xmm4
    PALIGNR   xmm5, xmm5, 12, xmm3
    movdq2q    mm5, xmm5
    PALIGNR   xmm6, xmm6, 10, xmm3
    movdq2q    mm6, xmm6
%if cpuflag(ssse3)
    PALIGNR   xmm7, xmm7, 8, xmm3
    movdq2q    mm7, xmm7
%else
    movhlps   xmm3, xmm7
    punpcklqdq xmm7, xmm7
    movdq2q    mm7, xmm3
%endif

    punpckhwd xmm4, xmm5
    punpckhwd xmm6, xmm7

    movq       mm0, mm4
    movq       mm1, mm5
    movq       mm3, mm7
    punpcklwd  mm7, mm6
    psrlq      mm6, 16
    punpcklwd  mm4, mm6
    punpcklwd  mm5, mm4
    punpckhdq  mm4, mm3
    punpcklwd  mm3, mm6
    punpckhwd  mm3, mm4
    punpckhwd  mm0, mm1
    punpckldq  mm4, mm0
    punpckhdq  mm0, mm6
    pshufw     mm4, mm4, q1230

    movq [r0+2*14], mm4
    movq [r0+2*25], mm0
    movd [r0+2*54], mm7
    movq [r0+2*56], mm5
    movq [r0+2*60], mm3

    punpckhdq xmm3, xmm0, xmm2
    punpckldq xmm0, xmm2
    punpckhdq xmm7, xmm4, xmm6
    punpckldq xmm4, xmm6
    pshufhw   xmm0, xmm0, q0123
    pshuflw   xmm4, xmm4, q0123
    pshufhw   xmm3, xmm3, q0123
    pshuflw   xmm7, xmm7, q0123

    movlps [r0+2*10], xmm0
    movhps [r0+2*17], xmm0
    movlps [r0+2*21], xmm3
    movlps [r0+2*28], xmm4
    movhps [r0+2*32], xmm3
    movhps [r0+2*39], xmm4
    movlps [r0+2*43], xmm7
    movhps [r0+2*50], xmm7

    RET
%endmacro

%ifndef HIGH_BIT_DEPTH
INIT_XMM sse2
SCAN_8x8
INIT_XMM ssse3
SCAN_8x8
%endif

;-----------------------------------------------------------------------------
; void zigzag_scan_8x8_frame( dctcoef level[64], dctcoef dct[8][8] )
;-----------------------------------------------------------------------------
%macro SCAN_8x8_FRAME 5
cglobal zigzag_scan_8x8_frame, 2,2,8
    mova        m0, [r1]
    mova        m1, [r1+ 8*SIZEOF_DCTCOEF]
    movu        m2, [r1+14*SIZEOF_DCTCOEF]
    movu        m3, [r1+21*SIZEOF_DCTCOEF]
    mova        m4, [r1+28*SIZEOF_DCTCOEF]
    punpckl%4   m5, m0, m1
    psrl%2      m0, %1
    punpckh%4   m6, m1, m0
    punpckl%3   m5, m0
    punpckl%3   m1, m1
    punpckh%4   m1, m3
    mova        m7, [r1+52*SIZEOF_DCTCOEF]
    mova        m0, [r1+60*SIZEOF_DCTCOEF]
    punpckh%4   m1, m2
    punpckl%4   m2, m4
    punpckh%4   m4, m3
    punpckl%3   m3, m3
    punpckh%4   m3, m2
    mova      [r0], m5
    mova  [r0+ 4*SIZEOF_DCTCOEF], m1
    mova  [r0+ 8*SIZEOF_DCTCOEF], m6
    punpckl%4   m6, m0
    punpckl%4   m6, m7
    mova        m1, [r1+32*SIZEOF_DCTCOEF]
    movu        m5, [r1+39*SIZEOF_DCTCOEF]
    movu        m2, [r1+46*SIZEOF_DCTCOEF]
    movu [r0+35*SIZEOF_DCTCOEF], m3
    movu [r0+47*SIZEOF_DCTCOEF], m4
    punpckh%4   m7, m0
    psll%2      m0, %1
    punpckh%3   m3, m5, m5
    punpckl%4   m5, m1
    punpckh%4   m1, m2
    mova [r0+52*SIZEOF_DCTCOEF], m6
    movu [r0+13*SIZEOF_DCTCOEF], m5
    movu        m4, [r1+11*SIZEOF_DCTCOEF]
    movu        m6, [r1+25*SIZEOF_DCTCOEF]
    punpckl%4   m5, m7
    punpckl%4   m1, m3
    punpckh%3   m0, m7
    mova        m3, [r1+ 4*SIZEOF_DCTCOEF]
    movu        m7, [r1+18*SIZEOF_DCTCOEF]
    punpckl%4   m2, m5
    movu [r0+25*SIZEOF_DCTCOEF], m1
    mova        m1, m4
    mova        m5, m6
    punpckl%4   m4, m3
    punpckl%4   m6, m7
    punpckh%4   m1, m3
    punpckh%4   m5, m7
    punpckh%3   m3, m6, m4
    punpckh%3   m7, m5, m1
    punpckl%3   m6, m4
    punpckl%3   m5, m1
    movu        m4, [r1+35*SIZEOF_DCTCOEF]
    movu        m1, [r1+49*SIZEOF_DCTCOEF]
    pshuf%5     m6, m6, q0123
    pshuf%5     m5, m5, q0123
    mova [r0+60*SIZEOF_DCTCOEF], m0
    mova [r0+56*SIZEOF_DCTCOEF], m2
    movu        m0, [r1+42*SIZEOF_DCTCOEF]
    mova        m2, [r1+56*SIZEOF_DCTCOEF]
    movu [r0+17*SIZEOF_DCTCOEF], m3
    mova [r0+32*SIZEOF_DCTCOEF], m7
    movu [r0+10*SIZEOF_DCTCOEF], m6
    movu [r0+21*SIZEOF_DCTCOEF], m5
    punpckh%4   m3, m0, m4
    punpckh%4   m7, m2, m1
    punpckl%4   m0, m4
    punpckl%4   m2, m1
    punpckl%3   m4, m2, m0
    punpckl%3   m1, m7, m3
    punpckh%3   m2, m0
    punpckh%3   m7, m3
    pshuf%5     m2, m2, q0123
    pshuf%5     m7, m7, q0123
    mova [r0+28*SIZEOF_DCTCOEF], m4
    movu [r0+43*SIZEOF_DCTCOEF], m1
    movu [r0+39*SIZEOF_DCTCOEF], m2
    movu [r0+50*SIZEOF_DCTCOEF], m7
    RET
%endmacro

%ifdef HIGH_BIT_DEPTH
INIT_XMM sse2
SCAN_8x8_FRAME 4 , dq, qdq, dq, d
INIT_XMM avx
SCAN_8x8_FRAME 4 , dq, qdq, dq, d
%else
INIT_MMX mmx2
SCAN_8x8_FRAME 16, q , dq , wd, w
%endif

;-----------------------------------------------------------------------------
; void zigzag_scan_4x4_frame( dctcoef level[16], dctcoef dct[4][4] )
;-----------------------------------------------------------------------------
%macro SCAN_4x4 4
cglobal zigzag_scan_4x4_frame, 2,2,8*(mmsize)/16
    mova       m0, [r1]
    mova       m1, [r1+ 4*SIZEOF_DCTCOEF]
    mova       m2, [r1+ 8*SIZEOF_DCTCOEF]
    mova       m3, [r1+12*SIZEOF_DCTCOEF]
    punpckl%4  m4, m0, m1
    mova       m5, m1
    mova       m6, m2
    mova       m7, m3
    psll%2     m3, %1
    psrl%2     m0, %1
    punpckl%3  m2, m2
    punpckh%3  m1, m1
    punpckl%4  m5, m3
    punpckl%3  m4, m0
    punpckh%4  m5, m2
    punpckh%4  m0, m6
    punpckh%4  m6, m7
    punpckl%4  m1, m0
    punpckh%3  m3, m6
    mova     [r0], m4
    mova  [r0+ 4*SIZEOF_DCTCOEF], m5
    mova  [r0+ 8*SIZEOF_DCTCOEF], m1
    mova  [r0+12*SIZEOF_DCTCOEF], m3
    RET
%endmacro

%ifdef HIGH_BIT_DEPTH
INIT_XMM sse2
SCAN_4x4 4 , dq, qdq, dq
INIT_XMM avx
SCAN_4x4 4 , dq, qdq, dq
%else
INIT_MMX mmx
SCAN_4x4 16, q , dq , wd

;-----------------------------------------------------------------------------
; void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[4][4] )
;-----------------------------------------------------------------------------
%macro SCAN_4x4_FRAME 0
cglobal zigzag_scan_4x4_frame, 2,2
    movdqa    xmm1, [r1+16]
    movdqa    xmm0, [r1]
    pshufb    xmm1, [pb_scan4frameb]
    pshufb    xmm0, [pb_scan4framea]
    psrldq    xmm2, xmm1, 6
    palignr   xmm1, xmm0, 6
    pslldq    xmm0, 10
    palignr   xmm2, xmm0, 10
    movdqa    [r0], xmm1
    movdqa [r0+16], xmm2
    RET
%endmacro

INIT_XMM ssse3
SCAN_4x4_FRAME
INIT_XMM avx
SCAN_4x4_FRAME

INIT_XMM xop
cglobal zigzag_scan_4x4_frame, 2,2
    mova   m0, [r1+ 0]
    mova   m1, [r1+16]
    vpperm m2, m0, m1, [pb_scan4frame2a]
    vpperm m1, m0, m1, [pb_scan4frame2b]
    mova [r0+ 0], m2
    mova [r0+16], m1
    RET
%endif ; !HIGH_BIT_DEPTH

%ifdef HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void zigzag_scan_4x4_field( int32_t level[16], int32_t dct[4][4] )
;-----------------------------------------------------------------------------
INIT_XMM
cglobal zigzag_scan_4x4_field_sse2, 2,3
    movu       m4, [r1+8]
    pshufd     m0, m4, q3102
    mova       m1, [r1+32]
    mova       m2, [r1+48]
    movu   [r0+8], m0
    mova  [r0+32], m1
    mova  [r0+48], m2
    movq      mm0, [r1]
    movq     [r0], mm0
    movq      mm0, [r1+24]
    movq  [r0+24], mm0
    RET
%else
;-----------------------------------------------------------------------------
; void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[4][4] )
;-----------------------------------------------------------------------------
; sse2 is only 1 cycle faster, and ssse3/pshufb is slower on core2
INIT_MMX
cglobal zigzag_scan_4x4_field_mmx2, 2,3
    pshufw     mm0, [r1+4], q3102
    movq       mm1, [r1+16]
    movq       mm2, [r1+24]
    movq    [r0+4], mm0
    movq   [r0+16], mm1
    movq   [r0+24], mm2
    mov        r2d, [r1]
    mov       [r0], r2d
    mov        r2d, [r1+12]
    mov    [r0+12], r2d
    RET
%endif ; HIGH_BIT_DEPTH

;-----------------------------------------------------------------------------
; void zigzag_scan_8x8_field( int16_t level[64], int16_t dct[8][8] )
;-----------------------------------------------------------------------------
; Output order:
;  0  1  2  8  9  3  4 10
; 16 11  5  6  7 12 17 24
; 18 13 14 15 19 25 32 26
; 20 21 22 23 27 33 40 34
; 28 29 30 31 35 41 48 42
; 36 37 38 39 43 49 50 44
; 45 46 47 51 56 57 52 53
; 54 55 58 59 60 61 62 63
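; Scalar equivalent (sketch; scan_field[] is the 64-entry table spelled
; out by the grid above):
;
;   for( int i = 0; i < 64; i++ )
;       level[i] = dct[ scan_field[i] ];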
%undef SCAN_8x8
%macro SCAN_8x8 5
cglobal zigzag_scan_8x8_field, 2,3,8
    mova       m0, [r1+ 0*SIZEOF_DCTCOEF]       ; 03 02 01 00
    mova       m1, [r1+ 4*SIZEOF_DCTCOEF]       ; 07 06 05 04
    mova       m2, [r1+ 8*SIZEOF_DCTCOEF]       ; 11 10 09 08
    pshuf%1    m3, m0, q3333                    ; 03 03 03 03
    movd       r2, m2                           ; 09 08
    pshuf%1    m2, m2, q0321                    ; 08 11 10 09
    punpckl%2  m3, m1                           ; 05 03 04 03
    pinsr%1    m0, r2d, 3                       ; 08 02 01 00
    punpckl%2  m4, m2, m3                       ; 04 10 03 09
    pshuf%1    m4, m4, q2310                    ; 10 04 03 09
    mova  [r0+ 0*SIZEOF_DCTCOEF], m0            ; 08 02 01 00
    mova  [r0+ 4*SIZEOF_DCTCOEF], m4            ; 10 04 03 09
    mova       m3, [r1+12*SIZEOF_DCTCOEF]       ; 15 14 13 12
    mova       m5, [r1+16*SIZEOF_DCTCOEF]       ; 19 18 17 16
    punpckl%3  m6, m5                           ; 17 16 XX XX
    psrl%4     m1, %5                           ; XX 07 06 05
    punpckh%2  m6, m2                           ; 08 17 11 16
    punpckl%3  m6, m1                           ; 06 05 11 16
    mova  [r0+ 8*SIZEOF_DCTCOEF], m6            ; 06 05 11 16
    psrl%4     m1, %5                           ; XX XX 07 06
    punpckl%2  m1, m5                           ; 17 07 16 06
    mova       m0, [r1+20*SIZEOF_DCTCOEF]       ; 23 22 21 20
    mova       m2, [r1+24*SIZEOF_DCTCOEF]       ; 27 26 25 24
    punpckh%3  m1, m1                           ; 17 07 17 07
    punpckl%2  m6, m3, m2                       ; 25 13 24 12
    pextr%1    r2d, m5, 2
    mova [r0+24*SIZEOF_DCTCOEF], m0             ; 23 22 21 20
    punpckl%2  m1, m6                           ; 24 17 12 07
    mova [r0+12*SIZEOF_DCTCOEF], m1
    pinsr%1    m3, r2d, 0                       ; 15 14 13 18
    mova [r0+16*SIZEOF_DCTCOEF], m3             ; 15 14 13 18
    mova       m7, [r1+28*SIZEOF_DCTCOEF]
    mova       m0, [r1+32*SIZEOF_DCTCOEF]       ; 35 34 33 32
    psrl%4     m5, %5*3                         ; XX XX XX 19
    pshuf%1    m1, m2, q3321                    ; 27 27 26 25
    punpckl%2  m5, m0                           ; 33 XX 32 19
    psrl%4     m2, %5*3                         ; XX XX XX 27
    punpckl%2  m5, m1                           ; 26 32 25 19
    mova [r0+32*SIZEOF_DCTCOEF], m7
    mova [r0+20*SIZEOF_DCTCOEF], m5             ; 26 32 25 19
    mova       m7, [r1+36*SIZEOF_DCTCOEF]
    mova       m1, [r1+40*SIZEOF_DCTCOEF]       ; 43 42 41 40
    pshuf%1    m3, m0, q3321                    ; 35 35 34 33
    punpckl%2  m2, m1                           ; 41 XX 40 27
    mova [r0+40*SIZEOF_DCTCOEF], m7
    punpckl%2  m2, m3                           ; 34 40 33 27
    mova [r0+28*SIZEOF_DCTCOEF], m2
    mova       m7, [r1+44*SIZEOF_DCTCOEF]       ; 47 46 45 44
    mova       m2, [r1+48*SIZEOF_DCTCOEF]       ; 51 50 49 48
    psrl%4     m0, %5*3                         ; XX XX XX 35
    punpckl%2  m0, m2                           ; 49 XX 48 35
    pshuf%1    m3, m1, q3321                    ; 43 43 42 41
    punpckl%2  m0, m3                           ; 42 48 41 35
    mova [r0+36*SIZEOF_DCTCOEF], m0
    pextr%1     r2d, m2, 3                      ; 51
    psrl%4      m1, %5*3                        ; XX XX XX 43
    punpckl%2   m1, m7                          ; 45 XX 44 43
    psrl%4      m2, %5                          ; XX 51 50 49
    punpckl%2   m1, m2                          ; 50 44 49 43
    pshuf%1     m1, m1, q2310                   ; 44 50 49 43
    mova [r0+44*SIZEOF_DCTCOEF], m1
    psrl%4      m7, %5                          ; XX 47 46 45
    pinsr%1     m7, r2d, 3                      ; 51 47 46 45
    mova [r0+48*SIZEOF_DCTCOEF], m7
    mova        m0, [r1+56*SIZEOF_DCTCOEF]      ; 59 58 57 56
    mova        m1, [r1+52*SIZEOF_DCTCOEF]      ; 55 54 53 52
    mova        m7, [r1+60*SIZEOF_DCTCOEF]
    punpckl%3   m2, m0, m1                      ; 53 52 57 56
    punpckh%3   m1, m0                          ; 59 58 55 54
    mova [r0+52*SIZEOF_DCTCOEF], m2
    mova [r0+56*SIZEOF_DCTCOEF], m1
    mova [r0+60*SIZEOF_DCTCOEF], m7
    RET
%endmacro
%ifdef HIGH_BIT_DEPTH
INIT_XMM sse4
SCAN_8x8 d, dq, qdq, dq, 4
INIT_XMM avx
SCAN_8x8 d, dq, qdq, dq, 4
%else
INIT_MMX mmx2
SCAN_8x8 w, wd, dq , q , 16
%endif

;-----------------------------------------------------------------------------
; void zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *src, uint8_t *dst )
;-----------------------------------------------------------------------------
%macro ZIGZAG_SUB_4x4 2
%ifidn %1, ac
cglobal zigzag_sub_4x4%1_%2, 4,4,8
%else
cglobal zigzag_sub_4x4%1_%2, 3,3,8
%endif
    movd      xmm0, [r1+0*FENC_STRIDE]
    movd      xmm1, [r1+1*FENC_STRIDE]
    movd      xmm2, [r1+2*FENC_STRIDE]
    movd      xmm3, [r1+3*FENC_STRIDE]
    movd      xmm4, [r2+0*FDEC_STRIDE]
    movd      xmm5, [r2+1*FDEC_STRIDE]
    movd      xmm6, [r2+2*FDEC_STRIDE]
    movd      xmm7, [r2+3*FDEC_STRIDE]
    movd      [r2+0*FDEC_STRIDE], xmm0
    movd      [r2+1*FDEC_STRIDE], xmm1
    movd      [r2+2*FDEC_STRIDE], xmm2
    movd      [r2+3*FDEC_STRIDE], xmm3
    punpckldq xmm0, xmm1
    punpckldq xmm2, xmm3
    punpckldq xmm4, xmm5
    punpckldq xmm6, xmm7
    punpcklqdq xmm0, xmm2
    punpcklqdq xmm4, xmm6
%ifidn %2, frame
    movdqa    xmm7, [pb_sub4frame]
%else
    movdqa    xmm7, [pb_sub4field]
%endif
    pshufb    xmm0, xmm7
    pshufb    xmm4, xmm7
    pxor      xmm6, xmm6
    punpckhbw xmm1, xmm0, xmm6
    punpckhbw xmm5, xmm4, xmm6
    punpcklbw xmm0, xmm6
    punpcklbw xmm4, xmm6
    psubw     xmm0, xmm4
    psubw     xmm1, xmm5
%ifidn %1, ac
    movd       r2d, xmm0
    pand      xmm0, [pb_subacmask]
%endif
    movdqa    [r0], xmm0
    pxor      xmm2, xmm2
    movdqa [r0+16], xmm1
    por       xmm0, xmm1
    pcmpeqb   xmm0, xmm2
    pmovmskb   eax, xmm0
%ifidn %1, ac
    mov       [r3], r2w
%endif
    sub        eax, 0xffff
    shr        eax, 31
    RET
%endmacro
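; ZIGZAG_SUB_4x4 fuses three steps: it computes the 4x4 residual
; fenc - fdec, overwrites the fdec block with the fenc pixels, and applies
; the zigzag permutation with pshufb while the data is still packed bytes.
; The ac variants additionally store the DC coefficient to *dc and mask it
; out of level[].  The pcmpeqb/pmovmskb/sub/shr tail is a branchless
; "any nonzero coefficient?" reduction, equivalent to (sketch):
;
;   int nz = 0;
;   for( int i = 0; i < 16; i++ )
;       nz |= level[i];
;   return !!nz;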

%ifndef HIGH_BIT_DEPTH
INIT_XMM ssse3
ZIGZAG_SUB_4x4   , frame
ZIGZAG_SUB_4x4 ac, frame
ZIGZAG_SUB_4x4   , field
ZIGZAG_SUB_4x4 ac, field
INIT_XMM avx
ZIGZAG_SUB_4x4   , frame
ZIGZAG_SUB_4x4 ac, frame
ZIGZAG_SUB_4x4   , field
ZIGZAG_SUB_4x4 ac, field
%endif ; !HIGH_BIT_DEPTH

;-----------------------------------------------------------------------------
; void zigzag_interleave_8x8_cavlc( int16_t *dst, int16_t *src, uint8_t *nnz )
;-----------------------------------------------------------------------------
%macro INTERLEAVE 2
    mova     m0, [r1+(%1*4+ 0)*SIZEOF_PIXEL]
    mova     m1, [r1+(%1*4+ 8)*SIZEOF_PIXEL]
    mova     m2, [r1+(%1*4+16)*SIZEOF_PIXEL]
    mova     m3, [r1+(%1*4+24)*SIZEOF_PIXEL]
    TRANSPOSE4x4%2 0,1,2,3,4
    mova     [r0+(%1+ 0)*SIZEOF_PIXEL], m0
    mova     [r0+(%1+32)*SIZEOF_PIXEL], m1
    mova     [r0+(%1+64)*SIZEOF_PIXEL], m2
    mova     [r0+(%1+96)*SIZEOF_PIXEL], m3
    packsswb m0, m1
%if %1
    por      m6, m2
    por      m7, m3
    por      m5, m0
%else
    SWAP      5, 0
    SWAP      6, 2
    SWAP      7, 3
%endif
%endmacro

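; zigzag_interleave_8x8_cavlc splits an 8x8 coefficient block into the four
; 4x4 scans CAVLC expects (a word-level 4x4 transpose per 16 coefficients)
; and records per-4x4 nonzero flags: after the packsswb reduction, pcmpeqb
; against zero yields 0xFF for an all-zero block, and paddb [pb_1] turns
; each nnz byte into 0 (all zero) or 1 (some coefficient nonzero).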
%macro ZIGZAG_8x8_CAVLC 1
cglobal zigzag_interleave_8x8_cavlc, 3,3,8
    INTERLEAVE  0, %1
    INTERLEAVE  8, %1
    INTERLEAVE 16, %1
    INTERLEAVE 24, %1
    packsswb   m6, m7
    packsswb   m5, m6
    packsswb   m5, m5
    pxor       m0, m0
%ifdef HIGH_BIT_DEPTH
    packsswb   m5, m5
%endif
    pcmpeqb    m5, m0
    paddb      m5, [pb_1]
    movd      r0d, m5
    mov    [r2+0], r0w
    shr       r0d, 16
    mov    [r2+8], r0w
    RET
%endmacro

%ifdef HIGH_BIT_DEPTH
INIT_XMM sse2
ZIGZAG_8x8_CAVLC D
INIT_XMM avx
ZIGZAG_8x8_CAVLC D
%else
INIT_MMX mmx
ZIGZAG_8x8_CAVLC W
%endif

%macro INTERLEAVE_XMM 1
    mova   m0, [r1+%1*4+ 0]
    mova   m1, [r1+%1*4+16]
    mova   m4, [r1+%1*4+32]
    mova   m5, [r1+%1*4+48]
    SBUTTERFLY wd, 0, 1, 6
    SBUTTERFLY wd, 4, 5, 7
    SBUTTERFLY wd, 0, 1, 6
    SBUTTERFLY wd, 4, 5, 7
    movq   [r0+%1+  0], m0
    movhps [r0+%1+ 32], m0
    movq   [r0+%1+ 64], m1
    movhps [r0+%1+ 96], m1
    movq   [r0+%1+  8], m4
    movhps [r0+%1+ 40], m4
    movq   [r0+%1+ 72], m5
    movhps [r0+%1+104], m5
%if %1
    por    m2, m0
    por    m3, m1
    por    m2, m4
    por    m3, m5
%else
    SWAP 0,2
    SWAP 3,1
    por    m2, m4
    por    m3, m5
%endif
%endmacro

%ifndef HIGH_BIT_DEPTH
%macro ZIGZAG_8x8_CAVLC 0
cglobal zigzag_interleave_8x8_cavlc, 3,3,8
    INTERLEAVE_XMM  0
    INTERLEAVE_XMM 16
    packsswb m2, m3
    pxor     m5, m5
    packsswb m2, m2
    packsswb m2, m2
    pcmpeqb  m5, m2
    paddb    m5, [pb_1]
    movd    r0d, m5
    mov  [r2+0], r0w
    shr     r0d, 16
    mov  [r2+8], r0w
    RET
%endmacro

INIT_XMM sse2
ZIGZAG_8x8_CAVLC
INIT_XMM avx
ZIGZAG_8x8_CAVLC
%endif ; !HIGH_BIT_DEPTH
File: x264-snapshot-20120103-2245-stable/common/x86/dct-64.asm
;*****************************************************************************
;* dct-64.asm: x86_64 transform and zigzag
;*****************************************************************************
;* Copyright (C) 2003-2011 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Holger Lubitz <holger@lubitz.org>
;*          Laurent Aimar <fenrir@via.ecp.fr>
;*          Min Chen <chenm001@163.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION .text

%ifndef HIGH_BIT_DEPTH
cextern pw_32
cextern hsub_mul

%macro DCT8_1D 10
    SUMSUB_BA w, %5, %4 ; %5=s34, %4=d34
    SUMSUB_BA w, %6, %3 ; %6=s25, %3=d25
    SUMSUB_BA w, %7, %2 ; %7=s16, %2=d16
    SUMSUB_BA w, %8, %1 ; %8=s07, %1=d07

    SUMSUB_BA w, %6, %7, %10 ; %6=a1, %7=a3
    SUMSUB_BA w, %5, %8, %10 ; %5=a0, %8=a2

    psraw   m%9, m%1, 1
    paddw   m%9, m%1
    paddw   m%9, m%2
    paddw   m%9, m%3 ; %9=a4

    psraw   m%10, m%4, 1
    paddw   m%10, m%4
    paddw   m%10, m%2
    psubw   m%10, m%3 ; %10=a7

    SUMSUB_BA w, %4, %1
    psubw   m%1, m%3
    psubw   m%4, m%2
    psraw   m%3, 1
    psraw   m%2, 1
    psubw   m%1, m%3 ; %1=a5
    psubw   m%4, m%2 ; %4=a6

    psraw   m%2, m%10, 2
    paddw   m%2, m%9 ; %2=b1
    psraw   m%9, 2
    psubw   m%9, m%10 ; %9=b7

    SUMSUB_BA w, %6, %5, %10 ; %6=b0, %5=b4

    psraw   m%3, m%7, 1
    paddw   m%3, m%8 ; %3=b2
    psraw   m%8, 1
    psubw   m%8, m%7 ; %8=b6

    psraw   m%7, m%4, 2
    paddw   m%7, m%1 ; %7=b3
    psraw   m%1, 2
    psubw   m%4, m%1 ; %4=b5

    SWAP %1, %6, %4, %7, %8, %9
%endmacro
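; Reference equations for the 1-D stage above, as named by the macro's
; comments (s_ij = x_i + x_j, d_ij = x_i - x_j):
;   a0 = s07 + s34                        a1 = s16 + s25
;   a2 = s07 - s34                        a3 = s16 - s25
;   a4 = d16 + d25 + (d07 + (d07>>1))     a5 = d07 - d34 - (d25 + (d25>>1))
;   a6 = d07 + d34 - (d16 + (d16>>1))     a7 = d16 - d25 + (d34 + (d34>>1))
;   dst0 = a0 + a1        dst4 = a0 - a1
;   dst2 = a2 + (a3>>1)   dst6 = (a2>>1) - a3
;   dst1 = a4 + (a7>>2)   dst3 = a5 + (a6>>2)
;   dst5 = a6 - (a5>>2)   dst7 = (a4>>2) - a7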

%macro IDCT8_1D 10
    SUMSUB_BA w, %5, %1, %9 ; %5=a0, %1=a2

    psraw   m%9, m%2, 1
    paddw   m%9, m%2
    paddw   m%9, m%4
    paddw   m%9, m%6  ; %9=a7

    psraw   m%10, m%3, 1
    psubw   m%10, m%7 ; %10=a4
    psraw   m%7, 1
    paddw   m%7, m%3  ; %7=a6

    psraw   m%3, m%6, 1
    paddw   m%3, m%6
    paddw   m%3, m%8
    psubw   m%3, m%2  ; %3=a5

    psubw   m%2, m%4
    psubw   m%6, m%4
    paddw   m%2, m%8
    psubw   m%6, m%8
    psraw   m%4, 1
    psraw   m%8, 1
    psubw   m%2, m%4  ; %2=a3
    psubw   m%6, m%8  ; %6=a1

    psraw   m%4, m%9, 2
    paddw   m%4, m%6  ; %4=b1
    psraw   m%6, 2
    psubw   m%9, m%6  ; %9=b7

    SUMSUB_BA w, %7, %5, %6  ;  %7=b0, %5=b6
    SUMSUB_BA w, %10, %1, %6 ; %10=b2, %1=b4

    psraw   m%8, m%3, 2
    paddw   m%8, m%2 ; %8=b3
    psraw   m%2, 2
    psubw   m%2, m%3 ; %2=b5

    SUMSUB_BA w, %9, %7, %6  ; %9=c0,  %7=c7
    SUMSUB_BA w, %2, %10, %6 ; %2=c1, %10=c6
    SUMSUB_BA w, %8, %1, %6  ; %8=c2,  %1=c5
    SUMSUB_BA w, %4, %5, %6  ; %4=c3,  %5=c4

    SWAP %10, %3
    SWAP  %1, %9, %6
    SWAP  %3, %8, %7
%endmacro

%macro DCT_SUB8 0
cglobal sub8x8_dct, 3,3,11
    add r2, 4*FDEC_STRIDE
%if cpuflag(ssse3)
    mova m7, [hsub_mul]
%endif
%ifdef WIN64
    call .skip_prologue
    RET
%endif
global current_function %+ .skip_prologue
.skip_prologue:
    SWAP 7, 9
    LOAD_DIFF8x4 0, 1, 2, 3, 8, 9, r1, r2-4*FDEC_STRIDE
    LOAD_DIFF8x4 4, 5, 6, 7, 8, 9, r1, r2-4*FDEC_STRIDE
    DCT4_1D 0, 1, 2, 3, 8
    TRANSPOSE2x4x4W 0, 1, 2, 3, 8
    DCT4_1D 4, 5, 6, 7, 8
    TRANSPOSE2x4x4W 4, 5, 6, 7, 8
    DCT4_1D 0, 1, 2, 3, 8
    STORE_DCT 0, 1, 2, 3, r0, 0
    DCT4_1D 4, 5, 6, 7, 8
    STORE_DCT 4, 5, 6, 7, r0, 64
    ret

;-----------------------------------------------------------------------------
; void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
cglobal sub8x8_dct8, 3,3,11
    add r2, 4*FDEC_STRIDE
%if cpuflag(ssse3)
    mova m7, [hsub_mul]
%endif
%ifdef WIN64
    call .skip_prologue
    RET
%endif
global current_function %+ .skip_prologue
.skip_prologue:
    SWAP 7, 10
    LOAD_DIFF8x4  0, 1, 2, 3, 4, 10, r1, r2-4*FDEC_STRIDE
    LOAD_DIFF8x4  4, 5, 6, 7, 8, 10, r1, r2-4*FDEC_STRIDE
    DCT8_1D       0,1,2,3,4,5,6,7,8,9
    TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8
    DCT8_1D       0,1,2,3,4,5,6,7,8,9
    movdqa  [r0+0x00], m0
    movdqa  [r0+0x10], m1
    movdqa  [r0+0x20], m2
    movdqa  [r0+0x30], m3
    movdqa  [r0+0x40], m4
    movdqa  [r0+0x50], m5
    movdqa  [r0+0x60], m6
    movdqa  [r0+0x70], m7
    ret
%endmacro

INIT_XMM sse2
%define movdqa movaps
%define punpcklqdq movlhps
DCT_SUB8
%undef movdqa
%undef punpcklqdq
INIT_XMM ssse3
DCT_SUB8
INIT_XMM avx
DCT_SUB8

;-----------------------------------------------------------------------------
; void add8x8_idct8( uint8_t *p_dst, int16_t dct[8][8] )
;-----------------------------------------------------------------------------
%macro ADD8x8_IDCT8 0
cglobal add8x8_idct8, 2,2,11
    add r0, 4*FDEC_STRIDE
    pxor m7, m7
%ifdef WIN64
    call .skip_prologue
    RET
%endif
global current_function %+ .skip_prologue
.skip_prologue:
    SWAP 7, 9
    movdqa  m0, [r1+0x00]
    movdqa  m1, [r1+0x10]
    movdqa  m2, [r1+0x20]
    movdqa  m3, [r1+0x30]
    movdqa  m4, [r1+0x40]
    movdqa  m5, [r1+0x50]
    movdqa  m6, [r1+0x60]
    movdqa  m7, [r1+0x70]
    IDCT8_1D      0,1,2,3,4,5,6,7,8,10
    TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8
    paddw         m0, [pw_32] ; rounding for the >>6 at the end
    IDCT8_1D      0,1,2,3,4,5,6,7,8,10
    DIFFx2 m0, m1, m8, m9, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE]
    DIFFx2 m2, m3, m8, m9, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE]
    DIFFx2 m4, m5, m8, m9, [r0+0*FDEC_STRIDE], [r0+1*FDEC_STRIDE]
    DIFFx2 m6, m7, m8, m9, [r0+2*FDEC_STRIDE], [r0+3*FDEC_STRIDE]
    STORE_IDCT m1, m3, m5, m7
    ret
%endmacro ; ADD8x8_IDCT8

INIT_XMM sse2
ADD8x8_IDCT8
INIT_XMM avx
ADD8x8_IDCT8

;-----------------------------------------------------------------------------
; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] )
;-----------------------------------------------------------------------------
%macro ADD8x8 0
cglobal add8x8_idct, 2,2,11
    add  r0, 4*FDEC_STRIDE
    pxor m7, m7
%ifdef WIN64
    call .skip_prologue
    RET
%endif
global current_function %+ .skip_prologue
.skip_prologue:
    SWAP 7, 9
    mova   m0, [r1+ 0]
    mova   m2, [r1+16]
    mova   m1, [r1+32]
    mova   m3, [r1+48]
    SBUTTERFLY qdq, 0, 1, 4
    SBUTTERFLY qdq, 2, 3, 4
    mova   m4, [r1+64]
    mova   m6, [r1+80]
    mova   m5, [r1+96]
    mova   m7, [r1+112]
    SBUTTERFLY qdq, 4, 5, 8
    SBUTTERFLY qdq, 6, 7, 8
    IDCT4_1D w,0,1,2,3,8,10
    TRANSPOSE2x4x4W 0,1,2,3,8
    IDCT4_1D w,4,5,6,7,8,10
    TRANSPOSE2x4x4W 4,5,6,7,8
    paddw m0, [pw_32]
    IDCT4_1D w,0,1,2,3,8,10
    paddw m4, [pw_32]
    IDCT4_1D w,4,5,6,7,8,10
    DIFFx2 m0, m1, m8, m9, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE]
    DIFFx2 m2, m3, m8, m9, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE]
    DIFFx2 m4, m5, m8, m9, [r0+0*FDEC_STRIDE], [r0+1*FDEC_STRIDE]
    DIFFx2 m6, m7, m8, m9, [r0+2*FDEC_STRIDE], [r0+3*FDEC_STRIDE]
    STORE_IDCT m1, m3, m5, m7
    ret
%endmacro ; ADD8x8

INIT_XMM sse2
ADD8x8
INIT_XMM avx
ADD8x8
%endif ; !HIGH_BIT_DEPTH
File: x264-snapshot-20120103-2245-stable/common/x86/dct-32.asm
;*****************************************************************************
;* dct-32.asm: x86_32 transform and zigzag
;*****************************************************************************
;* Copyright (C) 2003-2011 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Holger Lubitz <holger@lubitz.org>
;*          Laurent Aimar <fenrir@via.ecp.fr>
;*          Min Chen <chenm001@163.com>
;*          Christian Heine <sennindemokrit@gmx.net>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION .text

%ifndef HIGH_BIT_DEPTH
cextern pw_32
cextern hsub_mul

; in: m0..m7
; out: 0,4,6 in mem, rest in regs
%macro DCT8_1D 9
    SUMSUB_BA w, %8, %1      ; %8 = s07,  %1 = d07
    SUMSUB_BA w, %7, %2      ; %7 = s16,  %2 = d16
    SUMSUB_BA w, %6, %3      ; %6 = s25,  %3 = d25
    SUMSUB_BA w, %5, %4      ; %5 = s34,  %4 = d34
    SUMSUB_BA w, %5, %8      ; %5 = a0,   %8 = a2
    SUMSUB_BA w, %6, %7      ; %6 = a1,   %7 = a3
    SUMSUB_BA w, %6, %5      ; %6 = dst0, %5 = dst4
    mova    [%9+0x00], m%6
    mova    [%9+0x40], m%5
    psraw   m%6, m%7, 1      ; a3>>1
    paddw   m%6, m%8         ; a2 + (a3>>1)
    psraw   m%8, 1           ; a2>>1
    psubw   m%8, m%7         ; (a2>>1) - a3
    mova    [%9+0x60], m%8
    psraw   m%5, m%3, 1
    paddw   m%5, m%3         ; d25+(d25>>1)
    psubw   m%7, m%1, m%4    ; a5 = d07-d34-(d25+(d25>>1))
    psubw   m%7, m%5
    psraw   m%5, m%2, 1
    paddw   m%5, m%2         ; d16+(d16>>1)
    paddw   m%8, m%1, m%4
    psubw   m%8, m%5         ; a6 = d07+d34-(d16+(d16>>1))
    psraw   m%5, m%1, 1
    paddw   m%5, m%1         ; d07+(d07>>1)
    paddw   m%5, m%2
    paddw   m%5, m%3         ; a4 = d16+d25+(d07+(d07>>1))
    psraw   m%1, m%4, 1
    paddw   m%1, m%4         ; d34+(d34>>1)
    paddw   m%1, m%2
    psubw   m%1, m%3         ; a7 = d16-d25+(d34+(d34>>1))
    psraw   m%4, m%1, 2
    paddw   m%4, m%5         ; a4 + (a7>>2)
    psraw   m%3, m%8, 2
    paddw   m%3, m%7         ; a5 + (a6>>2)
    psraw   m%5, 2
    psraw   m%7, 2
    psubw   m%5, m%1         ; (a4>>2) - a7
    psubw   m%8, m%7         ; a6 - (a5>>2)
    SWAP %2, %4, %3, %6, %8, %5
%endmacro
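
; Dataflow sketch of the butterfly above (names follow the inline comments):
; even half: a0/a2 = s07+-s34 and a1/a3 = s16+-s25; dst0/dst4 = a0+-a1 go
; straight to [%9+0x00]/[%9+0x40], dst2 = a2+(a3>>1) stays in a register,
; and dst6 = (a2>>1)-a3 goes to [%9+0x60] (hence "0,4,6 in mem" above);
; odd half: a4..a7 mix d07/d16/d25/d34 with x+(x>>1) weights, then
; dst1/3/5/7 are formed as a+-(a'>>2) pairs.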

; in: 0,4 in mem, rest in regs
; out: m0..m7
%macro IDCT8_1D 9
    psraw     m%1, m%3, 1
    psraw     m%5, m%7, 1
    psubw     m%1, m%7
    paddw     m%5, m%3
    psraw     m%7, m%2, 1
    paddw     m%7, m%2
    paddw     m%7, m%4
    paddw     m%7, m%6
    psraw     m%3, m%6, 1
    paddw     m%3, m%6
    paddw     m%3, m%8
    psubw     m%3, m%2
    psubw     m%2, m%4
    psubw     m%6, m%4
    paddw     m%2, m%8
    psubw     m%6, m%8
    psraw     m%4, 1
    psraw     m%8, 1
    psubw     m%2, m%4
    psubw     m%6, m%8
    psraw     m%4, m%7, 2
    psraw     m%8, m%3, 2
    paddw     m%4, m%6
    paddw     m%8, m%2
    psraw     m%6, 2
    psraw     m%2, 2
    psubw     m%7, m%6
    psubw     m%2, m%3
    mova      m%3, [%9+0x00]
    mova      m%6, [%9+0x40]
    SUMSUB_BA w, %6, %3
    SUMSUB_BA w, %5, %6
    SUMSUB_BA w, %1, %3
    SUMSUB_BA w, %7, %5
    SUMSUB_BA w, %2, %1
    SUMSUB_BA w, %8, %3
    SUMSUB_BA w, %4, %6
    SWAP %1, %3
    SWAP %5, %7
    SWAP %1, %5, %6
    SWAP %3, %8, %7
%endmacro
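
; Inverse of the butterfly above, evaluated in reverse order: the odd
; outputs are rebuilt first from rows 1,3,5,7 using the same x+(x>>1)
; and >>2 weightings, then rows 0 and 4 are reloaded from
; [%9+0x00]/[%9+0x40] ("0,4 in mem" above) and the closing SUMSUB_BA /
; SWAP chain returns the eight rows to natural order in m0..m7.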

INIT_MMX
ALIGN 16
load_diff_4x8_mmx:
    LOAD_DIFF m0, m7, none, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
    LOAD_DIFF m1, m7, none, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
    LOAD_DIFF m2, m7, none, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
    LOAD_DIFF m3, m7, none, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
    LOAD_DIFF m4, m7, none, [r1+4*FENC_STRIDE], [r2+4*FDEC_STRIDE]
    LOAD_DIFF m5, m7, none, [r1+5*FENC_STRIDE], [r2+5*FDEC_STRIDE]
    movq  [r0], m0
    LOAD_DIFF m6, m7, none, [r1+6*FENC_STRIDE], [r2+6*FDEC_STRIDE]
    LOAD_DIFF m7, m0, none, [r1+7*FENC_STRIDE], [r2+7*FDEC_STRIDE]
    movq  m0, [r0]
    ret

cglobal dct8_mmx
    DCT8_1D 0,1,2,3,4,5,6,7,r0
    SAVE_MM_PERMUTATION
    ret

%macro SPILL_SHUFFLE 3-* ; ptr, list of regs, list of memory offsets
    %xdefine %%base %1
    %rep %0/2
    %xdefine %%tmp m%2
    %rotate %0/2
    mova [%%base + %2*16], %%tmp
    %rotate 1-%0/2
    %endrep
%endmacro

%macro UNSPILL_SHUFFLE 3-*
    %xdefine %%base %1
    %rep %0/2
    %xdefine %%tmp m%2
    %rotate %0/2
    mova %%tmp, [%%base + %2*16]
    %rotate 1-%0/2
    %endrep
%endmacro

%macro SPILL 2+ ; assume offsets are the same as reg numbers
    SPILL_SHUFFLE %1, %2, %2
%endmacro

%macro UNSPILL 2+
    UNSPILL_SHUFFLE %1, %2, %2
%endmacro
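
; Expansion sketch: "SPILL_SHUFFLE r0, 0,1,2,3, 4,5,6,7" stores
; m0->[r0+0x40], m1->[r0+0x50], m2->[r0+0x60], m3->[r0+0x70].
; SPILL/UNSPILL are the identity-shuffle case, so "SPILL r0, 1,2" is just
; mova [r0+0x10], m1 / mova [r0+0x20], m2, and UNSPILL the matching loads.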

;-----------------------------------------------------------------------------
; void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
cglobal sub8x8_dct8_mmx, 3,3
global sub8x8_dct8_mmx.skip_prologue
.skip_prologue:
    RESET_MM_PERMUTATION
    call load_diff_4x8_mmx
    call dct8_mmx
    UNSPILL r0, 0
    TRANSPOSE4x4W 0,1,2,3,4
    SPILL r0, 0,1,2,3
    UNSPILL r0, 4,6
    TRANSPOSE4x4W 4,5,6,7,0
    SPILL r0, 4,5,6,7
    RESET_MM_PERMUTATION
    add   r1, 4
    add   r2, 4
    add   r0, 8
    call load_diff_4x8_mmx
    sub   r1, 4
    sub   r2, 4
    call dct8_mmx
    sub   r0, 8
    UNSPILL r0+8, 4,6
    TRANSPOSE4x4W 4,5,6,7,0
    SPILL r0+8, 4,5,6,7
    UNSPILL r0+8, 0
    TRANSPOSE4x4W 0,1,2,3,5
    UNSPILL r0, 4,5,6,7
    SPILL_SHUFFLE r0, 0,1,2,3, 4,5,6,7
    movq  mm4, m6 ; relies on the current register permutation to avoid conflicts
    movq  mm0, m4
    movq  mm1, m5
    movq  mm2, mm4
    movq  mm3, m7
    RESET_MM_PERMUTATION
    UNSPILL r0+8, 4,5,6,7
    add   r0, 8
    call dct8_mmx
    sub   r0, 8
    SPILL r0+8, 1,2,3,5,7
    RESET_MM_PERMUTATION
    UNSPILL r0, 0,1,2,3,4,5,6,7
    call dct8_mmx
    SPILL r0, 1,2,3,5,7
    ret

cglobal idct8_mmx
    IDCT8_1D 0,1,2,3,4,5,6,7,r1
    SAVE_MM_PERMUTATION
    ret

%macro ADD_STORE_ROW 3
    movq  m1, [r0+%1*FDEC_STRIDE]
    punpckhbw m2, m1, m0
    punpcklbw m1, m0
    paddw m1, %2
    paddw m2, %3
    packuswb m1, m2
    movq  [r0+%1*FDEC_STRIDE], m1
%endmacro
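
; Adds one row of residual to the frame: unpacks 8 reconstructed pixels
; to words against m0 (assumed zeroed by the caller's pxor), adds the low
; and high 4-word halves %2/%3, then repacks with unsigned saturation.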

;-----------------------------------------------------------------------------
; void add8x8_idct8( uint8_t *dst, int16_t dct[8][8] )
;-----------------------------------------------------------------------------
cglobal add8x8_idct8_mmx, 2,2
global add8x8_idct8_mmx.skip_prologue
.skip_prologue:
    INIT_MMX
    add word [r1], 32
    UNSPILL r1, 1,2,3,5,6,7
    call idct8_mmx
    SPILL r1, 7
    TRANSPOSE4x4W 0,1,2,3,7
    SPILL r1, 0,1,2,3
    UNSPILL r1, 7
    TRANSPOSE4x4W 4,5,6,7,0
    SPILL r1, 4,5,6,7
    INIT_MMX
    UNSPILL r1+8, 1,2,3,5,6,7
    add r1, 8
    call idct8_mmx
    sub r1, 8
    SPILL r1+8, 7
    TRANSPOSE4x4W 0,1,2,3,7
    SPILL r1+8, 0,1,2,3
    UNSPILL r1+8, 7
    TRANSPOSE4x4W 4,5,6,7,0
    SPILL r1+8, 4,5,6,7
    INIT_MMX
    movq  m3, [r1+0x08]
    movq  m0, [r1+0x40]
    movq  [r1+0x40], m3
    movq  [r1+0x08], m0
    ; memory layout at this time:
    ; A0------ A1------
    ; B0------ F0------
    ; C0------ G0------
    ; D0------ H0------
    ; E0------ E1------
    ; B1------ F1------
    ; C1------ G1------
    ; D1------ H1------
    UNSPILL_SHUFFLE r1, 1,2,3, 5,6,7
    UNSPILL r1+8, 5,6,7
    add r1, 8
    call idct8_mmx
    sub r1, 8
    psraw m0, 6
    psraw m1, 6
    psraw m2, 6
    psraw m3, 6
    psraw m4, 6
    psraw m5, 6
    psraw m6, 6
    psraw m7, 6
    movq  [r1+0x08], m0 ; mm4
    movq  [r1+0x48], m4 ; mm5
    movq  [r1+0x58], m5 ; mm0
    movq  [r1+0x68], m6 ; mm2
    movq  [r1+0x78], m7 ; mm6
    movq  mm5, [r1+0x18]
    movq  mm6, [r1+0x28]
    movq  [r1+0x18], m1 ; mm1
    movq  [r1+0x28], m2 ; mm7
    movq  mm7, [r1+0x38]
    movq  [r1+0x38], m3 ; mm3
    movq  mm1, [r1+0x10]
    movq  mm2, [r1+0x20]
    movq  mm3, [r1+0x30]
    call idct8_mmx
    psraw m0, 6
    psraw m1, 6
    psraw m2, 6
    psraw m3, 6
    psraw m4, 6
    psraw m5, 6
    psraw m6, 6
    psraw m7, 6
    SPILL r1, 0,1,2
    pxor  m0, m0
    ADD_STORE_ROW 0, [r1+0x00], [r1+0x08]
    ADD_STORE_ROW 1, [r1+0x10], [r1+0x18]
    ADD_STORE_ROW 2, [r1+0x20], [r1+0x28]
    ADD_STORE_ROW 3, m3, [r1+0x38]
    ADD_STORE_ROW 4, m4, [r1+0x48]
    ADD_STORE_ROW 5, m5, [r1+0x58]
    ADD_STORE_ROW 6, m6, [r1+0x68]
    ADD_STORE_ROW 7, m7, [r1+0x78]
    ret

%macro DCT_SUB8 0
cglobal sub8x8_dct, 3,3
    add r2, 4*FDEC_STRIDE
global current_function %+ .skip_prologue
.skip_prologue:
%if cpuflag(ssse3)
    mova m7, [hsub_mul]
%endif
    LOAD_DIFF8x4 0, 1, 2, 3, 6, 7, r1, r2-4*FDEC_STRIDE
    SPILL r0, 1,2
    SWAP 2, 7
    LOAD_DIFF8x4 4, 5, 6, 7, 1, 2, r1, r2-4*FDEC_STRIDE
    UNSPILL r0, 1
    SPILL r0, 7
    SWAP 2, 7
    UNSPILL r0, 2
    DCT4_1D 0, 1, 2, 3, 7
    TRANSPOSE2x4x4W 0, 1, 2, 3, 7
    UNSPILL r0, 7
    SPILL r0, 2
    DCT4_1D 4, 5, 6, 7, 2
    TRANSPOSE2x4x4W 4, 5, 6, 7, 2
    UNSPILL r0, 2
    SPILL r0, 6
    DCT4_1D 0, 1, 2, 3, 6
    UNSPILL r0, 6
    STORE_DCT 0, 1, 2, 3, r0, 0
    DCT4_1D 4, 5, 6, 7, 3
    STORE_DCT 4, 5, 6, 7, r0, 64
    ret

;-----------------------------------------------------------------------------
; void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
cglobal sub8x8_dct8, 3,3
    add r2, 4*FDEC_STRIDE
global current_function %+ .skip_prologue
.skip_prologue:
%if cpuflag(ssse3)
    mova m7, [hsub_mul]
    LOAD_DIFF8x4 0, 1, 2, 3, 4, 7, r1, r2-4*FDEC_STRIDE
    SPILL r0, 0,1
    SWAP 1, 7
    LOAD_DIFF8x4 4, 5, 6, 7, 0, 1, r1, r2-4*FDEC_STRIDE
    UNSPILL r0, 0,1
%else
    LOAD_DIFF m0, m7, none, [r1+0*FENC_STRIDE], [r2-4*FDEC_STRIDE]
    LOAD_DIFF m1, m7, none, [r1+1*FENC_STRIDE], [r2-3*FDEC_STRIDE]
    LOAD_DIFF m2, m7, none, [r1+2*FENC_STRIDE], [r2-2*FDEC_STRIDE]
    LOAD_DIFF m3, m7, none, [r1+3*FENC_STRIDE], [r2-1*FDEC_STRIDE]
    LOAD_DIFF m4, m7, none, [r1+4*FENC_STRIDE], [r2+0*FDEC_STRIDE]
    LOAD_DIFF m5, m7, none, [r1+5*FENC_STRIDE], [r2+1*FDEC_STRIDE]
    SPILL r0, 0
    LOAD_DIFF m6, m7, none, [r1+6*FENC_STRIDE], [r2+2*FDEC_STRIDE]
    LOAD_DIFF m7, m0, none, [r1+7*FENC_STRIDE], [r2+3*FDEC_STRIDE]
    UNSPILL r0, 0
%endif
    DCT8_1D 0,1,2,3,4,5,6,7,r0
    UNSPILL r0, 0,4
    TRANSPOSE8x8W 0,1,2,3,4,5,6,7,[r0+0x60],[r0+0x40],1
    UNSPILL r0, 4
    DCT8_1D 0,1,2,3,4,5,6,7,r0
    SPILL r0, 1,2,3,5,7
    ret
%endmacro

INIT_XMM sse2
%define movdqa movaps
%define punpcklqdq movlhps
DCT_SUB8
%undef movdqa
%undef punpcklqdq
INIT_XMM ssse3
DCT_SUB8
INIT_XMM avx
DCT_SUB8

;-----------------------------------------------------------------------------
; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] )
;-----------------------------------------------------------------------------
%macro ADD8x8 0
cglobal add8x8_idct, 2,2
    add r0, 4*FDEC_STRIDE
global current_function %+ .skip_prologue
.skip_prologue:
    UNSPILL_SHUFFLE r1, 0,2,1,3, 0,1,2,3
    SBUTTERFLY qdq, 0, 1, 4
    SBUTTERFLY qdq, 2, 3, 4
    UNSPILL_SHUFFLE r1, 4,6,5,7, 4,5,6,7
    SPILL r1, 0
    SBUTTERFLY qdq, 4, 5, 0
    SBUTTERFLY qdq, 6, 7, 0
    UNSPILL r1,0
    IDCT4_1D w,0,1,2,3,r1
    SPILL r1, 4
    TRANSPOSE2x4x4W 0,1,2,3,4
    UNSPILL r1, 4
    IDCT4_1D w,4,5,6,7,r1
    SPILL r1, 0
    TRANSPOSE2x4x4W 4,5,6,7,0
    UNSPILL r1, 0
    paddw m0, [pw_32]
    IDCT4_1D w,0,1,2,3,r1
    paddw m4, [pw_32]
    IDCT4_1D w,4,5,6,7,r1
    SPILL r1, 6,7
    pxor m7, m7
    DIFFx2 m0, m1, m6, m7, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE]; m5
    DIFFx2 m2, m3, m6, m7, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE]; m5
    UNSPILL_SHUFFLE r1, 0,2, 6,7
    DIFFx2 m4, m5, m6, m7, [r0+0*FDEC_STRIDE], [r0+1*FDEC_STRIDE]; m5
    DIFFx2 m0, m2, m6, m7, [r0+2*FDEC_STRIDE], [r0+3*FDEC_STRIDE]; m5
    STORE_IDCT m1, m3, m5, m2
    ret
%endmacro ; ADD8x8

INIT_XMM sse2
ADD8x8
INIT_XMM avx
ADD8x8

;-----------------------------------------------------------------------------
; void add8x8_idct8( uint8_t *p_dst, int16_t dct[8][8] )
;-----------------------------------------------------------------------------
%macro ADD8x8_IDCT8 0
cglobal add8x8_idct8, 2,2
    add r0, 4*FDEC_STRIDE
global current_function %+ .skip_prologue
.skip_prologue:
    UNSPILL r1, 1,2,3,5,6,7
    IDCT8_1D   0,1,2,3,4,5,6,7,r1
    SPILL r1, 6
    TRANSPOSE8x8W 0,1,2,3,4,5,6,7,[r1+0x60],[r1+0x40],1
    paddw      m0, [pw_32]
    SPILL r1, 0
    IDCT8_1D   0,1,2,3,4,5,6,7,r1
    SPILL r1, 6,7
    pxor       m7, m7
    DIFFx2 m0, m1, m6, m7, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE]; m5
    DIFFx2 m2, m3, m6, m7, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE]; m5
    UNSPILL_SHUFFLE r1, 0,2, 6,7
    DIFFx2 m4, m5, m6, m7, [r0+0*FDEC_STRIDE], [r0+1*FDEC_STRIDE]; m5
    DIFFx2 m0, m2, m6, m7, [r0+2*FDEC_STRIDE], [r0+3*FDEC_STRIDE]; m5
    STORE_IDCT m1, m3, m5, m2
    ret
%endmacro ; ADD8x8_IDCT8

INIT_XMM sse2
ADD8x8_IDCT8
INIT_XMM avx
ADD8x8_IDCT8
%endif ; !HIGH_BIT_DEPTH
x264-snapshot-20120103-2245-stable/common/x86/cpu-a.asm
;*****************************************************************************
;* cpu-a.asm: x86 cpu utilities
;*****************************************************************************
;* Copyright (C) 2003-2011 x264 project
;*
;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
;*          Loren Merritt <lorenm@u.washington.edu>
;*          Jason Garrett-Glaser <darkshikari@gmail.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************

%include "x86inc.asm"

SECTION .text

;-----------------------------------------------------------------------------
; void cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx )
;-----------------------------------------------------------------------------
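; The four output pointers are pushed up front and popped into rsi one at
; a time after cpuid, so the same store sequence serves every ABI; rbx is
; saved explicitly because cpuid clobbers ebx, which is callee-saved in
; all supported calling conventions.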
cglobal cpu_cpuid, 5,7
    push rbx
    push  r4
    push  r3
    push  r2
    push  r1
    mov  eax, r0d
    cpuid
    pop  rsi
    mov [rsi], eax
    pop  rsi
    mov [rsi], ebx
    pop  rsi
    mov [rsi], ecx
    pop  rsi
    mov [rsi], edx
    pop  rbx
    RET

;-----------------------------------------------------------------------------
; void cpu_xgetbv( int op, int *eax, int *edx )
;-----------------------------------------------------------------------------
cglobal cpu_xgetbv, 3,7
    push  r2
    push  r1
    mov  ecx, r0d
    xgetbv
    pop  rsi
    mov [rsi], eax
    pop  rsi
    mov [rsi], edx
    RET

%ifndef ARCH_X86_64

;-----------------------------------------------------------------------------
; int cpu_cpuid_test( void )
; return 0 if unsupported
;-----------------------------------------------------------------------------
cglobal cpu_cpuid_test
    pushfd
    push    ebx
    push    ebp
    push    esi
    push    edi
    pushfd
    pop     eax
    mov     ebx, eax
    xor     eax, 0x200000
    push    eax
    popfd
    pushfd
    pop     eax
    xor     eax, ebx
    pop     edi
    pop     esi
    pop     ebp
    pop     ebx
    popfd
    ret

;-----------------------------------------------------------------------------
; void stack_align( void (*func)(void*), void *arg );
;-----------------------------------------------------------------------------
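; Creates a 16-byte-aligned frame and copies up to three stack argument
; slots into it before the indirect call, so func can rely on aligned
; stack slots (e.g. for SSE spills) regardless of the caller's alignment.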
cglobal stack_align
    push ebp
    mov  ebp, esp
    sub  esp, 12
    and  esp, ~15
    mov  ecx, [ebp+8]
    mov  edx, [ebp+12]
    mov  [esp], edx
    mov  edx, [ebp+16]
    mov  [esp+4], edx
    mov  edx, [ebp+20]
    mov  [esp+8], edx
    call ecx
    leave
    ret

%endif

;-----------------------------------------------------------------------------
; void cpu_emms( void )
;-----------------------------------------------------------------------------
cglobal cpu_emms
    emms
    ret

;-----------------------------------------------------------------------------
; void cpu_sfence( void )
;-----------------------------------------------------------------------------
cglobal cpu_sfence
    sfence
    ret

;-----------------------------------------------------------------------------
; void cpu_mask_misalign_sse( void )
;-----------------------------------------------------------------------------
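; Sets bit 17 (MM, the misaligned-exception mask) in MXCSR, which on CPUs
; implementing AMD's misaligned-SSE mode permits unaligned memory operands
; on instructions that normally fault; assumed to be called only after the
; matching CPU flag has been detected.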
cglobal cpu_mask_misalign_sse
    sub   rsp, 4
    stmxcsr [rsp]
    or dword [rsp], 1<<17
    ldmxcsr [rsp]
    add   rsp, 4
    ret
x264-snapshot-20120103-2245-stable/common/x86/const-a.asm
;*****************************************************************************
;* const-a.asm: x86 global constants
;*****************************************************************************
;* Copyright (C) 2010-2011 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Jason Garrett-Glaser <darkshikari@gmail.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************

%include "x86inc.asm"

SECTION_RODATA

const pb_01,       times  8 db 0,1
const pb_0,        times 16 db 0
const pb_a1,       times 16 db 0xa1
const pb_1,        times 16 db 1
const pb_3,        times 16 db 3
const hsub_mul,    times  8 db 1, -1
const pb_shuf8x8c, db 0,0,0,0,2,2,2,2,4,4,4,4,6,6,6,6

const pw_1,        times 8 dw 1
const pw_2,        times 8 dw 2
const pw_4,        times 8 dw 4
const pw_8,        times 8 dw 8
const pw_16,       times 8 dw 16
const pw_32,       times 8 dw 32
const pw_64,       times 8 dw 64
const pw_32_0,     times 4 dw 32,
                   times 4 dw 0
const pw_8000,     times 8 dw 0x8000
const pw_3fff,     times 8 dw 0x3fff
const pw_pixel_max,times 8 dw ((1 << BIT_DEPTH)-1)
const pw_ppppmmmm, dw 1,1,1,1,-1,-1,-1,-1
const pw_ppmmppmm, dw 1,1,-1,-1,1,1,-1,-1
const pw_pmpmpmpm, dw 1,-1,1,-1,1,-1,1,-1
const pw_pmmpzzzz, dw 1,-1,-1,1,0,0,0,0
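
; Naming for the sign-pattern constants above: p = +1, m = -1, z = 0,
; read in word order.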

const pd_1,        times 4 dd 1
const pd_32,       times 4 dd 32
const pd_1024,     times 4 dd 1024
const pd_ffff,     times 4 dd 0xffff
const pw_00ff,     times 8 dw 0x00ff
const pw_ff00,     times 8 dw 0xff00

const sw_64,       dd 64
x264-snapshot-20120103-2245-stable/common/x86/cabac-a.asm
;*****************************************************************************
;* cabac-a.asm: x86 cabac
;*****************************************************************************
;* Copyright (C) 2008-2011 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Jason Garrett-Glaser <darkshikari@gmail.com>
;*          Holger Lubitz <holger@lubitz.org>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************

%include "x86inc.asm"

SECTION .text

cextern cabac_range_lps
cextern cabac_transition
cextern cabac_renorm_shift

; t3 must be ecx, since it's used for shift.
%ifdef WIN64
    DECLARE_REG_TMP 3,1,2,0,4,5,6,2
    %define pointer resq
%elifdef ARCH_X86_64
    DECLARE_REG_TMP 0,1,2,3,4,5,6,6
    %define pointer resq
%else
    DECLARE_REG_TMP 0,4,2,1,3,5,6,2
    %define pointer resd
%endif

struc cb
    .low: resd 1
    .range: resd 1
    .queue: resd 1
    .bytes_outstanding: resd 1
    .start: pointer 1
    .p: pointer 1
    .end: pointer 1
    align 16, resb 1
    .bits_encoded: resd 1
    .state: resb 1024
endstruc
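
; NOTE: this struc is assumed to mirror x264_cabac_t in common/cabac.h
; (low/range/queue/bytes_outstanding, the start/p/end pointers, then the
; aligned bits_encoded + state block); the asm offsets and the C struct
; must be kept in sync.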

%macro LOAD_GLOBAL 4
%ifdef PIC
    ; this would be faster if the arrays were declared in asm, so that I didn't have to duplicate the lea
    lea   r11, [%2]
    %ifnidn %3, 0
    add   r11, %3
    %endif
    movzx %1, byte [r11+%4]
%else
    movzx %1, byte [%2+%3+%4]
%endif
%endmacro
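
; e.g. "LOAD_GLOBAL t5d, cabac_range_lps-4, t5, t4*2" expands (non-PIC) to
; movzx t5d, byte [cabac_range_lps-4+t5+t4*2]; the PIC path goes through
; r11 because a symbol reference can't be combined with two index
; operands in a single addressing mode.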

cglobal cabac_encode_decision_asm, 0,7
    movifnidn t0,  r0mp
    movifnidn t1d, r1m
    mov   t5d, [t0+cb.range]
    movzx t6d, byte [t0+cb.state+t1]
    mov   t4d, ~1
    mov   t3d, t5d
    and   t4d, t6d
    shr   t5d, 6
    movifnidn t2d, r2m
    LOAD_GLOBAL t5d, cabac_range_lps-4, t5, t4*2
    LOAD_GLOBAL t4d, cabac_transition, t2, t6*2
    and   t6d, 1
    sub   t3d, t5d
    cmp   t6d, t2d
    mov   t6d, [t0+cb.low]
    lea    t2, [t6+t3]
    cmovne t3d, t5d
    cmovne t6d, t2d
    mov   [t0+cb.state+t1], t4b
;cabac_encode_renorm
    mov   t4d, t3d
    shr   t3d, 3
    LOAD_GLOBAL t3d, cabac_renorm_shift, 0, t3
    shl   t4d, t3b
    shl   t6d, t3b
    mov   [t0+cb.range], t4d
    add   t3d, [t0+cb.queue]
    jge cabac_putbyte
.update_queue_low:
    mov   [t0+cb.low], t6d
    mov   [t0+cb.queue], t3d
    RET

cglobal cabac_encode_bypass_asm, 0,3
    movifnidn  t0, r0mp
    movifnidn t3d, r1m
    mov       t7d, [t0+cb.low]
    and       t3d, [t0+cb.range]
    lea       t7d, [t7*2+t3]
    mov       t3d, [t0+cb.queue]
    inc       t3d
%ifdef UNIX64 ; .putbyte compiles to nothing but a jmp
    jge cabac_putbyte
%else
    jge .putbyte
%endif
    mov   [t0+cb.low], t7d
    mov   [t0+cb.queue], t3d
    RET
.putbyte:
    PROLOGUE 0,7
    movifnidn t6d, t7d
    jmp cabac_putbyte

cglobal cabac_encode_terminal_asm, 0,3
    movifnidn  t0, r0mp
    sub  dword [t0+cb.range], 2
; shortcut: the renormalization shift in terminal
; can only be 0 or 1 and is zero over 99% of the time.
    test dword [t0+cb.range], 0x100
    je .renorm
    REP_RET
.renorm:
    shl  dword [t0+cb.low], 1
    shl  dword [t0+cb.range], 1
    inc  dword [t0+cb.queue]
    jge .putbyte
    REP_RET
.putbyte:
    PROLOGUE 0,7
    mov t3d, [t0+cb.queue]
    mov t6d, [t0+cb.low]
    jmp cabac_putbyte

cabac_putbyte:
    ; alive: t0=cb t3=queue t6=low
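    ; Carry propagation: after the shift, bit 8 of the output byte (dh's
    ; low bit) holds the carry. 0xff bytes can't be committed yet, since a
    ; later carry could ripple through them, so they're only counted in
    ; bytes_outstanding; when a non-0xff byte arrives, the carry is added
    ; to the last committed byte and the outstanding run is flushed as
    ; 0x00 (carry) or 0xff (no carry) via the dec dh below.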
%ifdef WIN64
    DECLARE_REG_TMP 3,4,1,0,2,5,6,10
%endif
    mov   t1d, -1
    add   t3d, 10
    mov   t2d, t6d
    shl   t1d, t3b
    shr   t2d, t3b ; out
    not   t1d
    sub   t3d, 18
    and   t6d, t1d
    mov   t5d, [t0+cb.bytes_outstanding]
    cmp   t2b, 0xff ; FIXME is a 32bit op faster?
    jz    .postpone
    mov    t1, [t0+cb.p]
    add   [t1-1], dh ; t2h
    dec   dh
.loop_outstanding:
    mov   [t1], dh
    inc   t1
    dec   t5d
    jge .loop_outstanding
    mov   [t1-1], t2b
    mov   [t0+cb.p], t1
.postpone:
    inc   t5d
    mov   [t0+cb.bytes_outstanding], t5d
    jmp mangle(x264_cabac_encode_decision_asm.update_queue_low)
x264-snapshot-20120103-2245-stable/common/x86/bitstream-a.asm
;*****************************************************************************
;* bitstream-a.asm: x86 bitstream functions
;*****************************************************************************
;* Copyright (C) 2010-2011 x264 project
;*
;* Authors: Jason Garrett-Glaser <darkshikari@gmail.com>
;*          Henrik Gramner <hengar-6@student.ltu.se>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION .text

;-----------------------------------------------------------------------------
; uint8_t *x264_nal_escape( uint8_t *dst, uint8_t *src, uint8_t *end )
;-----------------------------------------------------------------------------
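; H.264 Annex B emulation prevention: any 00 00 0x (x <= 3) sequence in
; the payload must be escaped to 00 00 03 0x so decoders never mistake
; payload bytes for a start code, e.g. 00 00 01 becomes 00 00 03 01.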

%macro NAL_LOOP 2
%1_escape:
    ; Detect false positives to avoid an unnecessary escape loop
    xor      r3d, r3d
    cmp byte [r0+r1-1], 0
    setnz    r3b
    xor      r3d, r4d
    jnz .escape
    jmp %1_continue
ALIGN 16
%1:
    pcmpeqb   m3, m1, m4
    pcmpeqb   m2, m0, m4
    pmovmskb r3d, m3
    %2   [r0+r1], m0
    pmovmskb r4d, m2
    shl      r3d, mmsize
    mova      m0, [r1+r2+2*mmsize]
    or       r4d, r3d
    %2 [r0+r1+mmsize], m1
    lea      r3d, [r4+r4+1]
    mova      m1, [r1+r2+3*mmsize]
    and      r4d, r3d
    jnz %1_escape
%1_continue:
    add       r1, 2*mmsize
    jl %1
%endmacro

%macro NAL_ESCAPE 0

cglobal nal_escape, 3,5
    mov      r3w, [r1]
    sub       r1, r2 ; r1 = offset of current src pointer from end of src
    pxor      m4, m4
    sub       r0, r1 ; r0 = projected end of dst, assuming no more escapes
    mov  [r0+r1], r3w
    add       r1, 2
    jge .ret

    ; Start off by jumping into the escape loop in
    ; case there's an escape at the start.
    ; And do a few more in scalar until src is aligned again.
    jmp .first_escape

    NAL_LOOP .loop_aligned, mova
%if mmsize==16
    jmp .ret
    NAL_LOOP .loop_unaligned, movu
%endif
.ret:
    movifnidn rax, r0
    RET

ALIGN 16
.escape:
    ; Skip bytes that are known to be valid
    and      r4d, r3d
    bsf      r3d, r4d
    add       r1, r3
.escape_loop:
    inc       r1
    jge .ret
.first_escape:
    movzx    r3d, byte [r1+r2]
    lea       r4, [r1+r2]
    cmp      r3d, 3
    jna .escape_check
.no_escape:
    mov  [r0+r1], r3b
    test     r4d, mmsize-1 ; Do SIMD when src is aligned
    jnz .escape_loop
    mova      m0, [r4]
    mova      m1, [r4+mmsize]
%if mmsize==16
    lea      r4d, [r0+r1]
    test     r4d, mmsize-1
    jnz .loop_unaligned
%endif
    jmp .loop_aligned

ALIGN 16
.escape_check:
    cmp word [r0+r1-2], 0
    jnz .no_escape
    mov byte [r0+r1], 3
    inc      r0
    jmp .no_escape
%endmacro

INIT_MMX mmx2
NAL_ESCAPE
INIT_XMM sse2
NAL_ESCAPE
INIT_XMM avx
NAL_ESCAPE
x264-snapshot-20120103-2245-stable/common/win32thread.h
/*****************************************************************************
 * win32thread.h: windows threading
 *****************************************************************************
 * Copyright (C) 2010-2011 x264 project
 *
 * Authors: Steven Walters <kemuri9@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#ifndef X264_WIN32THREAD_H
#define X264_WIN32THREAD_H

#define WIN32_LEAN_AND_MEAN
#include <windows.h>
/* windows.h defines an ERROR macro, but x264 uses that identifier itself */
#undef ERROR

typedef struct
{
    void *handle;
    void *(*func)( void* arg );
    void *arg;
    void *ret;
} x264_pthread_t;
#define x264_pthread_attr_t int

/* the condition variable API for Windows 6.0+ kernels uses critical sections and not mutexes */
typedef CRITICAL_SECTION x264_pthread_mutex_t;
#define X264_PTHREAD_MUTEX_INITIALIZER {0}
#define x264_pthread_mutexattr_t int

/* This is the CONDITION_VARIABLE typedef for using Windows' native condition variables on kernels 6.0+.
 * MinGW does not currently provide this typedef. */
typedef struct
{
    void *ptr;
} x264_pthread_cond_t;
#define x264_pthread_condattr_t int

int x264_pthread_create( x264_pthread_t *thread, const x264_pthread_attr_t *attr,
                         void *(*start_routine)( void* ), void *arg );
int x264_pthread_join( x264_pthread_t thread, void **value_ptr );

int x264_pthread_mutex_init( x264_pthread_mutex_t *mutex, const x264_pthread_mutexattr_t *attr );
int x264_pthread_mutex_destroy( x264_pthread_mutex_t *mutex );
int x264_pthread_mutex_lock( x264_pthread_mutex_t *mutex );
int x264_pthread_mutex_unlock( x264_pthread_mutex_t *mutex );

int x264_pthread_cond_init( x264_pthread_cond_t *cond, const x264_pthread_condattr_t *attr );
int x264_pthread_cond_destroy( x264_pthread_cond_t *cond );
int x264_pthread_cond_broadcast( x264_pthread_cond_t *cond );
int x264_pthread_cond_wait( x264_pthread_cond_t *cond, x264_pthread_mutex_t *mutex );
int x264_pthread_cond_signal( x264_pthread_cond_t *cond );

#define x264_pthread_attr_init(a) 0
#define x264_pthread_attr_destroy(a) 0

int  x264_win32_threading_init( void );
void x264_win32_threading_destroy( void );

int x264_pthread_num_processors_np( void );

#endif
x264-snapshot-20120103-2245-stable/common/win32thread.c
/*****************************************************************************
 * win32thread.c: windows threading
 *****************************************************************************
 * Copyright (C) 2010-2011 x264 project
 *
 * Authors: Steven Walters <kemuri9@gmail.com>
 *          Pegasys Inc. <http://www.pegasys-inc.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

/* Microsoft's way of supporting systems with >64 logical cpus can be found at
 * http://www.microsoft.com/whdc/system/Sysinternals/MoreThan64proc.mspx */

/* On the agreed understanding that x264 does not need to utilize more than 64 logical cpus,
 * this API neither detects nor utilizes more than 64 cpus on systems that have them. */

#include "common.h"
#include <process.h>

/* number of times to spin a thread about to block on a locked mutex before retrying and sleeping if still locked */
#define X264_SPIN_COUNT 0

/* GROUP_AFFINITY struct */
typedef struct
{
    ULONG_PTR mask; // KAFFINITY = ULONG_PTR
    USHORT group;
    USHORT reserved[3];
} x264_group_affinity_t;

typedef struct
{
    /* global mutex for replacing MUTEX_INITIALIZER instances */
    x264_pthread_mutex_t static_mutex;

    /* function pointers to the condition variable API on Windows 6.0+ kernels */
    void (WINAPI *cond_broadcast)( x264_pthread_cond_t *cond );
    void (WINAPI *cond_init)( x264_pthread_cond_t *cond );
    void (WINAPI *cond_signal)( x264_pthread_cond_t *cond );
    BOOL (WINAPI *cond_wait)( x264_pthread_cond_t *cond, x264_pthread_mutex_t *mutex, DWORD milliseconds );
} x264_win32thread_control_t;

static x264_win32thread_control_t thread_control;

/* _beginthreadex requires that the start routine is __stdcall */
static unsigned __stdcall x264_win32thread_worker( void *arg )
{
    x264_pthread_t *h = arg;
    h->ret = h->func( h->arg );
    return 0;
}

int x264_pthread_create( x264_pthread_t *thread, const x264_pthread_attr_t *attr,
                         void *(*start_routine)( void* ), void *arg )
{
    thread->func   = start_routine;
    thread->arg    = arg;
    thread->handle = (void*)_beginthreadex( NULL, 0, x264_win32thread_worker, thread, 0, NULL );
    return !thread->handle;
}

int x264_pthread_join( x264_pthread_t thread, void **value_ptr )
{
    DWORD ret = WaitForSingleObject( thread.handle, INFINITE );
    if( ret != WAIT_OBJECT_0 )
        return -1;
    if( value_ptr )
        *value_ptr = thread.ret;
    CloseHandle( thread.handle );
    return 0;
}

int x264_pthread_mutex_init( x264_pthread_mutex_t *mutex, const x264_pthread_mutexattr_t *attr )
{
    return !InitializeCriticalSectionAndSpinCount( mutex, X264_SPIN_COUNT );
}

int x264_pthread_mutex_destroy( x264_pthread_mutex_t *mutex )
{
    DeleteCriticalSection( mutex );
    return 0;
}

int x264_pthread_mutex_lock( x264_pthread_mutex_t *mutex )
{
    static x264_pthread_mutex_t init = X264_PTHREAD_MUTEX_INITIALIZER;
    if( !memcmp( mutex, &init, sizeof(x264_pthread_mutex_t) ) )
        *mutex = thread_control.static_mutex;
    EnterCriticalSection( mutex );
    return 0;
}

int x264_pthread_mutex_unlock( x264_pthread_mutex_t *mutex )
{
    LeaveCriticalSection( mutex );
    return 0;
}

/* for pre-Windows 6.0 platforms we need to define and use our own condition variable and API */
typedef struct
{
    x264_pthread_mutex_t mtx_broadcast;
    x264_pthread_mutex_t mtx_waiter_count;
    int waiter_count;
    HANDLE semaphore;
    HANDLE waiters_done;
    int is_broadcast;
} x264_win32_cond_t;

int x264_pthread_cond_init( x264_pthread_cond_t *cond, const x264_pthread_condattr_t *attr )
{
    if( thread_control.cond_init )
    {
        thread_control.cond_init( cond );
        return 0;
    }

    /* non-native condition variables */
    x264_win32_cond_t *win32_cond = calloc( 1, sizeof(x264_win32_cond_t) );
    if( !win32_cond )
        return -1;
    cond->ptr = win32_cond;
    win32_cond->semaphore = CreateSemaphore( NULL, 0, 0x7fffffff, NULL );
    if( !win32_cond->semaphore )
        return -1;

    if( x264_pthread_mutex_init( &win32_cond->mtx_waiter_count, NULL ) )
        return -1;
    if( x264_pthread_mutex_init( &win32_cond->mtx_broadcast, NULL ) )
        return -1;

    win32_cond->waiters_done = CreateEvent( NULL, FALSE, FALSE, NULL );
    if( !win32_cond->waiters_done )
        return -1;

    return 0;
}

int x264_pthread_cond_destroy( x264_pthread_cond_t *cond )
{
    /* native condition variables do not need to be destroyed */
    if( thread_control.cond_init )
        return 0;

    /* non-native condition variables */
    x264_win32_cond_t *win32_cond = cond->ptr;
    CloseHandle( win32_cond->semaphore );
    CloseHandle( win32_cond->waiters_done );
    x264_pthread_mutex_destroy( &win32_cond->mtx_broadcast );
    x264_pthread_mutex_destroy( &win32_cond->mtx_waiter_count );
    free( win32_cond );

    return 0;
}

int x264_pthread_cond_broadcast( x264_pthread_cond_t *cond )
{
    if( thread_control.cond_broadcast )
    {
        thread_control.cond_broadcast( cond );
        return 0;
    }

    /* non-native condition variables */
    x264_win32_cond_t *win32_cond = cond->ptr;
    x264_pthread_mutex_lock( &win32_cond->mtx_broadcast );
    x264_pthread_mutex_lock( &win32_cond->mtx_waiter_count );
    int have_waiter = 0;

    if( win32_cond->waiter_count )
    {
        win32_cond->is_broadcast = 1;
        have_waiter = 1;
    }

    if( have_waiter )
    {
        ReleaseSemaphore( win32_cond->semaphore, win32_cond->waiter_count, NULL );
        x264_pthread_mutex_unlock( &win32_cond->mtx_waiter_count );
        WaitForSingleObject( win32_cond->waiters_done, INFINITE );
        win32_cond->is_broadcast = 0;
    }
    else
        x264_pthread_mutex_unlock( &win32_cond->mtx_waiter_count );
    return x264_pthread_mutex_unlock( &win32_cond->mtx_broadcast );
}

int x264_pthread_cond_signal( x264_pthread_cond_t *cond )
{
    if( thread_control.cond_signal )
    {
        thread_control.cond_signal( cond );
        return 0;
    }

    /* non-native condition variables */
    x264_win32_cond_t *win32_cond = cond->ptr;
    x264_pthread_mutex_lock( &win32_cond->mtx_waiter_count );
    int have_waiter = win32_cond->waiter_count;
    x264_pthread_mutex_unlock( &win32_cond->mtx_waiter_count );

    if( have_waiter )
        ReleaseSemaphore( win32_cond->semaphore, 1, NULL );
    return 0;
}

int x264_pthread_cond_wait( x264_pthread_cond_t *cond, x264_pthread_mutex_t *mutex )
{
    if( thread_control.cond_wait )
        return !thread_control.cond_wait( cond, mutex, INFINITE );

    /* non-native condition variables */
    x264_win32_cond_t *win32_cond = cond->ptr;

    x264_pthread_mutex_lock( &win32_cond->mtx_broadcast );
    x264_pthread_mutex_unlock( &win32_cond->mtx_broadcast );
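    /* the lock/unlock pair above acts as a gate: while a broadcaster holds
     * mtx_broadcast, new waiters block here and cannot slip into the
     * generation of threads currently being woken */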

    x264_pthread_mutex_lock( &win32_cond->mtx_waiter_count );
    win32_cond->waiter_count++;
    x264_pthread_mutex_unlock( &win32_cond->mtx_waiter_count );

    // unlock the external mutex
    x264_pthread_mutex_unlock( mutex );
    WaitForSingleObject( win32_cond->semaphore, INFINITE );

    x264_pthread_mutex_lock( &win32_cond->mtx_waiter_count );
    win32_cond->waiter_count--;
    int last_waiter = !win32_cond->waiter_count && win32_cond->is_broadcast;
    x264_pthread_mutex_unlock( &win32_cond->mtx_waiter_count );

    if( last_waiter )
        SetEvent( win32_cond->waiters_done );

    // lock the external mutex
    return x264_pthread_mutex_lock( mutex );
}

int x264_win32_threading_init( void )
{
    /* find function pointers to API functions, if they exist */
    HANDLE kernel_dll = GetModuleHandle( TEXT( "kernel32.dll" ) );
    thread_control.cond_init = (void*)GetProcAddress( kernel_dll, "InitializeConditionVariable" );
    if( thread_control.cond_init )
    {
        /* we're on a windows 6.0+ kernel, acquire the rest of the functions */
        thread_control.cond_broadcast = (void*)GetProcAddress( kernel_dll, "WakeAllConditionVariable" );
        thread_control.cond_signal = (void*)GetProcAddress( kernel_dll, "WakeConditionVariable" );
        thread_control.cond_wait = (void*)GetProcAddress( kernel_dll, "SleepConditionVariableCS" );
    }
    return x264_pthread_mutex_init( &thread_control.static_mutex, NULL );
}

void x264_win32_threading_destroy( void )
{
    x264_pthread_mutex_destroy( &thread_control.static_mutex );
    memset( &thread_control, 0, sizeof(x264_win32thread_control_t) );
}

int x264_pthread_num_processors_np( void )
{
    DWORD_PTR system_cpus, process_cpus = 0;
    int cpus = 0;

    /* GetProcessAffinityMask returns affinities of 0 when the process has threads in multiple processor groups.
     * On platforms that support processor grouping, use GetThreadGroupAffinity to get the current thread's affinity instead. */
#if ARCH_X86_64
    /* find function pointers to API functions specific to x86_64 platforms, if they exist */
    HANDLE kernel_dll = GetModuleHandle( TEXT( "kernel32.dll" ) );
    BOOL (*get_thread_affinity)( HANDLE thread, x264_group_affinity_t *group_affinity ) = (void*)GetProcAddress( kernel_dll, "GetThreadGroupAffinity" );
    if( get_thread_affinity )
    {
        /* running on a platform that supports >64 logical cpus */
        x264_group_affinity_t thread_affinity;
        if( get_thread_affinity( GetCurrentThread(), &thread_affinity ) )
            process_cpus = thread_affinity.mask;
    }
#endif
    if( !process_cpus )
        GetProcessAffinityMask( GetCurrentProcess(), &process_cpus, &system_cpus );
    for( DWORD_PTR bit = 1; bit; bit <<= 1 )
        cpus += !!(process_cpus & bit);

    return cpus ? cpus : 1;
}
x264-snapshot-20120103-2245-stable/common/vlc.c
/*****************************************************************************
 * vlc.c : vlc tables
 *****************************************************************************
 * Copyright (C) 2003-2011 x264 project
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Jason Garrett-Glaser <darkshikari@gmail.com>
 *          Henrik Gramner <hengar-6@student.ltu.se>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "common.h"

/* [nC] */
const vlc_t x264_coeff0_token[6] =
{
    { 0x1, 1 }, /* str=1 */
    { 0x3, 2 }, /* str=11 */
    { 0xf, 4 }, /* str=1111 */
    { 0x3, 6 }, /* str=000011 */
    { 0x1, 2 }, /* str=01 */
    { 0x1, 1 }, /* str=1 */
};
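
/* Table selection follows H.264 Table 9-5 (a summary, not encoded in this
 * file): nC in [0,2) -> table 0, [2,4) -> table 1, [4,8) -> table 2,
 * nC >= 8 -> table 3 (fixed 6-bit codes), nC == -1 -> table 4 (chroma DC,
 * 4:2:0) and nC == -2 -> table 5 (chroma DC, 4:2:2), where nC is predicted
 * from the non-zero coefficient counts of the neighbouring blocks. */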

/* [nC][i_total_coeff-1][i_trailing] */
const vlc_t x264_coeff_token[6][16][4] =
{
    { /* table 0 */
        { /* i_total 1 */
            { 0x5, 6 }, /* str=000101 */
            { 0x1, 2 }, /* str=01 */
        },
        { /* i_total 2 */
            { 0x7, 8 }, /* str=00000111 */
            { 0x4, 6 }, /* str=000100 */
            { 0x1, 3 }, /* str=001 */
        },
        { /* i_total 3 */
            { 0x7, 9 }, /* str=000000111 */
            { 0x6, 8 }, /* str=00000110 */
            { 0x5, 7 }, /* str=0000101 */
            { 0x3, 5 }, /* str=00011 */
        },
        { /* i_total 4 */
            { 0x7, 10 }, /* str=0000000111 */
            { 0x6, 9 },  /* str=000000110 */
            { 0x5, 8 },  /* str=00000101 */
            { 0x3, 6 },  /* str=000011 */
        },
        { /* i_total 5 */
            { 0x7, 11 }, /* str=00000000111 */
            { 0x6, 10 }, /* str=0000000110 */
            { 0x5, 9 },  /* str=000000101 */
            { 0x4, 7 },  /* str=0000100 */
        },
        { /* i_total 6 */
            { 0xf, 13 }, /* str=0000000001111 */
            { 0x6, 11 }, /* str=00000000110 */
            { 0x5, 10 }, /* str=0000000101 */
            { 0x4, 8 },  /* str=00000100 */
        },
        { /* i_total 7 */
            { 0xb, 13 }, /* str=0000000001011 */
            { 0xe, 13 }, /* str=0000000001110 */
            { 0x5, 11 }, /* str=00000000101 */
            { 0x4, 9 },  /* str=000000100 */
        },
        { /* i_total 8 */
            { 0x8, 13 }, /* str=0000000001000 */
            { 0xa, 13 }, /* str=0000000001010 */
            { 0xd, 13 }, /* str=0000000001101 */
            { 0x4, 10 }, /* str=0000000100 */
        },
        { /* i_total 9 */
            { 0xf, 14 }, /* str=00000000001111 */
            { 0xe, 14 }, /* str=00000000001110 */
            { 0x9, 13 }, /* str=0000000001001 */
            { 0x4, 11 }, /* str=00000000100 */
        },
        { /* i_total 10 */
            { 0xb, 14 }, /* str=00000000001011 */
            { 0xa, 14 }, /* str=00000000001010 */
            { 0xd, 14 }, /* str=00000000001101 */
            { 0xc, 13 }, /* str=0000000001100 */
        },
        { /* i_total 11 */
            { 0xf, 15 }, /* str=000000000001111 */
            { 0xe, 15 }, /* str=000000000001110 */
            { 0x9, 14 }, /* str=00000000001001 */
            { 0xc, 14 }, /* str=00000000001100 */
        },
        { /* i_total 12 */
            { 0xb, 15 }, /* str=000000000001011 */
            { 0xa, 15 }, /* str=000000000001010 */
            { 0xd, 15 }, /* str=000000000001101 */
            { 0x8, 14 }, /* str=00000000001000 */
        },
        { /* i_total 13 */
            { 0xf, 16 }, /* str=0000000000001111 */
            { 0x1, 15 }, /* str=000000000000001 */
            { 0x9, 15 }, /* str=000000000001001 */
            { 0xc, 15 }, /* str=000000000001100 */
        },
        { /* i_total 14 */
            { 0xb, 16 }, /* str=0000000000001011 */
            { 0xe, 16 }, /* str=0000000000001110 */
            { 0xd, 16 }, /* str=0000000000001101 */
            { 0x8, 15 }, /* str=000000000001000 */
        },
        { /* i_total 15 */
            { 0x7, 16 }, /* str=0000000000000111 */
            { 0xa, 16 }, /* str=0000000000001010 */
            { 0x9, 16 }, /* str=0000000000001001 */
            { 0xc, 16 }, /* str=0000000000001100 */
        },
        { /* i_total 16 */
            { 0x4, 16 }, /* str=0000000000000100 */
            { 0x6, 16 }, /* str=0000000000000110 */
            { 0x5, 16 }, /* str=0000000000000101 */
            { 0x8, 16 }, /* str=0000000000001000 */
        },
    },
    { /* table 1 */
        { /* i_total 1 */
            { 0xb, 6 }, /* str=001011 */
            { 0x2, 2 }, /* str=10 */
        },
        { /* i_total 2 */
            { 0x7, 6 }, /* str=000111 */
            { 0x7, 5 }, /* str=00111 */
            { 0x3, 3 }, /* str=011 */
        },
        { /* i_total 3 */
            { 0x7, 7 }, /* str=0000111 */
            { 0xa, 6 }, /* str=001010 */
            { 0x9, 6 }, /* str=001001 */
            { 0x5, 4 }, /* str=0101 */
        },
        { /* i_total 4 */
            { 0x7, 8 }, /* str=00000111 */
            { 0x6, 6 }, /* str=000110 */
            { 0x5, 6 }, /* str=000101 */
            { 0x4, 4 }, /* str=0100 */
        },
        { /* i_total 5 */
            { 0x4, 8 }, /* str=00000100 */
            { 0x6, 7 }, /* str=0000110 */
            { 0x5, 7 }, /* str=0000101 */
            { 0x6, 5 }, /* str=00110 */
        },
        { /* i_total 6 */
            { 0x7, 9 }, /* str=000000111 */
            { 0x6, 8 }, /* str=00000110 */
            { 0x5, 8 }, /* str=00000101 */
            { 0x8, 6 }, /* str=001000 */
        },
        { /* i_total 7 */
            { 0xf, 11 }, /* str=00000001111 */
            { 0x6, 9 },  /* str=000000110 */
            { 0x5, 9 },  /* str=000000101 */
            { 0x4, 6 },  /* str=000100 */
        },
        { /* i_total 8 */
            { 0xb, 11 }, /* str=00000001011 */
            { 0xe, 11 }, /* str=00000001110 */
            { 0xd, 11 }, /* str=00000001101 */
            { 0x4, 7 },  /* str=0000100 */
        },
        { /* i_total 9 */
            { 0xf, 12 }, /* str=000000001111 */
            { 0xa, 11 }, /* str=00000001010 */
            { 0x9, 11 }, /* str=00000001001 */
            { 0x4, 9 },  /* str=000000100 */
        },
        { /* i_total 10 */
            { 0xb, 12 }, /* str=000000001011 */
            { 0xe, 12 }, /* str=000000001110 */
            { 0xd, 12 }, /* str=000000001101 */
            { 0xc, 11 }, /* str=00000001100 */
        },
        { /* i_total 11 */
            { 0x8, 12 }, /* str=000000001000 */
            { 0xa, 12 }, /* str=000000001010 */
            { 0x9, 12 }, /* str=000000001001 */
            { 0x8, 11 }, /* str=00000001000 */
        },
        { /* i_total 12 */
            { 0xf, 13 }, /* str=0000000001111 */
            { 0xe, 13 }, /* str=0000000001110 */
            { 0xd, 13 }, /* str=0000000001101 */
            { 0xc, 12 }, /* str=000000001100 */
        },
        { /* i_total 13 */
            { 0xb, 13 }, /* str=0000000001011 */
            { 0xa, 13 }, /* str=0000000001010 */
            { 0x9, 13 }, /* str=0000000001001 */
            { 0xc, 13 }, /* str=0000000001100 */
        },
        { /* i_total 14 */
            { 0x7, 13 }, /* str=0000000000111 */
            { 0xb, 14 }, /* str=00000000001011 */
            { 0x6, 13 }, /* str=0000000000110 */
            { 0x8, 13 }, /* str=0000000001000 */
        },
        { /* i_total 15 */
            { 0x9, 14 }, /* str=00000000001001 */
            { 0x8, 14 }, /* str=00000000001000 */
            { 0xa, 14 }, /* str=00000000001010 */
            { 0x1, 13 }, /* str=0000000000001 */
        },
        { /* i_total 16 */
            { 0x7, 14 }, /* str=00000000000111 */
            { 0x6, 14 }, /* str=00000000000110 */
            { 0x5, 14 }, /* str=00000000000101 */
            { 0x4, 14 }, /* str=00000000000100 */
        },
    },
    { /* table 2 */
        { /* i_total 1 */
            { 0xf, 6 }, /* str=001111 */
            { 0xe, 4 }, /* str=1110 */
        },
        { /* i_total 2 */
            { 0xb, 6 }, /* str=001011 */
            { 0xf, 5 }, /* str=01111 */
            { 0xd, 4 }, /* str=1101 */
        },
        { /* i_total 3 */
            { 0x8, 6 }, /* str=001000 */
            { 0xc, 5 }, /* str=01100 */
            { 0xe, 5 }, /* str=01110 */
            { 0xc, 4 }, /* str=1100 */
        },
        { /* i_total 4 */
            { 0xf, 7 }, /* str=0001111 */
            { 0xa, 5 }, /* str=01010 */
            { 0xb, 5 }, /* str=01011 */
            { 0xb, 4 }, /* str=1011 */
        },
        { /* i_total 5 */
            { 0xb, 7 }, /* str=0001011 */
            { 0x8, 5 }, /* str=01000 */
            { 0x9, 5 }, /* str=01001 */
            { 0xa, 4 }, /* str=1010 */
        },
        { /* i_total 6 */
            { 0x9, 7 }, /* str=0001001 */
            { 0xe, 6 }, /* str=001110 */
            { 0xd, 6 }, /* str=001101 */
            { 0x9, 4 }, /* str=1001 */
        },
        { /* i_total 7 */
            { 0x8, 7 }, /* str=0001000 */
            { 0xa, 6 }, /* str=001010 */
            { 0x9, 6 }, /* str=001001 */
            { 0x8, 4 }, /* str=1000 */
        },
        { /* i_total 8 */
            { 0xf, 8 }, /* str=00001111 */
            { 0xe, 7 }, /* str=0001110 */
            { 0xd, 7 }, /* str=0001101 */
            { 0xd, 5 }, /* str=01101 */
        },
        { /* i_total 9 */
            { 0xb, 8 }, /* str=00001011 */
            { 0xe, 8 }, /* str=00001110 */
            { 0xa, 7 }, /* str=0001010 */
            { 0xc, 6 }, /* str=001100 */
        },
        { /* i_total 10 */
            { 0xf, 9 }, /* str=000001111 */
            { 0xa, 8 }, /* str=00001010 */
            { 0xd, 8 }, /* str=00001101 */
            { 0xc, 7 }, /* str=0001100 */
        },
        { /* i_total 11 */
            { 0xb, 9 }, /* str=000001011 */
            { 0xe, 9 }, /* str=000001110 */
            { 0x9, 8 }, /* str=00001001 */
            { 0xc, 8 }, /* str=00001100 */
        },
        { /* i_total 12 */
            { 0x8, 9 }, /* str=000001000 */
            { 0xa, 9 }, /* str=000001010 */
            { 0xd, 9 }, /* str=000001101 */
            { 0x8, 8 }, /* str=00001000 */
        },
        { /* i_total 13 */
            { 0xd, 10 }, /* str=0000001101 */
            { 0x7, 9 },  /* str=000000111 */
            { 0x9, 9 },  /* str=000001001 */
            { 0xc, 9 },  /* str=000001100 */
        },
        { /* i_total 14 */
            { 0x9, 10 }, /* str=0000001001 */
            { 0xc, 10 }, /* str=0000001100 */
            { 0xb, 10 }, /* str=0000001011 */
            { 0xa, 10 }, /* str=0000001010 */
        },
        { /* i_total 15 */
            { 0x5, 10 }, /* str=0000000101 */
            { 0x8, 10 }, /* str=0000001000 */
            { 0x7, 10 }, /* str=0000000111 */
            { 0x6, 10 }, /* str=0000000110 */
        },
        { /* i_total 16 */
            { 0x1, 10 }, /* str=0000000001 */
            { 0x4, 10 }, /* str=0000000100 */
            { 0x3, 10 }, /* str=0000000011 */
            { 0x2, 10 }, /* str=0000000010 */
        },
    },
    { /* table 3 */
        { /* i_total 1 */
            { 0x0, 6 }, /* str=000000 */
            { 0x1, 6 }, /* str=000001 */
        },
        { /* i_total 2 */
            { 0x4, 6 }, /* str=000100 */
            { 0x5, 6 }, /* str=000101 */
            { 0x6, 6 }, /* str=000110 */
        },
        { /* i_total 3 */
            { 0x8, 6 }, /* str=001000 */
            { 0x9, 6 }, /* str=001001 */
            { 0xa, 6 }, /* str=001010 */
            { 0xb, 6 }, /* str=001011 */
        },
        { /* i_total 4 */
            { 0xc, 6 }, /* str=001100 */
            { 0xd, 6 }, /* str=001101 */
            { 0xe, 6 }, /* str=001110 */
            { 0xf, 6 }, /* str=001111 */
        },
        { /* i_total 5 */
            { 0x10, 6 }, /* str=010000 */
            { 0x11, 6 }, /* str=010001 */
            { 0x12, 6 }, /* str=010010 */
            { 0x13, 6 }, /* str=010011 */
        },
        { /* i_total 6 */
            { 0x14, 6 }, /* str=010100 */
            { 0x15, 6 }, /* str=010101 */
            { 0x16, 6 }, /* str=010110 */
            { 0x17, 6 }, /* str=010111 */
        },
        { /* i_total 7 */
            { 0x18, 6 }, /* str=011000 */
            { 0x19, 6 }, /* str=011001 */
            { 0x1a, 6 }, /* str=011010 */
            { 0x1b, 6 }, /* str=011011 */
        },
        { /* i_total 8 */
            { 0x1c, 6 }, /* str=011100 */
            { 0x1d, 6 }, /* str=011101 */
            { 0x1e, 6 }, /* str=011110 */
            { 0x1f, 6 }, /* str=011111 */
        },
        { /* i_total 9 */
            { 0x20, 6 }, /* str=100000 */
            { 0x21, 6 }, /* str=100001 */
            { 0x22, 6 }, /* str=100010 */
            { 0x23, 6 }, /* str=100011 */
        },
        { /* i_total 10 */
            { 0x24, 6 }, /* str=100100 */
            { 0x25, 6 }, /* str=100101 */
            { 0x26, 6 }, /* str=100110 */
            { 0x27, 6 }, /* str=100111 */
        },
        { /* i_total 11 */
            { 0x28, 6 }, /* str=101000 */
            { 0x29, 6 }, /* str=101001 */
            { 0x2a, 6 }, /* str=101010 */
            { 0x2b, 6 }, /* str=101011 */
        },
        { /* i_total 12 */
            { 0x2c, 6 }, /* str=101100 */
            { 0x2d, 6 }, /* str=101101 */
            { 0x2e, 6 }, /* str=101110 */
            { 0x2f, 6 }, /* str=101111 */
        },
        { /* i_total 13 */
            { 0x30, 6 }, /* str=110000 */
            { 0x31, 6 }, /* str=110001 */
            { 0x32, 6 }, /* str=110010 */
            { 0x33, 6 }, /* str=110011 */
        },
        { /* i_total 14 */
            { 0x34, 6 }, /* str=110100 */
            { 0x35, 6 }, /* str=110101 */
            { 0x36, 6 }, /* str=110110 */
            { 0x37, 6 }, /* str=110111 */
        },
        { /* i_total 15 */
            { 0x38, 6 }, /* str=111000 */
            { 0x39, 6 }, /* str=111001 */
            { 0x3a, 6 }, /* str=111010 */
            { 0x3b, 6 }, /* str=111011 */
        },
        { /* i_total 16 */
            { 0x3c, 6 }, /* str=111100 */
            { 0x3d, 6 }, /* str=111101 */
            { 0x3e, 6 }, /* str=111110 */
            { 0x3f, 6 }, /* str=111111 */
        },
    },
    { /* table 4 */
        { /* i_total 1 */
            { 0x7, 6 }, /* str=000111 */
            { 0x1, 1 }, /* str=1 */
        },
        { /* i_total 2 */
            { 0x4, 6 }, /* str=000100 */
            { 0x6, 6 }, /* str=000110 */
            { 0x1, 3 }, /* str=001 */
        },
        { /* i_total 3 */
            { 0x3, 6 }, /* str=000011 */
            { 0x3, 7 }, /* str=0000011 */
            { 0x2, 7 }, /* str=0000010 */
            { 0x5, 6 }, /* str=000101 */
        },
        { /* i_total 4 */
            { 0x2, 6 }, /* str=000010 */
            { 0x3, 8 }, /* str=00000011 */
            { 0x2, 8 }, /* str=00000010 */
            { 0x0, 7 }, /* str=0000000 */
        },
    },
    { /* table 5 */
        { /* i_total 1 */
            { 0xf, 7 }, /* str=0001111 */
            { 0x1, 2 }, /* str=01 */
        },
        { /* i_total 2 */
            { 0xe, 7 }, /* str=0001110 */
            { 0xd, 7 }, /* str=0001101 */
            { 0x1, 3 }, /* str=001 */
        },
        { /* i_total 3 */
            { 0x7, 9 }, /* str=000000111 */
            { 0xc, 7 }, /* str=0001100 */
            { 0xb, 7 }, /* str=0001011 */
            { 0x1, 5 }, /* str=00001 */
        },
        { /* i_total 4 */
            { 0x6, 9 }, /* str=000000110 */
            { 0x5, 9 }, /* str=000000101 */
            { 0xa, 7 }, /* str=0001010 */
            { 0x1, 6 }, /* str=000001 */
        },
        { /* i_total 5 */
            { 0x7, 10 }, /* str=0000000111 */
            { 0x6, 10 }, /* str=0000000110 */
            { 0x4, 9 },  /* str=000000100 */
            { 0x9, 7 },  /* str=0001001 */
        },
        { /* i_total 6 */
            { 0x7, 11 }, /* str=00000000111 */
            { 0x6, 11 }, /* str=00000000110 */
            { 0x5, 10 }, /* str=0000000101 */
            { 0x8, 7 },  /* str=0001000 */
        },
        { /* i_total 7 */
            { 0x7, 12 }, /* str=000000000111 */
            { 0x6, 12 }, /* str=000000000110 */
            { 0x5, 11 }, /* str=00000000101 */
            { 0x4, 10 }, /* str=0000000100 */
        },
        { /* i_total 8 */
            { 0x7, 13 }, /* str=0000000000111 */
            { 0x5, 12 }, /* str=000000000101 */
            { 0x4, 12 }, /* str=000000000100 */
            { 0x4, 11 }, /* str=00000000100 */
        },
    },
};

/* [i_total_coeff-1][i_total_zeros] */
const vlc_t x264_total_zeros[15][16] =
{
    { /* i_total 1 */
        { 0x1, 1 }, /* str=1 */
        { 0x3, 3 }, /* str=011 */
        { 0x2, 3 }, /* str=010 */
        { 0x3, 4 }, /* str=0011 */
        { 0x2, 4 }, /* str=0010 */
        { 0x3, 5 }, /* str=00011 */
        { 0x2, 5 }, /* str=00010 */
        { 0x3, 6 }, /* str=000011 */
        { 0x2, 6 }, /* str=000010 */
        { 0x3, 7 }, /* str=0000011 */
        { 0x2, 7 }, /* str=0000010 */
        { 0x3, 8 }, /* str=00000011 */
        { 0x2, 8 }, /* str=00000010 */
        { 0x3, 9 }, /* str=000000011 */
        { 0x2, 9 }, /* str=000000010 */
        { 0x1, 9 }, /* str=000000001 */
    },
    { /* i_total 2 */
        { 0x7, 3 }, /* str=111 */
        { 0x6, 3 }, /* str=110 */
        { 0x5, 3 }, /* str=101 */
        { 0x4, 3 }, /* str=100 */
        { 0x3, 3 }, /* str=011 */
        { 0x5, 4 }, /* str=0101 */
        { 0x4, 4 }, /* str=0100 */
        { 0x3, 4 }, /* str=0011 */
        { 0x2, 4 }, /* str=0010 */
        { 0x3, 5 }, /* str=00011 */
        { 0x2, 5 }, /* str=00010 */
        { 0x3, 6 }, /* str=000011 */
        { 0x2, 6 }, /* str=000010 */
        { 0x1, 6 }, /* str=000001 */
        { 0x0, 6 }, /* str=000000 */
    },
    { /* i_total 3 */
        { 0x5, 4 }, /* str=0101 */
        { 0x7, 3 }, /* str=111 */
        { 0x6, 3 }, /* str=110 */
        { 0x5, 3 }, /* str=101 */
        { 0x4, 4 }, /* str=0100 */
        { 0x3, 4 }, /* str=0011 */
        { 0x4, 3 }, /* str=100 */
        { 0x3, 3 }, /* str=011 */
        { 0x2, 4 }, /* str=0010 */
        { 0x3, 5 }, /* str=00011 */
        { 0x2, 5 }, /* str=00010 */
        { 0x1, 6 }, /* str=000001 */
        { 0x1, 5 }, /* str=00001 */
        { 0x0, 6 }, /* str=000000 */
    },
    { /* i_total 4 */
        { 0x3, 5 }, /* str=00011 */
        { 0x7, 3 }, /* str=111 */
        { 0x5, 4 }, /* str=0101 */
        { 0x4, 4 }, /* str=0100 */
        { 0x6, 3 }, /* str=110 */
        { 0x5, 3 }, /* str=101 */
        { 0x4, 3 }, /* str=100 */
        { 0x3, 4 }, /* str=0011 */
        { 0x3, 3 }, /* str=011 */
        { 0x2, 4 }, /* str=0010 */
        { 0x2, 5 }, /* str=00010 */
        { 0x1, 5 }, /* str=00001 */
        { 0x0, 5 }, /* str=00000 */
    },
    { /* i_total 5 */
        { 0x5, 4 }, /* str=0101 */
        { 0x4, 4 }, /* str=0100 */
        { 0x3, 4 }, /* str=0011 */
        { 0x7, 3 }, /* str=111 */
        { 0x6, 3 }, /* str=110 */
        { 0x5, 3 }, /* str=101 */
        { 0x4, 3 }, /* str=100 */
        { 0x3, 3 }, /* str=011 */
        { 0x2, 4 }, /* str=0010 */
        { 0x1, 5 }, /* str=00001 */
        { 0x1, 4 }, /* str=0001 */
        { 0x0, 5 }, /* str=00000 */
    },
    { /* i_total 6 */
        { 0x1, 6 }, /* str=000001 */
        { 0x1, 5 }, /* str=00001 */
        { 0x7, 3 }, /* str=111 */
        { 0x6, 3 }, /* str=110 */
        { 0x5, 3 }, /* str=101 */
        { 0x4, 3 }, /* str=100 */
        { 0x3, 3 }, /* str=011 */
        { 0x2, 3 }, /* str=010 */
        { 0x1, 4 }, /* str=0001 */
        { 0x1, 3 }, /* str=001 */
        { 0x0, 6 }, /* str=000000 */
    },
    { /* i_total 7 */
        { 0x1, 6 }, /* str=000001 */
        { 0x1, 5 }, /* str=00001 */
        { 0x5, 3 }, /* str=101 */
        { 0x4, 3 }, /* str=100 */
        { 0x3, 3 }, /* str=011 */
        { 0x3, 2 }, /* str=11 */
        { 0x2, 3 }, /* str=010 */
        { 0x1, 4 }, /* str=0001 */
        { 0x1, 3 }, /* str=001 */
        { 0x0, 6 }, /* str=000000 */
    },
    { /* i_total 8 */
        { 0x1, 6 }, /* str=000001 */
        { 0x1, 4 }, /* str=0001 */
        { 0x1, 5 }, /* str=00001 */
        { 0x3, 3 }, /* str=011 */
        { 0x3, 2 }, /* str=11 */
        { 0x2, 2 }, /* str=10 */
        { 0x2, 3 }, /* str=010 */
        { 0x1, 3 }, /* str=001 */
        { 0x0, 6 }, /* str=000000 */
    },
    { /* i_total 9 */
        { 0x1, 6 }, /* str=000001 */
        { 0x0, 6 }, /* str=000000 */
        { 0x1, 4 }, /* str=0001 */
        { 0x3, 2 }, /* str=11 */
        { 0x2, 2 }, /* str=10 */
        { 0x1, 3 }, /* str=001 */
        { 0x1, 2 }, /* str=01 */
        { 0x1, 5 }, /* str=00001 */
    },
    { /* i_total 10 */
        { 0x1, 5 }, /* str=00001 */
        { 0x0, 5 }, /* str=00000 */
        { 0x1, 3 }, /* str=001 */
        { 0x3, 2 }, /* str=11 */
        { 0x2, 2 }, /* str=10 */
        { 0x1, 2 }, /* str=01 */
        { 0x1, 4 }, /* str=0001 */
    },
    { /* i_total 11 */
        { 0x0, 4 }, /* str=0000 */
        { 0x1, 4 }, /* str=0001 */
        { 0x1, 3 }, /* str=001 */
        { 0x2, 3 }, /* str=010 */
        { 0x1, 1 }, /* str=1 */
        { 0x3, 3 }, /* str=011 */
    },
    { /* i_total 12 */
        { 0x0, 4 }, /* str=0000 */
        { 0x1, 4 }, /* str=0001 */
        { 0x1, 2 }, /* str=01 */
        { 0x1, 1 }, /* str=1 */
        { 0x1, 3 }, /* str=001 */
    },
    { /* i_total 13 */
        { 0x0, 3 }, /* str=000 */
        { 0x1, 3 }, /* str=001 */
        { 0x1, 1 }, /* str=1 */
        { 0x1, 2 }, /* str=01 */
    },
    { /* i_total 14 */
        { 0x0, 2 }, /* str=00 */
        { 0x1, 2 }, /* str=01 */
        { 0x1, 1 }, /* str=1 */
    },
    { /* i_total 15 */
        { 0x0, 1 }, /* str=0 */
        { 0x1, 1 }, /* str=1 */
    },
};

/* [i_total_coeff-1][i_total_zeros] */
const vlc_t x264_total_zeros_2x2_dc[3][4] =
{
    { /* i_total 1 */
        { 0x1, 1 }, /* str=1 */
        { 0x1, 2 }, /* str=01 */
        { 0x1, 3 }, /* str=001 */
        { 0x0, 3 }  /* str=000 */
    },
    { /* i_total 2 */
        { 0x1, 1 }, /* str=1 */
        { 0x1, 2 }, /* str=01 */
        { 0x0, 2 }, /* str=00 */
    },
    { /* i_total 3 */
        { 0x1, 1 }, /* str=1 */
        { 0x0, 1 }, /* str=0 */
    },
};

/* [i_total_coeff-1][i_total_zeros] */
const vlc_t x264_total_zeros_2x4_dc[7][8] =
{
    { /* i_total 1 */
        { 0x1, 1 }, /* str=1 */
        { 0x2, 3 }, /* str=010 */
        { 0x3, 3 }, /* str=011 */
        { 0x2, 4 }, /* str=0010 */
        { 0x3, 4 }, /* str=0011 */
        { 0x1, 4 }, /* str=0001 */
        { 0x1, 5 }, /* str=00001 */
        { 0x0, 5 }, /* str=00000 */
    },
    { /* i_total 2 */
        { 0x0, 3 }, /* str=000 */
        { 0x1, 2 }, /* str=01 */
        { 0x1, 3 }, /* str=001 */
        { 0x4, 3 }, /* str=100 */
        { 0x5, 3 }, /* str=101 */
        { 0x6, 3 }, /* str=110 */
        { 0x7, 3 }, /* str=111 */
    },
    { /* i_total 3 */
        { 0x0, 3 }, /* str=000 */
        { 0x1, 3 }, /* str=001 */
        { 0x1, 2 }, /* str=01 */
        { 0x2, 2 }, /* str=10 */
        { 0x6, 3 }, /* str=110 */
        { 0x7, 3 }, /* str=111 */
    },
    { /* i_total 4 */
        { 0x6, 3 }, /* str=110 */
        { 0x0, 2 }, /* str=00 */
        { 0x1, 2 }, /* str=01 */
        { 0x2, 2 }, /* str=10 */
        { 0x7, 3 }, /* str=111 */
    },
    { /* i_total 5 */
        { 0x0, 2 }, /* str=00 */
        { 0x1, 2 }, /* str=01 */
        { 0x2, 2 }, /* str=10 */
        { 0x3, 2 }, /* str=11 */
    },
    { /* i_total 6 */
        { 0x0, 2 }, /* str=00 */
        { 0x1, 2 }, /* str=01 */
        { 0x1, 1 }, /* str=1 */
    },
    { /* i_total 7 */
        { 0x0, 1 }, /* str=0 */
        { 0x1, 1 }, /* str=1 */
    }
};

/* [MIN( i_zero_left-1, 6 )][run_before] */
const vlc_t x264_run_before[7][16] =
{
    { /* i_zero_left 1 */
        { 0x1, 1 }, /* str=1 */
        { 0x0, 1 }, /* str=0 */
    },
    { /* i_zero_left 2 */
        { 0x1, 1 }, /* str=1 */
        { 0x1, 2 }, /* str=01 */
        { 0x0, 2 }, /* str=00 */
    },
    { /* i_zero_left 3 */
        { 0x3, 2 }, /* str=11 */
        { 0x2, 2 }, /* str=10 */
        { 0x1, 2 }, /* str=01 */
        { 0x0, 2 }, /* str=00 */
    },
    { /* i_zero_left 4 */
        { 0x3, 2 }, /* str=11 */
        { 0x2, 2 }, /* str=10 */
        { 0x1, 2 }, /* str=01 */
        { 0x1, 3 }, /* str=001 */
        { 0x0, 3 }, /* str=000 */
    },
    { /* i_zero_left 5 */
        { 0x3, 2 }, /* str=11 */
        { 0x2, 2 }, /* str=10 */
        { 0x3, 3 }, /* str=011 */
        { 0x2, 3 }, /* str=010 */
        { 0x1, 3 }, /* str=001 */
        { 0x0, 3 }, /* str=000 */
    },
    { /* i_zero_left 6 */
        { 0x3, 2 }, /* str=11 */
        { 0x0, 3 }, /* str=000 */
        { 0x1, 3 }, /* str=001 */
        { 0x3, 3 }, /* str=011 */
        { 0x2, 3 }, /* str=010 */
        { 0x5, 3 }, /* str=101 */
        { 0x4, 3 }, /* str=100 */
    },
    { /* i_zero_left >6 */
        { 0x7, 3 }, /* str=111 */
        { 0x6, 3 }, /* str=110 */
        { 0x5, 3 }, /* str=101 */
        { 0x4, 3 }, /* str=100 */
        { 0x3, 3 }, /* str=011 */
        { 0x2, 3 }, /* str=010 */
        { 0x1, 3 }, /* str=001 */
        { 0x1, 4 }, /* str=0001 */
        { 0x1, 5 }, /* str=00001 */
        { 0x1, 6 }, /* str=000001 */
        { 0x1, 7 }, /* str=0000001 */
        { 0x1, 8 }, /* str=00000001 */
        { 0x1, 9 }, /* str=000000001 */
        { 0x1, 10 }, /* str=0000000001 */
        { 0x1, 11 }, /* str=00000000001 */
    },
};
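
/* Illustrative sketch (not part of the original source): all of the VLC
 * tables above are consumed the same way -- index the table as described in
 * the comment before it, then emit i_size bits of i_bits.  This assumes
 * x264's bs_write( bs_t *s, int i_count, uint32_t i_bits ) bitstream writer. */
#if 0
static void write_run_before( bs_t *s, int i_zero_left, int i_run_before )
{
    /* index per the [MIN( i_zero_left-1, 6 )][run_before] convention above */
    const vlc_t *v = &x264_run_before[X264_MIN( i_zero_left - 1, 6 )][i_run_before];
    bs_write( s, v->i_size, v->i_bits );
}
#endif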

vlc_large_t x264_level_token[7][LEVEL_TABLE_SIZE];

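/* Precompute the CAVLC level token (bit pattern, code size and the suffix
 * length to use for the next coefficient) for every suffix length and every
 * level in [-LEVEL_TABLE_SIZE/2, LEVEL_TABLE_SIZE/2), so the encoder can
 * emit a level with a single table lookup instead of redoing the prefix and
 * escape-code logic per coefficient. */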
void x264_cavlc_init( void )
{
    for( int i_suffix = 0; i_suffix < 7; i_suffix++ )
        for( int16_t level = -LEVEL_TABLE_SIZE/2; level < LEVEL_TABLE_SIZE/2; level++ )
        {
            int mask = level >> 15;                /* sign mask: 0 if level >= 0, -1 if level < 0 */
            int abs_level = (level^mask)-mask;     /* branchless abs(level) */
            int i_level_code = abs_level*2-mask-2; /* 2*abs-2 for positive levels, 2*abs-1 for negative */
            int i_next = i_suffix;
            vlc_large_t *vlc = &x264_level_token[i_suffix][level+LEVEL_TABLE_SIZE/2];

            if( ( i_level_code >> i_suffix ) < 14 )
            {
                vlc->i_size = (i_level_code >> i_suffix) + 1 + i_suffix;
                vlc->i_bits = (1<<i_suffix) + (i_level_code & ((1<<i_suffix)-1));
            }
            else if( i_suffix == 0 && i_level_code < 30 )
            {
                vlc->i_size = 19;
                vlc->i_bits = (1<<4) + (i_level_code - 14);
            }
            else if( i_suffix > 0 && ( i_level_code >> i_suffix ) == 14 )
            {
                vlc->i_size = 15 + i_suffix;
                vlc->i_bits = (1<<i_suffix) + (i_level_code & ((1<<i_suffix)-1));
            }
            else
            {
                i_level_code -= 15 << i_suffix;
                if( i_suffix == 0 )
                    i_level_code -= 15;
                vlc->i_size = 28;
                vlc->i_bits = (1<<12) + i_level_code;
            }
            if( i_next == 0 )
                i_next++;
            if( abs_level > (3 << (i_next-1)) && i_next < 6 )
                i_next++;
            vlc->i_next = i_next;
        }
}
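
/* Illustrative sketch (not part of the original source): how an encoder
 * would consume the precomputed table above when writing one residual
 * level.  Assumes x264's bs_write interface; the names here are for
 * illustration only. */
#if 0
static int write_level( bs_t *s, int16_t level, int i_suffix )
{
    vlc_large_t vlc = x264_level_token[i_suffix][level + LEVEL_TABLE_SIZE/2];
    bs_write( s, vlc.i_size, vlc.i_bits );
    return vlc.i_next; /* suffix length to use for the next level */
}
#endif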

x264-snapshot-20120103-2245-stable/common/visualize.h
/*****************************************************************************
 * visualize.h: visualization
 *****************************************************************************
 * Copyright (C) 2005-2011 x264 project
 *
 * Authors: Tuukka Toivonen <tuukkat@ee.oulu.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#ifndef X264_VISUALIZE_H
#define X264_VISUALIZE_H

#include "common/common.h"

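/* Typical call sequence when visualization is enabled: _init once per
 * encoder, _mb for every macroblock as it is encoded, _show after each
 * completed frame, _close on shutdown. */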
int  x264_visualize_init( x264_t *h );
void x264_visualize_mb( x264_t *h );
void x264_visualize_show( x264_t *h );
void x264_visualize_close( x264_t *h );

#endif

x264-snapshot-20120103-2245-stable/common/visualize.c
/*****************************************************************************
 * visualize.c: visualization
 *****************************************************************************
 * Copyright (C) 2005-2011 x264 project
 *
 * Authors: Tuukka Toivonen <tuukkat@ee.oulu.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

/*
 * Some explanation of the symbols used:
 * Red/pink: intra block
 * Blue: inter block
 * Green: skip block
 * Yellow: B-block (not visualized properly yet)
 *
 * Motion vectors have a black dot at their target (i.e. at the MB center)
 * instead of an arrowhead. The black dot is enclosed in a filled diamond
 * whose radius depends on the reference frame number (one frame back =
 * zero width, the normal case).
 *
 * Intra blocks generally have lines drawn perpendicular to the prediction
 * direction: for example, a pink block with a horizontal line at its top
 * is predicted by assuming the luma to be vertically constant.
 * DC-predicted blocks have both horizontal and vertical lines, and
 * pink blocks with a diagonal line are predicted using the planar function.
 */

#include "common.h"
#include "visualize.h"
#include "display.h"

typedef struct
{
    int     i_type;
    int     i_partition;
    int     i_sub_partition[4];
    int     i_intra16x16_pred_mode;
    int     intra4x4_pred_mode[4][4];
    int8_t  ref[2][4][4];                  /* [list][y][x] */
    int16_t mv[2][4][4][2];                /* [list][y][x][mvxy] */
} visualize_t;

/* Return string from stringlist corresponding to the given code */
#define GET_STRING(sl, code) get_string((sl), sizeof(sl)/sizeof(*(sl)), code)

typedef struct
{
    int code;
    char *string;
} stringlist_t;

static char *get_string( const stringlist_t *sl, int entries, int code )
{
    for( int i = 0; i < entries; i++ )
        if( sl[i].code == code )
            return sl[i].string;
    return "?";
}

/* Plot motion vector */
static void mv( int x0, int y0, int16_t dmv[2], int ref, int zoom, char *col )
{
    int dx = dmv[0];
    int dy = dmv[1];

    dx = (dx * zoom + 2) >> 2;
    dy = (dy * zoom + 2) >> 2;
    disp_line( 0, x0, y0, x0+dx, y0+dy );
    for( int i = 1; i < ref; i++ )
    {
        disp_line( 0, x0  , y0-i, x0+i, y0   );
        disp_line( 0, x0+i, y0  , x0  , y0+i );
        disp_line( 0, x0  , y0+i, x0-i, y0   );
        disp_line( 0, x0-i, y0  , x0  , y0-i );
    }
    disp_setcolor( "black" );
    disp_point( 0, x0, y0 );
    disp_setcolor( col );
}

int x264_visualize_init( x264_t *h )
{
    CHECKED_MALLOC( h->visualize, h->mb.i_mb_width * h->mb.i_mb_height * sizeof(visualize_t) );
    return 0;
fail:
    return -1;
}

void x264_visualize_mb( x264_t *h )
{
    visualize_t *v = (visualize_t*)h->visualize + h->mb.i_mb_xy;

    /* Save all data for the MB that we need for drawing the visualization */
    v->i_type = h->mb.i_type;
    v->i_partition = h->mb.i_partition;
    for( int i = 0; i < 4; i++ )
        v->i_sub_partition[i] = h->mb.i_sub_partition[i];
    for( int y = 0; y < 4; y++ )
        for( int x = 0; x < 4; x++ )
            v->intra4x4_pred_mode[y][x] = h->mb.cache.intra4x4_pred_mode[X264_SCAN8_0+y*8+x];
    for( int l = 0; l < 2; l++ )
        for( int y = 0; y < 4; y++ )
            for( int x = 0; x < 4; x++ )
            {
                for( int i = 0; i < 2; i++ )
                    v->mv[l][y][x][i] = h->mb.cache.mv[l][X264_SCAN8_0+y*8+x][i];
                v->ref[l][y][x] = h->mb.cache.ref[l][X264_SCAN8_0+y*8+x];
            }
    v->i_intra16x16_pred_mode = h->mb.i_intra16x16_pred_mode;
}

void x264_visualize_close( x264_t *h )
{
    x264_free(h->visualize);
}

/* Display visualization (block types, MVs) of the encoded frame */
/* FIXME: B-type MBs are not properly handled yet */
void x264_visualize_show( x264_t *h )
{
    static const stringlist_t mb_types[] =
    {
        /* Block types marked as NULL will not be drawn */
        { I_4x4   , "red" },
        { I_8x8   , "#ff5640" },
        { I_16x16 , "#ff8060" },
        { I_PCM   , "violet" },
        { P_L0    , "SlateBlue" },
        { P_8x8   , "blue" },
        { P_SKIP  , "green" },
        { B_DIRECT, "yellow" },
        { B_L0_L0 , "yellow" },
        { B_L0_L1 , "yellow" },
        { B_L0_BI , "yellow" },
        { B_L1_L0 , "yellow" },
        { B_L1_L1 , "yellow" },
        { B_L1_BI , "yellow" },
        { B_BI_L0 , "yellow" },
        { B_BI_L1 , "yellow" },
        { B_BI_BI , "yellow" },
        { B_8x8   , "yellow" },
        { B_SKIP  , "yellow" },
    };

    static const int waitkey = 1;     /* Wait for enter after each frame */
    static const int drawbox = 1;     /* Draw box around each block */
    static const int borders = 0;     /* Display extrapolated borders outside frame */
    static const int zoom = 2;        /* Zoom factor */

    static const int pad = 32;
    pixel *const frame = h->fdec->plane[0];
    const int width = h->param.i_width;
    const int height = h->param.i_height;
    const int stride = h->fdec->i_stride[0];

    if( borders )
        disp_gray_zoom( 0, frame - pad*stride - pad, width+2*pad, height+2*pad, stride, "fdec", zoom );
    else
        disp_gray_zoom( 0, frame, width, height, stride, "fdec", zoom );

    for( int mb_xy = 0; mb_xy < h->mb.i_mb_width * h->mb.i_mb_height; mb_xy++ )
    {
        visualize_t *const v = (visualize_t*)h->visualize + mb_xy;
        const int mb_y = mb_xy / h->mb.i_mb_width;
        const int mb_x = mb_xy % h->mb.i_mb_width;
        char *const col = GET_STRING( mb_types, v->i_type );
        int x = mb_x*16*zoom;
        int y = mb_y*16*zoom;
        int l = 0;

        if( !col )
            continue;

        if( borders )
        {
            x += pad*zoom;
            y += pad*zoom;
        }

        disp_setcolor( col );
        if( drawbox ) disp_rect( 0, x, y, x+16*zoom-1, y+16*zoom-1 );

        if( v->i_type==P_L0 || v->i_type==P_8x8 || v->i_type==P_SKIP )
        {
            /* Predicted (inter) mode, with motion vector */
            if( v->i_partition == D_16x16 || v->i_type == P_SKIP )
                mv( x+8*zoom, y+8*zoom, v->mv[l][0][0], v->ref[l][0][0], zoom, col );
            else if( v->i_partition == D_16x8 )
            {
                if( drawbox ) disp_rect( 0, x, y, x+16*zoom, y+8*zoom );
                mv( x+8*zoom, y+4*zoom, v->mv[l][0][0], v->ref[l][0][0], zoom, col );
                if( drawbox ) disp_rect( 0, x, y+8*zoom, x+16*zoom, y+16*zoom );
                mv( x+8*zoom, y+12*zoom, v->mv[l][2][0], v->ref[l][2][0], zoom, col );
            }
            else if( v->i_partition==D_8x16 )
            {
                if( drawbox ) disp_rect( 0, x,          y, x+8*zoom,  y+16*zoom );
                mv( x+4*zoom, y+8*zoom, v->mv[l][0][0], v->ref[l][0][0], zoom, col );
                if( drawbox ) disp_rect( 0, x+8*zoom,   y, x+16*zoom, y+16*zoom );
                mv( x+12*zoom, y+8*zoom, v->mv[l][0][2], v->ref[l][0][2], zoom, col );
            }
            else if( v->i_partition==D_8x8 )
            {
                for( int i = 0; i < 2; i++ )
                    for( int j = 0; j < 2; j++ )
                    {
                        int sp = v->i_sub_partition[i*2+j];
                        const int x0 = x + j*8*zoom;
                        const int y0 = y + i*8*zoom;
                        l = x264_mb_partition_listX_table[0][sp] ? 0 : 1; /* FIXME: not tested if this works */
                        if( IS_SUB8x8(sp) )
                        {
                            if( drawbox ) disp_rect( 0, x0, y0, x0+8*zoom, y0+8*zoom );
                            mv( x0+4*zoom, y0+4*zoom, v->mv[l][2*i][2*j], v->ref[l][2*i][2*j], zoom, col );
                        }
                        else if( IS_SUB8x4(sp) )
                        {
                            if( drawbox ) disp_rect( 0, x0, y0, x0+8*zoom, y0+4*zoom );
                            if( drawbox ) disp_rect( 0, x0, y0+4*zoom, x0+8*zoom, y0+8*zoom );
                            mv( x0+4*zoom, y0+2*zoom, v->mv[l][2*i][2*j], v->ref[l][2*i][2*j], zoom, col );
                            mv( x0+4*zoom, y0+6*zoom, v->mv[l][2*i+1][2*j], v->ref[l][2*i+1][2*j], zoom, col );
                        }
                        else if( IS_SUB4x8(sp) )
                        {
                            if( drawbox ) disp_rect( 0, x0, y0, x0+4*zoom, y0+8*zoom );
                            if( drawbox ) disp_rect( 0, x0+4*zoom, y0, x0+8*zoom, y0+8*zoom );
                            mv( x0+2*zoom, y0+4*zoom, v->mv[l][2*i][2*j], v->ref[l][2*i][2*j], zoom, col );
                            mv( x0+6*zoom, y0+4*zoom, v->mv[l][2*i][2*j+1], v->ref[l][2*i][2*j+1], zoom, col );
                        }
                        else if( IS_SUB4x4(sp) )
                        {
                            if( drawbox ) disp_rect( 0, x0, y0, x0+4*zoom, y0+4*zoom );
                            if( drawbox ) disp_rect( 0, x0+4*zoom, y0, x0+8*zoom, y0+4*zoom );
                            if( drawbox ) disp_rect( 0, x0, y0+4*zoom, x0+4*zoom, y0+8*zoom );
                            if( drawbox ) disp_rect( 0, x0+4*zoom, y0+4*zoom, x0+8*zoom, y0+8*zoom );
                            mv( x0+2*zoom, y0+2*zoom, v->mv[l][2*i][2*j], v->ref[l][2*i][2*j], zoom, col );
                            mv( x0+6*zoom, y0+2*zoom, v->mv[l][2*i][2*j+1], v->ref[l][2*i][2*j+1], zoom, col );
                            mv( x0+2*zoom, y0+6*zoom, v->mv[l][2*i+1][2*j], v->ref[l][2*i+1][2*j], zoom, col );
                            mv( x0+6*zoom, y0+6*zoom, v->mv[l][2*i+1][2*j+1], v->ref[l][2*i+1][2*j+1], zoom, col );
                        }
                    }
            }
        }

        if( IS_INTRA(v->i_type) || v->i_type == I_PCM )
        {
            /* Intra coded */
            if( v->i_type == I_16x16 )
            {
                switch( v->i_intra16x16_pred_mode )
                {
                case I_PRED_16x16_V:
                    disp_line( 0, x+2*zoom, y+2*zoom, x+14*zoom, y+2*zoom );
                    break;
                case I_PRED_16x16_H:
                    disp_line( 0, x+2*zoom, y+2*zoom, x+2*zoom, y+14*zoom );
                    break;
                case I_PRED_16x16_DC:
                case I_PRED_16x16_DC_LEFT:
                case I_PRED_16x16_DC_TOP:
                case I_PRED_16x16_DC_128:
                    disp_line( 0, x+2*zoom, y+2*zoom, x+14*zoom, y+2*zoom );
                    disp_line( 0, x+2*zoom, y+2*zoom, x+2*zoom, y+14*zoom );
                    break;
                case I_PRED_16x16_P:
                    disp_line( 0, x+2*zoom, y+2*zoom, x+8*zoom, y+8*zoom );
                    break;
                }
            }
            if( v->i_type==I_4x4 || v->i_type==I_8x8 )
            {
                const int di = v->i_type == I_8x8 ? 2 : 1;
                const int zoom2 = zoom * di;
                for( int i = 0; i < 4; i += di )
                    for( int j = 0; j < 4; j += di )
                    {
                        const int x0 = x + j*4*zoom;
                        const int y0 = y + i*4*zoom;
                        if( drawbox ) disp_rect( 0, x0, y0, x0+4*zoom2, y0+4*zoom2 );
                        switch( v->intra4x4_pred_mode[i][j] )
                        {
                            case I_PRED_4x4_V:        /* Vertical */
                                disp_line( 0, x0+0*zoom2, y0+1*zoom2, x0+4*zoom2, y0+1*zoom2 );
                                break;
                            case I_PRED_4x4_H:        /* Horizontal */
                                disp_line( 0, x0+1*zoom2, y0+0*zoom2, x0+1*zoom2, y0+4*zoom2 );
                                break;
                            case I_PRED_4x4_DC:        /* DC, average from top and left sides */
                            case I_PRED_4x4_DC_LEFT:
                            case I_PRED_4x4_DC_TOP:
                            case I_PRED_4x4_DC_128:
                                disp_line( 0, x0+1*zoom2, y0+1*zoom2, x0+4*zoom2, y0+1*zoom2 );
                                disp_line( 0, x0+1*zoom2, y0+1*zoom2, x0+1*zoom2, y0+4*zoom2 );
                                break;
                            case I_PRED_4x4_DDL:    /* Topright-bottomleft */
                                disp_line( 0, x0+0*zoom2, y0+0*zoom2, x0+4*zoom2, y0+4*zoom2 );
                                break;
                            case I_PRED_4x4_DDR:    /* Topleft-bottomright */
                                disp_line( 0, x0+0*zoom2, y0+4*zoom2, x0+4*zoom2, y0+0*zoom2 );
                                break;
                            case I_PRED_4x4_VR:        /* Mix of topleft-bottomright and vertical */
                                disp_line( 0, x0+0*zoom2, y0+2*zoom2, x0+4*zoom2, y0+1*zoom2 );
                                break;
                            case I_PRED_4x4_HD:        /* Mix of topleft-bottomright and horizontal */
                                disp_line( 0, x0+2*zoom2, y0+0*zoom2, x0+1*zoom2, y0+4*zoom2 );
                                break;
                            case I_PRED_4x4_VL:        /* Mix of topright-bottomleft and vertical */
                                disp_line( 0, x0+0*zoom2, y0+1*zoom2, x0+4*zoom2, y0+2*zoom2 );
                                break;
                            case I_PRED_4x4_HU:        /* Mix of topright-bottomleft and horizontal */
                                disp_line( 0, x0+1*zoom2, y0+0*zoom2, x0+2*zoom2, y0+4*zoom2 );
                                break;
                        }
                    }
            }
        }
    }

    disp_sync();
    if( waitkey )
        getchar();
}
/* }}} */

//EOF

x264-snapshot-20120103-2245-stable/common/threadpool.h
/*****************************************************************************
 * threadpool.h: thread pooling
 *****************************************************************************
 * Copyright (C) 2010-2011 x264 project
 *
 * Authors: Steven Walters <kemuri9@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#ifndef X264_THREADPOOL_H
#define X264_THREADPOOL_H

typedef struct x264_threadpool_t x264_threadpool_t;

#if HAVE_THREAD
int   x264_threadpool_init( x264_threadpool_t **p_pool, int threads,
                            void (*init_func)(void *), void *init_arg );
void  x264_threadpool_run( x264_threadpool_t *pool, void *(*func)(void *), void *arg );
void *x264_threadpool_wait( x264_threadpool_t *pool, void *arg );
void  x264_threadpool_delete( x264_threadpool_t *pool );
#else
#define x264_threadpool_init(p,t,f,a) -1
#define x264_threadpool_run(p,f,a)
#define x264_threadpool_wait(p,a)     NULL
#define x264_threadpool_delete(p)
#endif

#endif

x264-snapshot-20120103-2245-stable/common/threadpool.c
/*****************************************************************************
 * threadpool.c: thread pooling
 *****************************************************************************
 * Copyright (C) 2010-2011 x264 project
 *
 * Authors: Steven Walters <kemuri9@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "common.h"

typedef struct
{
    void *(*func)(void *);
    void *arg;
    void *ret;
} x264_threadpool_job_t;

struct x264_threadpool_t
{
    int            exit;
    int            threads;
    x264_pthread_t *thread_handle;
    void           (*init_func)(void *);
    void           *init_arg;

    /* requires a synchronized list structure and associated methods,
       so use what is already implemented for frames */
    x264_sync_frame_list_t uninit; /* list of jobs that are awaiting use */
    x264_sync_frame_list_t run;    /* list of jobs that are queued for processing by the pool */
    x264_sync_frame_list_t done;   /* list of jobs that have finished processing */
};

static void x264_threadpool_thread( x264_threadpool_t *pool )
{
    if( pool->init_func )
        pool->init_func( pool->init_arg );

    while( !pool->exit )
    {
        x264_threadpool_job_t *job = NULL;
        x264_pthread_mutex_lock( &pool->run.mutex );
        while( !pool->exit && !pool->run.i_size )
            x264_pthread_cond_wait( &pool->run.cv_fill, &pool->run.mutex );
        if( pool->run.i_size )
        {
            job = (void*)x264_frame_shift( pool->run.list );
            pool->run.i_size--;
        }
        x264_pthread_mutex_unlock( &pool->run.mutex );
        if( !job )
            continue;
        job->ret = job->func( job->arg ); /* execute the function */
        x264_sync_frame_list_push( &pool->done, (void*)job );
    }
}

int x264_threadpool_init( x264_threadpool_t **p_pool, int threads,
                          void (*init_func)(void *), void *init_arg )
{
    if( threads <= 0 )
        return -1;

    x264_threadpool_t *pool;
    CHECKED_MALLOCZERO( pool, sizeof(x264_threadpool_t) );
    *p_pool = pool;

    pool->init_func = init_func;
    pool->init_arg  = init_arg;
    pool->threads   = X264_MIN( threads, X264_THREAD_MAX );

    CHECKED_MALLOC( pool->thread_handle, pool->threads * sizeof(x264_pthread_t) );

    if( x264_sync_frame_list_init( &pool->uninit, pool->threads ) ||
        x264_sync_frame_list_init( &pool->run, pool->threads ) ||
        x264_sync_frame_list_init( &pool->done, pool->threads ) )
        goto fail;

    for( int i = 0; i < pool->threads; i++ )
    {
        x264_threadpool_job_t *job;
        CHECKED_MALLOC( job, sizeof(x264_threadpool_job_t) );
        x264_sync_frame_list_push( &pool->uninit, (void*)job );
    }
    for( int i = 0; i < pool->threads; i++ )
        if( x264_pthread_create( pool->thread_handle+i, NULL, (void*)x264_threadpool_thread, pool ) )
            goto fail;

    return 0;
fail:
    return -1;
}

void x264_threadpool_run( x264_threadpool_t *pool, void *(*func)(void *), void *arg )
{
    x264_threadpool_job_t *job = (void*)x264_sync_frame_list_pop( &pool->uninit );
    job->func = func;
    job->arg  = arg;
    x264_sync_frame_list_push( &pool->run, (void*)job );
}

void *x264_threadpool_wait( x264_threadpool_t *pool, void *arg )
{
    x264_threadpool_job_t *job = NULL;

    x264_pthread_mutex_lock( &pool->done.mutex );
    while( !job )
    {
        for( int i = 0; i < pool->done.i_size; i++ )
        {
            x264_threadpool_job_t *t = (void*)pool->done.list[i];
            if( t->arg == arg )
            {
                job = (void*)x264_frame_shift( pool->done.list+i );
                pool->done.i_size--;
            }
        }
        if( !job )
            x264_pthread_cond_wait( &pool->done.cv_fill, &pool->done.mutex );
    }
    x264_pthread_mutex_unlock( &pool->done.mutex );

    void *ret = job->ret;
    x264_sync_frame_list_push( &pool->uninit, (void*)job );
    return ret;
}

static void x264_threadpool_list_delete( x264_sync_frame_list_t *slist )
{
    for( int i = 0; slist->list[i]; i++ )
    {
        x264_free( slist->list[i] );
        slist->list[i] = NULL;
    }
    x264_sync_frame_list_delete( slist );
}

void x264_threadpool_delete( x264_threadpool_t *pool )
{
    x264_pthread_mutex_lock( &pool->run.mutex );
    pool->exit = 1;
    x264_pthread_cond_broadcast( &pool->run.cv_fill );
    x264_pthread_mutex_unlock( &pool->run.mutex );
    for( int i = 0; i < pool->threads; i++ )
        x264_pthread_join( pool->thread_handle[i], NULL );

    x264_threadpool_list_delete( &pool->uninit );
    x264_threadpool_list_delete( &pool->run );
    x264_threadpool_list_delete( &pool->done );
    x264_free( pool->thread_handle );
    x264_free( pool );
}
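
/* Illustrative sketch (not part of the original source): the threadpool
 * lifecycle as implied by the API above.  A job is queued with _run and its
 * result retrieved later with _wait, matched by the same arg pointer.  The
 * function and variable names here are hypothetical. */
#if 0
static void *double_it( void *arg )
{
    int *x = arg;
    *x *= 2;     /* do some work */
    return arg;  /* returned pointer is what _wait hands back */
}

static int threadpool_example( void )
{
    x264_threadpool_t *pool;
    int value = 21;
    if( x264_threadpool_init( &pool, 4, NULL, NULL ) < 0 )
        return -1;
    x264_threadpool_run( pool, double_it, &value );  /* enqueue the job */
    int *ret = x264_threadpool_wait( pool, &value ); /* block until this job is done */
    x264_threadpool_delete( pool );                  /* join threads, free lists */
    return *ret == 42 ? 0 : -1;
}
#endif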

x264-snapshot-20120103-2245-stable/common/sparc/pixel.h
/*****************************************************************************
 * pixel.h: sparc pixel metrics
 *****************************************************************************
 * Copyright (C) 2005-2011 x264 project
 *
 * Authors: Phil Jensen <philj@csufresno.edu>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#ifndef X264_SPARC_PIXEL_H
#define X264_SPARC_PIXEL_H

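/* Each function returns the sum of absolute differences between two pixel
 * blocks of the given size; the arguments are ( pix1, stride1, pix2, stride2 ),
 * matching the %i0..%i3 register usage in the VIS assembly. */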
int x264_pixel_sad_8x8_vis( uint8_t *, int, uint8_t *, int );
int x264_pixel_sad_8x16_vis( uint8_t *, int, uint8_t *, int );
int x264_pixel_sad_16x8_vis( uint8_t *, int, uint8_t *, int );
int x264_pixel_sad_16x16_vis( uint8_t *, int, uint8_t *, int );

#endif

x264-snapshot-20120103-2245-stable/common/sparc/pixel.asm
/*****************************************************************************
 * pixel.asm: sparc pixel metrics
 *****************************************************************************
 * Copyright (C) 2005-2011 x264 project
 *
 * Authors: Phil Jensen <philj@csufresno.edu>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

! VIS optimized SAD for UltraSPARC
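!
! Structure of the code below: each fully unrolled step handles one 8-byte
! row from each input.  alignaddr/faligndata load a possibly unaligned
! 8-byte row from pix1 and from pix2, and pdist adds the sum of absolute
! byte differences to the 64-bit accumulator in %f12.  The 16-wide variants
! reduce the strides by 8 up front, then alternate stepping 8 bytes right
! and one (reduced) stride down.  At the end, the std/ld pair moves the low
! 32 bits of the accumulator into the integer return register %i0.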

.text
.global x264_pixel_sad_8x8_vis
x264_pixel_sad_8x8_vis:
	save %sp, -120, %sp

	fzero %f12

	alignaddr %i0, %g0, %l0
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, %i1, %i0
	add %i2, %i3, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, %i1, %i0
	add %i2, %i3, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, %i1, %i0
	add %i2, %i3, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, %i1, %i0
	add %i2, %i3, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, %i1, %i0
	add %i2, %i3, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, %i1, %i0
	add %i2, %i3, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, %i1, %i0
	add %i2, %i3, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, %i1, %i0
	add %i2, %i3, %i2
	pdist %f4, %f10, %f12

	std %f12, [%fp-24]
	ld [%fp-20], %i0

	ret
	restore

.global x264_pixel_sad_8x16_vis
x264_pixel_sad_8x16_vis:
	save %sp, -120, %sp

	fzero %f12

	alignaddr %i0, %g0, %l0
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, %i1, %i0
	add %i2, %i3, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, %i1, %i0
	add %i2, %i3, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, %i1, %i0
	add %i2, %i3, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, %i1, %i0
	add %i2, %i3, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, %i1, %i0
	add %i2, %i3, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, %i1, %i0
	add %i2, %i3, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, %i1, %i0
	add %i2, %i3, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, %i1, %i0
	add %i2, %i3, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, %i1, %i0
	add %i2, %i3, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, %i1, %i0
	add %i2, %i3, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, %i1, %i0
	add %i2, %i3, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, %i1, %i0
	add %i2, %i3, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, %i1, %i0
	add %i2, %i3, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, %i1, %i0
	add %i2, %i3, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, %i1, %i0
	add %i2, %i3, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, %i1, %i0
	add %i2, %i3, %i2
	pdist %f4, %f10, %f12

	std %f12, [%fp-24]
	ld [%fp-20], %i0

	ret
	restore

.global x264_pixel_sad_16x8_vis
x264_pixel_sad_16x8_vis:
	save %sp, -120, %sp

	fzero %f12			! zero out the accumulator used for pdist

	sub %i1, 8, %i1			! reduce stride by 8, since we are moving forward 8 each block
	sub %i3, 8, %i3			! same here, reduce stride by 8

	alignaddr %i0, %g0, %l0	
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, 8, %i0
	add %i2, 8, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0	
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, %i1, %i0
        add %i2, %i3, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0	
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, 8, %i0
	add %i2, 8, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0	
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, %i1, %i0
        add %i2, %i3, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0	
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, 8, %i0
	add %i2, 8, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0	
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, %i1, %i0
        add %i2, %i3, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0	
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, 8, %i0
	add %i2, 8, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0	
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, %i1, %i0
        add %i2, %i3, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0	
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, 8, %i0
	add %i2, 8, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0	
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, %i1, %i0
        add %i2, %i3, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0	
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, 8, %i0
	add %i2, 8, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0	
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, %i1, %i0
        add %i2, %i3, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0	
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, 8, %i0
	add %i2, 8, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0	
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, %i1, %i0
        add %i2, %i3, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0	
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, 8, %i0
	add %i2, 8, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0	
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, %i1, %i0
        add %i2, %i3, %i2
	pdist %f4, %f10, %f12

	std %f12, [%fp-24]
	ld [%fp-20], %i0

	ret
	restore

.global x264_pixel_sad_16x16_vis
x264_pixel_sad_16x16_vis:
	save %sp, -120, %sp

	fzero %f12			! zero out the accumulator used for pdist

	sub %i1, 8, %i1			! reduce stride by 8, since we are moving forward 8 each block
	sub %i3, 8, %i3			! same here, reduce stride by 8

	alignaddr %i0, %g0, %l0	
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, 8, %i0
	add %i2, 8, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0	
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, %i1, %i0
        add %i2, %i3, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0	
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, 8, %i0
	add %i2, 8, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0	
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, %i1, %i0
        add %i2, %i3, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0	
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, 8, %i0
	add %i2, 8, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0	
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, %i1, %i0
        add %i2, %i3, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0	
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, 8, %i0
	add %i2, 8, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0	
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, %i1, %i0
        add %i2, %i3, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0	
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, 8, %i0
	add %i2, 8, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0	
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, %i1, %i0
        add %i2, %i3, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0	
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, 8, %i0
	add %i2, 8, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0	
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, %i1, %i0
        add %i2, %i3, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0	
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, 8, %i0
	add %i2, 8, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0	
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, %i1, %i0
        add %i2, %i3, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0	
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, 8, %i0
	add %i2, 8, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0	
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, %i1, %i0
        add %i2, %i3, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0	
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, 8, %i0
	add %i2, 8, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0	
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, %i1, %i0
        add %i2, %i3, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0	
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, 8, %i0
	add %i2, 8, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0	
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, %i1, %i0
        add %i2, %i3, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0	
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, 8, %i0
	add %i2, 8, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0	
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, %i1, %i0
        add %i2, %i3, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0	
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, 8, %i0
	add %i2, 8, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0	
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, %i1, %i0
        add %i2, %i3, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0	
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, 8, %i0
	add %i2, 8, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0	
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, %i1, %i0
	add %i2, %i3, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0	
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, 8, %i0
	add %i2, 8, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0	
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, %i1, %i0
	add %i2, %i3, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0	
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, 8, %i0
	add %i2, 8, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0	
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, %i1, %i0
	add %i2, %i3, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0	
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, 8, %i0
	add %i2, 8, %i2
	pdist %f4, %f10, %f12

	alignaddr %i0, %g0, %l0	
	ldd [%l0], %f0
	ldd [%l0+8], %f2
	faligndata %f0, %f2, %f4

	alignaddr %i2, %g0, %l2
	ldd [%l2], %f6
	ldd [%l2+8], %f8
	faligndata %f6, %f8, %f10

	add %i0, %i1, %i0
	add %i2, %i3, %i2
	pdist %f4, %f10, %f12

	std %f12, [%fp-24]
	ld [%fp-20], %i0

	ret
	restore
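! A hedged reading of the unrolled block above: each alignaddr/ldd/faligndata
! group fetches eight possibly misaligned bytes, and each pdist adds the sum
! of absolute byte differences of %f4 and %f10 into the %f12 accumulator.
! Assuming the prologue (above this excerpt) pre-adjusted the stride
! registers to account for the interleaved "+8 column" steps, the body is
! roughly equivalent to this C model (names illustrative):
!
!     uint32_t sad = 0;
!     for( int y = 0; y < height; y++, pix1 += stride1, pix2 += stride2 )
!         for( int x = 0; x < 16; x++ )
!             sad += abs( pix1[x] - pix2[x] );
!
! The epilogue spills the 64-bit accumulator to the stack and returns its
! low word (hence the [%fp-20] load on big-endian SPARC) in %i0.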
x264-snapshot-20120103-2245-stable/common/set.h
/*****************************************************************************
 * set.h: quantization init
 *****************************************************************************
 * Copyright (C) 2003-2011 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *          Laurent Aimar <fenrir@via.ecp.fr>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#ifndef X264_SET_H
#define X264_SET_H

enum profile_e
{
    PROFILE_BASELINE = 66,
    PROFILE_MAIN     = 77,
    PROFILE_EXTENDED = 88,
    PROFILE_HIGH    = 100,
    PROFILE_HIGH10  = 110,
    PROFILE_HIGH422 = 122,
    PROFILE_HIGH444_PREDICTIVE = 244,
};

enum chroma_format_e
{
    CHROMA_400 = 0,
    CHROMA_420 = 1,
    CHROMA_422 = 2,
    CHROMA_444 = 3,
};

enum cqm4_e
{
    CQM_4IY = 0,
    CQM_4PY = 1,
    CQM_4IC = 2,
    CQM_4PC = 3
};
enum cqm8_e
{
    CQM_8IY = 0,
    CQM_8PY = 1,
    CQM_8IC = 2,
    CQM_8PC = 3,
};

typedef struct
{
    int i_id;

    int i_profile_idc;
    int i_level_idc;

    int b_constraint_set0;
    int b_constraint_set1;
    int b_constraint_set2;
    int b_constraint_set3;

    int i_log2_max_frame_num;

    int i_poc_type;
    /* poc 0 */
    int i_log2_max_poc_lsb;

    int i_num_ref_frames;
    int b_gaps_in_frame_num_value_allowed;
    int i_mb_width;
    int i_mb_height;
    int b_frame_mbs_only;
    int b_mb_adaptive_frame_field;
    int b_direct8x8_inference;

    int b_crop;
    struct
    {
        int i_left;
        int i_right;
        int i_top;
        int i_bottom;
    } crop;

    int b_vui;
    struct
    {
        int b_aspect_ratio_info_present;
        int i_sar_width;
        int i_sar_height;

        int b_overscan_info_present;
        int b_overscan_info;

        int b_signal_type_present;
        int i_vidformat;
        int b_fullrange;
        int b_color_description_present;
        int i_colorprim;
        int i_transfer;
        int i_colmatrix;

        int b_chroma_loc_info_present;
        int i_chroma_loc_top;
        int i_chroma_loc_bottom;

        int b_timing_info_present;
        uint32_t i_num_units_in_tick;
        uint32_t i_time_scale;
        int b_fixed_frame_rate;

        int b_nal_hrd_parameters_present;
        int b_vcl_hrd_parameters_present;

        struct
        {
            int i_cpb_cnt;
            int i_bit_rate_scale;
            int i_cpb_size_scale;
            int i_bit_rate_value;
            int i_cpb_size_value;
            int i_bit_rate_unscaled;
            int i_cpb_size_unscaled;
            int b_cbr_hrd;

            int i_initial_cpb_removal_delay_length;
            int i_cpb_removal_delay_length;
            int i_dpb_output_delay_length;
            int i_time_offset_length;
        } hrd;

        int b_pic_struct_present;
        int b_bitstream_restriction;
        int b_motion_vectors_over_pic_boundaries;
        int i_max_bytes_per_pic_denom;
        int i_max_bits_per_mb_denom;
        int i_log2_max_mv_length_horizontal;
        int i_log2_max_mv_length_vertical;
        int i_num_reorder_frames;
        int i_max_dec_frame_buffering;

        /* FIXME to complete */
    } vui;

    int b_qpprime_y_zero_transform_bypass;
    int i_chroma_format_idc;

} x264_sps_t;

typedef struct
{
    int i_id;
    int i_sps_id;

    int b_cabac;

    int b_pic_order;
    int i_num_slice_groups;

    int i_num_ref_idx_l0_default_active;
    int i_num_ref_idx_l1_default_active;

    int b_weighted_pred;
    int b_weighted_bipred;

    int i_pic_init_qp;
    int i_pic_init_qs;

    int i_chroma_qp_index_offset;

    int b_deblocking_filter_control;
    int b_constrained_intra_pred;
    int b_redundant_pic_cnt;

    int b_transform_8x8_mode;

    int i_cqm_preset;
    const uint8_t *scaling_list[8]; /* could be 12, but we don't allow separate Cb/Cr lists */

} x264_pps_t;

/* default quant matrices */
static const uint8_t x264_cqm_jvt4i[16] =
{
      6,13,20,28,
     13,20,28,32,
     20,28,32,37,
     28,32,37,42
};
static const uint8_t x264_cqm_jvt4p[16] =
{
    10,14,20,24,
    14,20,24,27,
    20,24,27,30,
    24,27,30,34
};
static const uint8_t x264_cqm_jvt8i[64] =
{
     6,10,13,16,18,23,25,27,
    10,11,16,18,23,25,27,29,
    13,16,18,23,25,27,29,31,
    16,18,23,25,27,29,31,33,
    18,23,25,27,29,31,33,36,
    23,25,27,29,31,33,36,38,
    25,27,29,31,33,36,38,40,
    27,29,31,33,36,38,40,42
};
static const uint8_t x264_cqm_jvt8p[64] =
{
     9,13,15,17,19,21,22,24,
    13,13,17,19,21,22,24,25,
    15,17,19,21,22,24,25,27,
    17,19,21,22,24,25,27,28,
    19,21,22,24,25,27,28,30,
    21,22,24,25,27,28,30,32,
    22,24,25,27,28,30,32,33,
    24,25,27,28,30,32,33,35
};
static const uint8_t x264_cqm_flat16[64] =
{
    16,16,16,16,16,16,16,16,
    16,16,16,16,16,16,16,16,
    16,16,16,16,16,16,16,16,
    16,16,16,16,16,16,16,16,
    16,16,16,16,16,16,16,16,
    16,16,16,16,16,16,16,16,
    16,16,16,16,16,16,16,16,
    16,16,16,16,16,16,16,16
};
static const uint8_t * const x264_cqm_jvt[8] =
{
    x264_cqm_jvt4i, x264_cqm_jvt4p,
    x264_cqm_jvt4i, x264_cqm_jvt4p,
    x264_cqm_jvt8i, x264_cqm_jvt8p,
    x264_cqm_jvt8i, x264_cqm_jvt8p
};
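/* The table order mirrors cqm4_e/cqm8_e: entries 0-3 are the 4x4 lists and
 * entries 4-7 the 8x8 lists, in (intra luma, inter luma, intra chroma,
 * inter chroma) order; the JVT defaults reuse the luma matrices for chroma,
 * hence the repeated pointers. */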

int  x264_cqm_init( x264_t *h );
void x264_cqm_delete( x264_t *h );
int  x264_cqm_parse_file( x264_t *h, const char *filename );

#endif
x264-snapshot-20120103-2245-stable/common/rectangle.h
/*****************************************************************************
 * rectangle.h: rectangle filling
 *****************************************************************************
 * Copyright (C) 2003-2011 x264 project
 *
 * Authors: Jason Garrett-Glaser <darkshikari@gmail.com>
 *          Loren Merritt <lorenm@u.washington.edu>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

/* This function should only be called with constant w / h / s arguments! */
static ALWAYS_INLINE void x264_macroblock_cache_rect( void *dst, int w, int h, int s, uint32_t v )
{
    uint8_t *d = dst;
    uint16_t v2 = s == 2 ? v : v * 0x101;
    uint32_t v4 = s == 4 ? v : s == 2 ? v * 0x10001 : v * 0x1010101;
    uint64_t v8 = v4 + ((uint64_t)v4 << 32);
    s *= 8;

    if( w == 2 )
    {
        M16( d+s*0 ) = v2;
        if( h == 1 ) return;
        M16( d+s*1 ) = v2;
        if( h == 2 ) return;
        M16( d+s*2 ) = v2;
        M16( d+s*3 ) = v2;
    }
    else if( w == 4 )
    {
        M32( d+s*0 ) = v4;
        if( h == 1 ) return;
        M32( d+s*1 ) = v4;
        if( h == 2 ) return;
        M32( d+s*2 ) = v4;
        M32( d+s*3 ) = v4;
    }
    else if( w == 8 )
    {
        if( WORD_SIZE == 8 )
        {
            M64( d+s*0 ) = v8;
            if( h == 1 ) return;
            M64( d+s*1 ) = v8;
            if( h == 2 ) return;
            M64( d+s*2 ) = v8;
            M64( d+s*3 ) = v8;
        }
        else
        {
            M32( d+s*0+0 ) = v4;
            M32( d+s*0+4 ) = v4;
            if( h == 1 ) return;
            M32( d+s*1+0 ) = v4;
            M32( d+s*1+4 ) = v4;
            if( h == 2 ) return;
            M32( d+s*2+0 ) = v4;
            M32( d+s*2+4 ) = v4;
            M32( d+s*3+0 ) = v4;
            M32( d+s*3+4 ) = v4;
        }
    }
    else if( w == 16 )
    {
        /* height 1, width 16 doesn't occur */
        assert( h != 1 );
#if HAVE_VECTOREXT && defined(__SSE__)
        v4si v16 = {v,v,v,v};

        M128( d+s*0+0 ) = (__m128)v16;
        M128( d+s*1+0 ) = (__m128)v16;
        if( h == 2 ) return;
        M128( d+s*2+0 ) = (__m128)v16;
        M128( d+s*3+0 ) = (__m128)v16;
#else
        if( WORD_SIZE == 8 )
        {
            do
            {
                M64( d+s*0+0 ) = v8;
                M64( d+s*0+8 ) = v8;
                M64( d+s*1+0 ) = v8;
                M64( d+s*1+8 ) = v8;
                h -= 2;
                d += s*2;
            } while( h );
        }
        else
        {
            do
            {
                M32( d+ 0 ) = v4;
                M32( d+ 4 ) = v4;
                M32( d+ 8 ) = v4;
                M32( d+12 ) = v4;
                d += s;
            } while( --h );
        }
#endif
    }
    else
        assert(0);
}
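/* A worked example of the splat logic above (illustrative values): for the
 * ref cache (s == 1) with v = 0x03, v4 widens to 0x03 * 0x1010101 =
 * 0x03030303, so a w == 4 fill takes one M32 store per row instead of four
 * single-byte stores. */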

extern void (*x264_cache_mv_func_table[10])(void *, uint32_t);
extern void (*x264_cache_mvd_func_table[10])(void *, uint32_t);
extern void (*x264_cache_ref_func_table[10])(void *, uint32_t);
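/* These tables are indexed by width + (height<<1) - 3, which maps the legal
 * (w,h) pairs to (1,1)->0, (2,1)->1, (1,2)->2, (2,2)->3, (4,2)->5, (2,4)->7,
 * (4,4)->9; the unused slots remain NULL (see rectangle.c). */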

#define x264_macroblock_cache_mv_ptr( a, x, y, w, h, l, mv ) x264_macroblock_cache_mv( a, x, y, w, h, l, M32( mv ) )
static ALWAYS_INLINE void x264_macroblock_cache_mv( x264_t *h, int x, int y, int width, int height, int i_list, uint32_t mv )
{
    void *mv_cache = &h->mb.cache.mv[i_list][X264_SCAN8_0+x+8*y];
    if( x264_nonconstant_p( width ) || x264_nonconstant_p( height ) )
        x264_cache_mv_func_table[width + (height<<1)-3]( mv_cache, mv );
    else
        x264_macroblock_cache_rect( mv_cache, width*4, height, 4, mv );
}
static ALWAYS_INLINE void x264_macroblock_cache_mvd( x264_t *h, int x, int y, int width, int height, int i_list, uint16_t mvd )
{
    void *mvd_cache = &h->mb.cache.mvd[i_list][X264_SCAN8_0+x+8*y];
    if( x264_nonconstant_p( width ) || x264_nonconstant_p( height ) )
        x264_cache_mvd_func_table[width + (height<<1)-3]( mvd_cache, mvd );
    else
        x264_macroblock_cache_rect( mvd_cache, width*2, height, 2, mvd );
}
static ALWAYS_INLINE void x264_macroblock_cache_ref( x264_t *h, int x, int y, int width, int height, int i_list, uint8_t ref )
{
    void *ref_cache = &h->mb.cache.ref[i_list][X264_SCAN8_0+x+8*y];
    if( x264_nonconstant_p( width ) || x264_nonconstant_p( height ) )
        x264_cache_ref_func_table[width + (height<<1)-3]( ref_cache, ref );
    else
        x264_macroblock_cache_rect( ref_cache, width, height, 1, ref );
}
static ALWAYS_INLINE void x264_macroblock_cache_skip( x264_t *h, int x, int y, int width, int height, int b_skip )
{
    x264_macroblock_cache_rect( &h->mb.cache.skip[X264_SCAN8_0+x+8*y], width, height, 1, b_skip );
}
static ALWAYS_INLINE void x264_macroblock_cache_intra8x8_pred( x264_t *h, int x, int y, int i_mode )
{
    x264_macroblock_cache_rect( &h->mb.cache.intra4x4_pred_mode[X264_SCAN8_0+x+8*y], 2, 2, 1, i_mode );
}
x264-snapshot-20120103-2245-stable/common/rectangle.c
/*****************************************************************************
 * rectangle.c: rectangle filling
 *****************************************************************************
 * Copyright (C) 2010-2011 x264 project
 *
 * Authors: Jason Garrett-Glaser <darkshikari@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "common.h"

#define CACHE_FUNC(name,size,width,height)\
static void x264_macroblock_cache_##name##_##width##_##height( void *target, uint32_t val )\
{\
    x264_macroblock_cache_rect( target, width*size, height, size, val );\
}

#define CACHE_FUNCS(name,size)\
CACHE_FUNC(name,size,4,4)\
CACHE_FUNC(name,size,2,4)\
CACHE_FUNC(name,size,4,2)\
CACHE_FUNC(name,size,2,2)\
CACHE_FUNC(name,size,2,1)\
CACHE_FUNC(name,size,1,2)\
CACHE_FUNC(name,size,1,1)\
void (*x264_cache_##name##_func_table[10])(void *, uint32_t) =\
{\
    x264_macroblock_cache_##name##_1_1,\
    x264_macroblock_cache_##name##_2_1,\
    x264_macroblock_cache_##name##_1_2,\
    x264_macroblock_cache_##name##_2_2,\
    NULL,\
    x264_macroblock_cache_##name##_4_2,\
    NULL,\
    x264_macroblock_cache_##name##_2_4,\
    NULL,\
    x264_macroblock_cache_##name##_4_4\
};

CACHE_FUNCS(mv, 4)
CACHE_FUNCS(mvd, 2)
CACHE_FUNCS(ref, 1)
x264-snapshot-20120103-2245-stable/common/quant.h
/*****************************************************************************
 * quant.h: quantization and level-run
 *****************************************************************************
 * Copyright (C) 2005-2011 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *          Jason Garrett-Glaser <darkshikari@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#ifndef X264_QUANT_H
#define X264_QUANT_H

typedef struct
{
    int (*quant_8x8)( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] );
    int (*quant_4x4)( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] );
    int (*quant_4x4_dc)( dctcoef dct[16], int mf, int bias );
    int (*quant_2x2_dc)( dctcoef dct[4], int mf, int bias );

    void (*dequant_8x8)( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
    void (*dequant_4x4)( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
    void (*dequant_4x4_dc)( dctcoef dct[16], int dequant_mf[6][16], int i_qp );

    void (*idct_dequant_2x4_dc)( dctcoef dct[8], dctcoef dct4x4[8][16], int dequant_mf[6][16], int i_qp );
    void (*idct_dequant_2x4_dconly)( dctcoef dct[8], int dequant_mf[6][16], int i_qp );

    int (*optimize_chroma_2x2_dc)( dctcoef dct[4], int dequant_mf );
    int (*optimize_chroma_2x4_dc)( dctcoef dct[8], int dequant_mf );

    void (*denoise_dct)( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );

    int (*decimate_score15)( dctcoef *dct );
    int (*decimate_score16)( dctcoef *dct );
    int (*decimate_score64)( dctcoef *dct );
    int (*coeff_last[14])( dctcoef *dct );
    int (*coeff_last4)( dctcoef *dct );
    int (*coeff_last8)( dctcoef *dct );
    int (*coeff_level_run[13])( dctcoef *dct, x264_run_level_t *runlevel );
    int (*coeff_level_run4)( dctcoef *dct, x264_run_level_t *runlevel );
    int (*coeff_level_run8)( dctcoef *dct, x264_run_level_t *runlevel );
} x264_quant_function_t;

void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf );

#endif
x264-snapshot-20120103-2245-stable/common/quant.c
/*****************************************************************************
 * quant.c: quantization and level-run
 *****************************************************************************
 * Copyright (C) 2005-2011 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *          Jason Garrett-Glaser <darkshikari@gmail.com>
 *          Christian Heine <sennindemokrit@gmx.net>
 *          Henrik Gramner <hengar-6@student.ltu.se>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "common.h"

#if HAVE_MMX
#include "x86/quant.h"
#endif
#if ARCH_PPC
#   include "ppc/quant.h"
#endif
#if ARCH_ARM
#   include "arm/quant.h"
#endif

#define QUANT_ONE( coef, mf, f ) \
{ \
    if( (coef) > 0 ) \
        (coef) = (f + (coef)) * (mf) >> 16; \
    else \
        (coef) = - ((f - (coef)) * (mf) >> 16); \
    nz |= (coef); \
}
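/* i.e. each coefficient becomes sign(coef) * (((|coef| + f) * mf) >> 16).
 * Illustrative numbers: with mf = 0x2000 (1/8 in Q16 fixed point) and
 * f = 2, a coefficient of 30 quantizes to ((30 + 2) * 0x2000) >> 16 = 4. */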

static int quant_8x8( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] )
{
    int nz = 0;
    for( int i = 0; i < 64; i++ )
        QUANT_ONE( dct[i], mf[i], bias[i] );
    return !!nz;
}

static int quant_4x4( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] )
{
    int nz = 0;
    for( int i = 0; i < 16; i++ )
        QUANT_ONE( dct[i], mf[i], bias[i] );
    return !!nz;
}

static int quant_4x4_dc( dctcoef dct[16], int mf, int bias )
{
    int nz = 0;
    for( int i = 0; i < 16; i++ )
        QUANT_ONE( dct[i], mf, bias );
    return !!nz;
}

static int quant_2x2_dc( dctcoef dct[4], int mf, int bias )
{
    int nz = 0;
    QUANT_ONE( dct[0], mf, bias );
    QUANT_ONE( dct[1], mf, bias );
    QUANT_ONE( dct[2], mf, bias );
    QUANT_ONE( dct[3], mf, bias );
    return !!nz;
}

#define DEQUANT_SHL( x ) \
    dct[x] = ( dct[x] * dequant_mf[i_mf][x] ) << i_qbits

#define DEQUANT_SHR( x ) \
    dct[x] = ( dct[x] * dequant_mf[i_mf][x] + f ) >> (-i_qbits)
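
/* Both variants compute dct[x] * dequant_mf[i_mf][x] * 2^i_qbits, where the
 * caller sets i_qbits (qp/6 - 4 for 4x4, qp/6 - 6 for 8x8 and DC): SHL is a
 * plain left shift, while SHR rounds with f = 2^(-i_qbits-1) before the
 * right shift. */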

static void dequant_4x4( dctcoef dct[16], int dequant_mf[6][16], int i_qp )
{
    const int i_mf = i_qp%6;
    const int i_qbits = i_qp/6 - 4;

    if( i_qbits >= 0 )
    {
        for( int i = 0; i < 16; i++ )
            DEQUANT_SHL( i );
    }
    else
    {
        const int f = 1 << (-i_qbits-1);
        for( int i = 0; i < 16; i++ )
            DEQUANT_SHR( i );
    }
}

static void dequant_8x8( dctcoef dct[64], int dequant_mf[6][64], int i_qp )
{
    const int i_mf = i_qp%6;
    const int i_qbits = i_qp/6 - 6;

    if( i_qbits >= 0 )
    {
        for( int i = 0; i < 64; i++ )
            DEQUANT_SHL( i );
    }
    else
    {
        const int f = 1 << (-i_qbits-1);
        for( int i = 0; i < 64; i++ )
            DEQUANT_SHR( i );
    }
}

static void dequant_4x4_dc( dctcoef dct[16], int dequant_mf[6][16], int i_qp )
{
    const int i_qbits = i_qp/6 - 6;

    if( i_qbits >= 0 )
    {
        const int i_dmf = dequant_mf[i_qp%6][0] << i_qbits;
        for( int i = 0; i < 16; i++ )
            dct[i] *= i_dmf;
    }
    else
    {
        const int i_dmf = dequant_mf[i_qp%6][0];
        const int f = 1 << (-i_qbits-1);
        for( int i = 0; i < 16; i++ )
            dct[i] = ( dct[i] * i_dmf + f ) >> (-i_qbits);
    }
}

#define IDCT_DEQUANT_2X4_START \
    int a0 = dct[0] + dct[1]; \
    int a1 = dct[2] + dct[3]; \
    int a2 = dct[4] + dct[5]; \
    int a3 = dct[6] + dct[7]; \
    int a4 = dct[0] - dct[1]; \
    int a5 = dct[2] - dct[3]; \
    int a6 = dct[4] - dct[5]; \
    int a7 = dct[6] - dct[7]; \
    int b0 = a0 + a1; \
    int b1 = a2 + a3; \
    int b2 = a4 + a5; \
    int b3 = a6 + a7; \
    int b4 = a0 - a1; \
    int b5 = a2 - a3; \
    int b6 = a4 - a5; \
    int b7 = a6 - a7;
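
/* The a/b temporaries form a Hadamard-style butterfly over the eight chroma
 * DC values (sums and differences of neighbouring pairs, then of the pair
 * results), i.e. the inverse 2x4 DC transform used for 4:2:2 chroma. */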

static void idct_dequant_2x4_dc( dctcoef dct[8], dctcoef dct4x4[8][16], int dequant_mf[6][16], int i_qp )
{
    IDCT_DEQUANT_2X4_START
    int dmf = dequant_mf[i_qp%6][0] << i_qp/6;
    dct4x4[0][0] = ((b0 + b1) * dmf + 32) >> 6;
    dct4x4[1][0] = ((b2 + b3) * dmf + 32) >> 6;
    dct4x4[2][0] = ((b0 - b1) * dmf + 32) >> 6;
    dct4x4[3][0] = ((b2 - b3) * dmf + 32) >> 6;
    dct4x4[4][0] = ((b4 - b5) * dmf + 32) >> 6;
    dct4x4[5][0] = ((b6 - b7) * dmf + 32) >> 6;
    dct4x4[6][0] = ((b4 + b5) * dmf + 32) >> 6;
    dct4x4[7][0] = ((b6 + b7) * dmf + 32) >> 6;
}

static void idct_dequant_2x4_dconly( dctcoef dct[8], int dequant_mf[6][16], int i_qp )
{
    IDCT_DEQUANT_2X4_START
    int dmf = dequant_mf[i_qp%6][0] << i_qp/6;
    dct[0] = ((b0 + b1) * dmf + 32) >> 6;
    dct[1] = ((b2 + b3) * dmf + 32) >> 6;
    dct[2] = ((b0 - b1) * dmf + 32) >> 6;
    dct[3] = ((b2 - b3) * dmf + 32) >> 6;
    dct[4] = ((b4 - b5) * dmf + 32) >> 6;
    dct[5] = ((b6 - b7) * dmf + 32) >> 6;
    dct[6] = ((b4 + b5) * dmf + 32) >> 6;
    dct[7] = ((b6 + b7) * dmf + 32) >> 6;
}

static ALWAYS_INLINE void optimize_chroma_idct_dequant_2x4( dctcoef out[8], dctcoef dct[8], int dmf )
{
    IDCT_DEQUANT_2X4_START
    out[0] = ((b0 + b1) * dmf + 2080) >> 6; /* 2080 = 32 + (32<<6) */
    out[1] = ((b2 + b3) * dmf + 2080) >> 6;
    out[2] = ((b0 - b1) * dmf + 2080) >> 6;
    out[3] = ((b2 - b3) * dmf + 2080) >> 6;
    out[4] = ((b4 - b5) * dmf + 2080) >> 6;
    out[5] = ((b6 - b7) * dmf + 2080) >> 6;
    out[6] = ((b4 + b5) * dmf + 2080) >> 6;
    out[7] = ((b6 + b7) * dmf + 2080) >> 6;
}
#undef IDCT_DEQUANT_2X4_START

static ALWAYS_INLINE void optimize_chroma_idct_dequant_2x2( dctcoef out[4], dctcoef dct[4], int dmf )
{
    int d0 = dct[0] + dct[1];
    int d1 = dct[2] + dct[3];
    int d2 = dct[0] - dct[1];
    int d3 = dct[2] - dct[3];
    out[0] = ((d0 + d1) * dmf >> 5) + 32;
    out[1] = ((d0 - d1) * dmf >> 5) + 32;
    out[2] = ((d2 + d3) * dmf >> 5) + 32;
    out[3] = ((d2 - d3) * dmf >> 5) + 32;
}

static ALWAYS_INLINE int optimize_chroma_round( dctcoef *ref, dctcoef *dct, int dequant_mf, int chroma422 )
{
    dctcoef out[8];

    if( chroma422 )
        optimize_chroma_idct_dequant_2x4( out, dct, dequant_mf );
    else
        optimize_chroma_idct_dequant_2x2( out, dct, dequant_mf );

    int sum = 0;
    for( int i = 0; i < (chroma422?8:4); i++ )
        sum |= ref[i] ^ out[i];
    return sum >> 6;
}

static ALWAYS_INLINE int optimize_chroma_dc_internal( dctcoef *dct, int dequant_mf, int chroma422 )
{
    /* dequant_mf = h->dequant4_mf[CQM_4IC + b_inter][i_qp%6][0] << i_qp/6, max 32*64 */
    dctcoef dct_orig[8];
    int coeff, nz;

    if( chroma422 )
        optimize_chroma_idct_dequant_2x4( dct_orig, dct, dequant_mf );
    else
        optimize_chroma_idct_dequant_2x2( dct_orig, dct, dequant_mf );

    /* If the DC coefficients already round to zero, terminate early. */
    int sum = 0;
    for( int i = 0; i < (chroma422?8:4); i++ )
        sum |= dct_orig[i];
    if( !(sum >> 6) )
        return 0;

    /* Start with the highest frequency coefficient... is this the best option? */
    for( nz = 0, coeff = (chroma422?7:3); coeff >= 0; coeff-- )
    {
        int level = dct[coeff];
        int sign = level>>31 | 1; /* dct[coeff] < 0 ? -1 : 1 */

        while( level )
        {
            dct[coeff] = level - sign;
            if( optimize_chroma_round( dct_orig, dct, dequant_mf, chroma422 ) )
            {
                nz = 1;
                dct[coeff] = level;
                break;
            }
            level -= sign;
        }
    }

    return nz;
}

static int optimize_chroma_2x2_dc( dctcoef dct[4], int dequant_mf )
{
    return optimize_chroma_dc_internal( dct, dequant_mf, 0 );
}

static int optimize_chroma_2x4_dc( dctcoef dct[8], int dequant_mf )
{
    return optimize_chroma_dc_internal( dct, dequant_mf, 1 );
}

static void x264_denoise_dct( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size )
{
    for( int i = 0; i < size; i++ )
    {
        int level = dct[i];
        int sign = level>>31;
        level = (level+sign)^sign;
        sum[i] += level;
        level -= offset[i];
        dct[i] = level<0 ? 0 : (level^sign)-sign;
    }
}
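/* This is soft-thresholding: sum[i] accumulates each coefficient's
 * pre-threshold magnitude (used to adapt the offsets over time), then the
 * magnitude is reduced by offset[i], clamped at zero, and the sign is
 * restored. */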

/* (ref: JVT-B118)
 * x264_mb_decimate_score: given DCT coefficients, returns a score estimating
 * whether the block could be zeroed out entirely (a low score means it is
 * cheap to null it).
 * Used for inter macroblocks (luma and chroma):
 *  luma:   for an 8x8 block:    if score < 4 -> null
 *          for the complete MB: if score < 6 -> null
 *  chroma: for the complete MB: if score < 7 -> null
 */

const uint8_t x264_decimate_table4[16] =
{
    3,2,2,1,1,1,0,0,0,0,0,0,0,0,0,0
};
const uint8_t x264_decimate_table8[64] =
{
    3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,
    1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
};
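
/* Worked example (illustrative): for a 4x4 block whose only nonzero
 * coefficients are dct[0] = 1 and dct[2] = -1, the 13 trailing zeros are
 * skipped, |-1| <= 1 preceded by one zero scores ds_table[1] = 2, and the
 * final |1| <= 1 with no zero run scores ds_table[0] = 3, giving 5; any
 * coefficient with |level| > 1 returns 9 immediately. */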

static int ALWAYS_INLINE x264_decimate_score_internal( dctcoef *dct, int i_max )
{
    const uint8_t *ds_table = (i_max == 64) ? x264_decimate_table8 : x264_decimate_table4;
    int i_score = 0;
    int idx = i_max - 1;

    while( idx >= 0 && dct[idx] == 0 )
        idx--;
    while( idx >= 0 )
    {
        int i_run;

        if( (unsigned)(dct[idx--] + 1) > 2 )
            return 9;

        i_run = 0;
        while( idx >= 0 && dct[idx] == 0 )
        {
            idx--;
            i_run++;
        }
        i_score += ds_table[i_run];
    }

    return i_score;
}

static int x264_decimate_score15( dctcoef *dct )
{
    return x264_decimate_score_internal( dct+1, 15 );
}
static int x264_decimate_score16( dctcoef *dct )
{
    return x264_decimate_score_internal( dct, 16 );
}
static int x264_decimate_score64( dctcoef *dct )
{
    return x264_decimate_score_internal( dct, 64 );
}

#define last(num)\
static int x264_coeff_last##num( dctcoef *l )\
{\
    int i_last = num-1;\
    while( i_last >= 0 && l[i_last] == 0 )\
        i_last--;\
    return i_last;\
}

last(4)
last(8)
last(15)
last(16)
last(64)

#define level_run(num)\
static int x264_coeff_level_run##num( dctcoef *dct, x264_run_level_t *runlevel )\
{\
    int i_last = runlevel->last = x264_coeff_last##num(dct);\
    int i_total = 0;\
    do\
    {\
        int r = 0;\
        runlevel->level[i_total] = dct[i_last];\
        while( --i_last >= 0 && dct[i_last] == 0 )\
            r++;\
        runlevel->run[i_total++] = r;\
    } while( i_last >= 0 );\
    return i_total;\
}

level_run(4)
level_run(8)
level_run(15)
level_run(16)
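
/* Worked example (illustrative): for dct = { 2, 0, 0, -1 },
 * x264_coeff_level_run4 sets runlevel->last = 3, then records
 * level[0] = -1 with run[0] = 2 (the two zeros below it) and level[1] = 2
 * with run[1] = 0, returning 2 coded coefficients. */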

void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
{
    pf->quant_8x8 = quant_8x8;
    pf->quant_4x4 = quant_4x4;
    pf->quant_4x4_dc = quant_4x4_dc;
    pf->quant_2x2_dc = quant_2x2_dc;

    pf->dequant_4x4 = dequant_4x4;
    pf->dequant_4x4_dc = dequant_4x4_dc;
    pf->dequant_8x8 = dequant_8x8;

    pf->idct_dequant_2x4_dc = idct_dequant_2x4_dc;
    pf->idct_dequant_2x4_dconly = idct_dequant_2x4_dconly;

    pf->optimize_chroma_2x2_dc = optimize_chroma_2x2_dc;
    pf->optimize_chroma_2x4_dc = optimize_chroma_2x4_dc;

    pf->denoise_dct = x264_denoise_dct;
    pf->decimate_score15 = x264_decimate_score15;
    pf->decimate_score16 = x264_decimate_score16;
    pf->decimate_score64 = x264_decimate_score64;

    pf->coeff_last4 = x264_coeff_last4;
    pf->coeff_last8 = x264_coeff_last8;
    pf->coeff_last[  DCT_LUMA_AC] = x264_coeff_last15;
    pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16;
    pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64;
    pf->coeff_level_run4 = x264_coeff_level_run4;
    pf->coeff_level_run8 = x264_coeff_level_run8;
    pf->coeff_level_run[  DCT_LUMA_AC] = x264_coeff_level_run15;
    pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16;

#if HIGH_BIT_DEPTH
#if HAVE_MMX
    if( cpu&X264_CPU_MMX2 )
    {
#if ARCH_X86
        pf->denoise_dct = x264_denoise_dct_mmx;
        pf->decimate_score15 = x264_decimate_score15_mmx2;
        pf->decimate_score16 = x264_decimate_score16_mmx2;
        if( cpu&X264_CPU_SLOW_CTZ )
        {
            pf->decimate_score15 = x264_decimate_score15_mmx2_slowctz;
            pf->decimate_score16 = x264_decimate_score16_mmx2_slowctz;
        }
        pf->decimate_score64 = x264_decimate_score64_mmx2;
        pf->coeff_last8 = x264_coeff_last8_mmx2;
        pf->coeff_last[  DCT_LUMA_AC] = x264_coeff_last15_mmx2;
        pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmx2;
        pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64_mmx2;
        pf->coeff_level_run8 = x264_coeff_level_run8_mmx2;
        pf->coeff_level_run[  DCT_LUMA_AC] = x264_coeff_level_run15_mmx2;
        pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_mmx2;
#endif
        pf->coeff_last4 = x264_coeff_last4_mmx2;
        pf->coeff_level_run4 = x264_coeff_level_run4_mmx2;
        if( cpu&X264_CPU_LZCNT )
            pf->coeff_level_run4 = x264_coeff_level_run4_mmx2_lzcnt;
    }
    if( cpu&X264_CPU_SSE2 )
    {
        pf->quant_4x4 = x264_quant_4x4_sse2;
        pf->quant_8x8 = x264_quant_8x8_sse2;
        pf->quant_2x2_dc = x264_quant_2x2_dc_sse2;
        pf->quant_4x4_dc = x264_quant_4x4_dc_sse2;
        pf->dequant_4x4 = x264_dequant_4x4_sse2;
        pf->dequant_8x8 = x264_dequant_8x8_sse2;
        pf->dequant_4x4_dc = x264_dequant_4x4dc_sse2;
        pf->denoise_dct = x264_denoise_dct_sse2;
        pf->decimate_score15 = x264_decimate_score15_sse2;
        pf->decimate_score16 = x264_decimate_score16_sse2;
        pf->decimate_score64 = x264_decimate_score64_sse2;
        if( cpu&X264_CPU_SLOW_CTZ )
        {
            pf->decimate_score15 = x264_decimate_score15_sse2_slowctz;
            pf->decimate_score16 = x264_decimate_score16_sse2_slowctz;
        }
        pf->coeff_last8 = x264_coeff_last8_sse2;
        pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2;
        pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2;
        pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2;
        pf->coeff_level_run8 = x264_coeff_level_run8_sse2;
        pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2;
        pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2;
        if( cpu&X264_CPU_LZCNT )
        {
            pf->coeff_last4 = x264_coeff_last4_mmx2_lzcnt;
            pf->coeff_last8 = x264_coeff_last8_sse2_lzcnt;
            pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2_lzcnt;
            pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2_lzcnt;
            pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2_lzcnt;
            pf->coeff_level_run8 = x264_coeff_level_run8_sse2_lzcnt;
            pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2_lzcnt;
            pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2_lzcnt;
        }
    }
    if( cpu&X264_CPU_SSSE3 )
    {
        pf->quant_4x4 = x264_quant_4x4_ssse3;
        pf->quant_8x8 = x264_quant_8x8_ssse3;
        pf->quant_2x2_dc = x264_quant_2x2_dc_ssse3;
        pf->quant_4x4_dc = x264_quant_4x4_dc_ssse3;
        pf->denoise_dct = x264_denoise_dct_ssse3;
        pf->decimate_score15 = x264_decimate_score15_ssse3;
        pf->decimate_score16 = x264_decimate_score16_ssse3;
        if( cpu&X264_CPU_SLOW_CTZ )
        {
            pf->decimate_score15 = x264_decimate_score15_ssse3_slowctz;
            pf->decimate_score16 = x264_decimate_score16_ssse3_slowctz;
        }
        pf->decimate_score64 = x264_decimate_score64_ssse3;
    }
    if( cpu&X264_CPU_SSE4 )
    {
        pf->quant_2x2_dc = x264_quant_2x2_dc_sse4;
        pf->quant_4x4_dc = x264_quant_4x4_dc_sse4;
        pf->quant_4x4 = x264_quant_4x4_sse4;
        pf->quant_8x8 = x264_quant_8x8_sse4;
    }
    if( cpu&X264_CPU_AVX )
    {
        pf->denoise_dct = x264_denoise_dct_avx;
    }
    if( cpu&X264_CPU_XOP )
    {
        pf->dequant_4x4_dc = x264_dequant_4x4dc_xop;
        if( h->param.i_cqm_preset != X264_CQM_FLAT )
        {
            pf->dequant_4x4 = x264_dequant_4x4_xop;
            pf->dequant_8x8 = x264_dequant_8x8_xop;
        }
    }
#endif // HAVE_MMX
#else // !HIGH_BIT_DEPTH
#if HAVE_MMX
    if( cpu&X264_CPU_MMX )
    {
#if ARCH_X86
        pf->quant_4x4 = x264_quant_4x4_mmx;
        pf->quant_8x8 = x264_quant_8x8_mmx;
        pf->dequant_4x4 = x264_dequant_4x4_mmx;
        pf->dequant_4x4_dc = x264_dequant_4x4dc_mmx2;
        pf->dequant_8x8 = x264_dequant_8x8_mmx;
        if( h->param.i_cqm_preset == X264_CQM_FLAT )
        {
            pf->dequant_4x4 = x264_dequant_4x4_flat16_mmx;
            pf->dequant_8x8 = x264_dequant_8x8_flat16_mmx;
        }
        pf->denoise_dct = x264_denoise_dct_mmx;
#endif
    }

    if( cpu&X264_CPU_MMX2 )
    {
        pf->quant_2x2_dc = x264_quant_2x2_dc_mmx2;
#if ARCH_X86
        pf->quant_4x4_dc = x264_quant_4x4_dc_mmx2;
        pf->decimate_score15 = x264_decimate_score15_mmx2;
        pf->decimate_score16 = x264_decimate_score16_mmx2;
        if( cpu&X264_CPU_SLOW_CTZ )
        {
            pf->decimate_score15 = x264_decimate_score15_mmx2_slowctz;
            pf->decimate_score16 = x264_decimate_score16_mmx2_slowctz;
        }
        pf->decimate_score64 = x264_decimate_score64_mmx2;
        pf->coeff_last[  DCT_LUMA_AC] = x264_coeff_last15_mmx2;
        pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmx2;
        pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64_mmx2;
        pf->coeff_level_run[  DCT_LUMA_AC] = x264_coeff_level_run15_mmx2;
        pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_mmx2;
#endif
        pf->coeff_last4 = x264_coeff_last4_mmx2;
        pf->coeff_last8 = x264_coeff_last8_mmx2;
        pf->coeff_level_run4 = x264_coeff_level_run4_mmx2;
        pf->coeff_level_run8 = x264_coeff_level_run8_mmx2;
        if( cpu&X264_CPU_LZCNT )
        {
            pf->coeff_last4 = x264_coeff_last4_mmx2_lzcnt;
            pf->coeff_last8 = x264_coeff_last8_mmx2_lzcnt;
            pf->coeff_level_run4 = x264_coeff_level_run4_mmx2_lzcnt;
            pf->coeff_level_run8 = x264_coeff_level_run8_mmx2_lzcnt;
        }
    }

    if( cpu&X264_CPU_SSE2 )
    {
        pf->quant_4x4_dc = x264_quant_4x4_dc_sse2;
        pf->quant_4x4 = x264_quant_4x4_sse2;
        pf->quant_8x8 = x264_quant_8x8_sse2;
        pf->dequant_4x4 = x264_dequant_4x4_sse2;
        pf->dequant_4x4_dc = x264_dequant_4x4dc_sse2;
        pf->dequant_8x8 = x264_dequant_8x8_sse2;
        if( h->param.i_cqm_preset == X264_CQM_FLAT )
        {
            pf->dequant_4x4 = x264_dequant_4x4_flat16_sse2;
            pf->dequant_8x8 = x264_dequant_8x8_flat16_sse2;
        }
        pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_sse2;
        pf->denoise_dct = x264_denoise_dct_sse2;
        pf->decimate_score15 = x264_decimate_score15_sse2;
        pf->decimate_score16 = x264_decimate_score16_sse2;
        pf->decimate_score64 = x264_decimate_score64_sse2;
        if( cpu&X264_CPU_SLOW_CTZ )
        {
            pf->decimate_score15 = x264_decimate_score15_sse2_slowctz;
            pf->decimate_score16 = x264_decimate_score16_sse2_slowctz;
        }
        pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2;
        pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2;
        pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2;
        pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2;
        pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2;
        if( cpu&X264_CPU_LZCNT )
        {
            pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2_lzcnt;
            pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2_lzcnt;
            pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2_lzcnt;
            pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2_lzcnt;
            pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2_lzcnt;
        }
    }

    if( cpu&X264_CPU_SSSE3 )
    {
        pf->quant_2x2_dc = x264_quant_2x2_dc_ssse3;
        pf->quant_4x4_dc = x264_quant_4x4_dc_ssse3;
        pf->quant_4x4 = x264_quant_4x4_ssse3;
        pf->quant_8x8 = x264_quant_8x8_ssse3;
        pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_ssse3;
        pf->denoise_dct = x264_denoise_dct_ssse3;
        pf->decimate_score15 = x264_decimate_score15_ssse3;
        pf->decimate_score16 = x264_decimate_score16_ssse3;
        if( cpu&X264_CPU_SLOW_CTZ )
        {
            pf->decimate_score15 = x264_decimate_score15_ssse3_slowctz;
            pf->decimate_score16 = x264_decimate_score16_ssse3_slowctz;
        }
        pf->decimate_score64 = x264_decimate_score64_ssse3;
    }

    if( cpu&X264_CPU_SSE4 )
    {
        pf->quant_4x4_dc = x264_quant_4x4_dc_sse4;
        pf->quant_4x4 = x264_quant_4x4_sse4;
        pf->quant_8x8 = x264_quant_8x8_sse4;
        pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_sse4;
    }

    if( cpu&X264_CPU_AVX )
    {
        pf->dequant_4x4_dc = x264_dequant_4x4dc_avx;
        if( h->param.i_cqm_preset != X264_CQM_FLAT )
        {
            pf->dequant_4x4 = x264_dequant_4x4_avx;
            pf->dequant_8x8 = x264_dequant_8x8_avx;
        }
        pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_avx;
        pf->denoise_dct = x264_denoise_dct_avx;
    }

    if( cpu&X264_CPU_XOP )
    {
        if( h->param.i_cqm_preset != X264_CQM_FLAT )
        {
            pf->dequant_4x4 = x264_dequant_4x4_xop;
            pf->dequant_8x8 = x264_dequant_8x8_xop;
        }
    }
#endif // HAVE_MMX

#if HAVE_ALTIVEC
    if( cpu&X264_CPU_ALTIVEC )
    {
        pf->quant_2x2_dc = x264_quant_2x2_dc_altivec;
        pf->quant_4x4_dc = x264_quant_4x4_dc_altivec;
        pf->quant_4x4 = x264_quant_4x4_altivec;
        pf->quant_8x8 = x264_quant_8x8_altivec;

        pf->dequant_4x4 = x264_dequant_4x4_altivec;
        pf->dequant_8x8 = x264_dequant_8x8_altivec;
    }
#endif

#if HAVE_ARMV6
    if( cpu&X264_CPU_ARMV6 )
        pf->coeff_last4 = x264_coeff_last4_arm;

    if( cpu&X264_CPU_NEON )
    {
        pf->quant_2x2_dc   = x264_quant_2x2_dc_neon;
        pf->quant_4x4      = x264_quant_4x4_neon;
        pf->quant_4x4_dc   = x264_quant_4x4_dc_neon;
        pf->quant_8x8      = x264_quant_8x8_neon;
        pf->dequant_4x4    = x264_dequant_4x4_neon;
        pf->dequant_4x4_dc = x264_dequant_4x4_dc_neon;
        pf->dequant_8x8    = x264_dequant_8x8_neon;
        pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_neon;
        pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_neon;
        pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_neon;
    }
#endif
#endif // HIGH_BIT_DEPTH
    pf->coeff_last[DCT_LUMA_DC]     = pf->coeff_last[DCT_CHROMAU_DC]  = pf->coeff_last[DCT_CHROMAV_DC] =
    pf->coeff_last[DCT_CHROMAU_4x4] = pf->coeff_last[DCT_CHROMAV_4x4] = pf->coeff_last[DCT_LUMA_4x4];
    pf->coeff_last[DCT_CHROMA_AC]   = pf->coeff_last[DCT_CHROMAU_AC]  =
    pf->coeff_last[DCT_CHROMAV_AC]  = pf->coeff_last[DCT_LUMA_AC];
    pf->coeff_last[DCT_CHROMAU_8x8] = pf->coeff_last[DCT_CHROMAV_8x8] = pf->coeff_last[DCT_LUMA_8x8];

    pf->coeff_level_run[DCT_LUMA_DC]     = pf->coeff_level_run[DCT_CHROMAU_DC]  = pf->coeff_level_run[DCT_CHROMAV_DC] =
    pf->coeff_level_run[DCT_CHROMAU_4x4] = pf->coeff_level_run[DCT_CHROMAV_4x4] = pf->coeff_level_run[DCT_LUMA_4x4];
    pf->coeff_level_run[DCT_CHROMA_AC]   = pf->coeff_level_run[DCT_CHROMAU_AC]  =
    pf->coeff_level_run[DCT_CHROMAV_AC]  = pf->coeff_level_run[DCT_LUMA_AC];
}
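
/* A minimal caller-side sketch (hypothetical names, for illustration only):
 * the encoder fills the function table once at init and dispatches through
 * it afterwards, e.g.
 *
 *     x264_quant_function_t qf;
 *     x264_quant_init( h, h->param.cpu, &qf );
 *     int nz = qf.quant_4x4( dct4x4, h->quant4_mf[CQM_4IY][i_qp],
 *                            h->quant4_bias[CQM_4IY][i_qp] );
 */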
x264-snapshot-20120103-2245-stable/common/predict.h
/*****************************************************************************
 * predict.h: intra prediction
 *****************************************************************************
 * Copyright (C) 2003-2011 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *          Laurent Aimar <fenrir@via.ecp.fr>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#ifndef X264_PREDICT_H
#define X264_PREDICT_H

typedef void (*x264_predict_t)( pixel *src );
typedef void (*x264_predict8x8_t)( pixel *src, pixel edge[36] );
typedef void (*x264_predict_8x8_filter_t) ( pixel *src, pixel edge[36], int i_neighbor, int i_filters );

enum intra_chroma_pred_e
{
    I_PRED_CHROMA_DC = 0,
    I_PRED_CHROMA_H  = 1,
    I_PRED_CHROMA_V  = 2,
    I_PRED_CHROMA_P  = 3,

    I_PRED_CHROMA_DC_LEFT = 4,
    I_PRED_CHROMA_DC_TOP  = 5,
    I_PRED_CHROMA_DC_128  = 6
};
static const uint8_t x264_mb_chroma_pred_mode_fix[7] =
{
    I_PRED_CHROMA_DC, I_PRED_CHROMA_H, I_PRED_CHROMA_V, I_PRED_CHROMA_P,
    I_PRED_CHROMA_DC, I_PRED_CHROMA_DC, I_PRED_CHROMA_DC
};
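/* This and the other *_fix tables below fold the internal DC_LEFT/DC_TOP/
 * DC_128 edge variants back onto the plain DC mode, since only the first
 * four chroma prediction modes can be signalled in the bitstream. */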

enum intra16x16_pred_e
{
    I_PRED_16x16_V  = 0,
    I_PRED_16x16_H  = 1,
    I_PRED_16x16_DC = 2,
    I_PRED_16x16_P  = 3,

    I_PRED_16x16_DC_LEFT = 4,
    I_PRED_16x16_DC_TOP  = 5,
    I_PRED_16x16_DC_128  = 6,
};
static const uint8_t x264_mb_pred_mode16x16_fix[7] =
{
    I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, I_PRED_16x16_P,
    I_PRED_16x16_DC, I_PRED_16x16_DC, I_PRED_16x16_DC
};

enum intra4x4_pred_e
{
    I_PRED_4x4_V  = 0,
    I_PRED_4x4_H  = 1,
    I_PRED_4x4_DC = 2,
    I_PRED_4x4_DDL= 3,
    I_PRED_4x4_DDR= 4,
    I_PRED_4x4_VR = 5,
    I_PRED_4x4_HD = 6,
    I_PRED_4x4_VL = 7,
    I_PRED_4x4_HU = 8,

    I_PRED_4x4_DC_LEFT = 9,
    I_PRED_4x4_DC_TOP  = 10,
    I_PRED_4x4_DC_128  = 11,
};
static const int8_t x264_mb_pred_mode4x4_fix[13] =
{
    -1,
    I_PRED_4x4_V,   I_PRED_4x4_H,   I_PRED_4x4_DC,
    I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR,
    I_PRED_4x4_HD,  I_PRED_4x4_VL,  I_PRED_4x4_HU,
    I_PRED_4x4_DC,  I_PRED_4x4_DC,  I_PRED_4x4_DC
};
#define x264_mb_pred_mode4x4_fix(t) x264_mb_pred_mode4x4_fix[(t)+1]

/* must use the same numbering as intra4x4_pred_e */
enum intra8x8_pred_e
{
    I_PRED_8x8_V  = 0,
    I_PRED_8x8_H  = 1,
    I_PRED_8x8_DC = 2,
    I_PRED_8x8_DDL= 3,
    I_PRED_8x8_DDR= 4,
    I_PRED_8x8_VR = 5,
    I_PRED_8x8_HD = 6,
    I_PRED_8x8_VL = 7,
    I_PRED_8x8_HU = 8,

    I_PRED_8x8_DC_LEFT = 9,
    I_PRED_8x8_DC_TOP  = 10,
    I_PRED_8x8_DC_128  = 11,
};

void x264_predict_8x8_dc_c  ( pixel *src, pixel edge[36] );
void x264_predict_8x8_h_c   ( pixel *src, pixel edge[36] );
void x264_predict_8x8_v_c   ( pixel *src, pixel edge[36] );
void x264_predict_4x4_dc_c  ( pixel *src );
void x264_predict_4x4_h_c   ( pixel *src );
void x264_predict_4x4_v_c   ( pixel *src );
void x264_predict_16x16_dc_c( pixel *src );
void x264_predict_16x16_h_c ( pixel *src );
void x264_predict_16x16_v_c ( pixel *src );
void x264_predict_16x16_p_c ( pixel *src );
void x264_predict_8x8c_dc_c ( pixel *src );
void x264_predict_8x8c_h_c  ( pixel *src );
void x264_predict_8x8c_v_c  ( pixel *src );
void x264_predict_8x8c_p_c  ( pixel *src );
void x264_predict_8x16c_dc_c( pixel *src );
void x264_predict_8x16c_h_c ( pixel *src );
void x264_predict_8x16c_v_c ( pixel *src );
void x264_predict_8x16c_p_c ( pixel *src );

void x264_predict_16x16_init ( int cpu, x264_predict_t pf[7] );
void x264_predict_8x8c_init  ( int cpu, x264_predict_t pf[7] );
void x264_predict_8x16c_init ( int cpu, x264_predict_t pf[7] );
void x264_predict_4x4_init   ( int cpu, x264_predict_t pf[12] );
void x264_predict_8x8_init   ( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter );

#endif
x264-snapshot-20120103-2245-stable/common/predict.c
/*****************************************************************************
 * predict.c: intra prediction
 *****************************************************************************
 * Copyright (C) 2003-2011 x264 project
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
 *          Jason Garrett-Glaser <darkshikari@gmail.com>
 *          Henrik Gramner <hengar-6@student.ltu.se>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

/* predict4x4 functions are inspired by the ffmpeg h264 decoder */

#include "common.h"

#if HAVE_MMX
#   include "x86/predict.h"
#endif
#if ARCH_PPC
#   include "ppc/predict.h"
#endif
#if ARCH_ARM
#   include "arm/predict.h"
#endif

/****************************************************************************
 * 16x16 prediction for intra luma block
 ****************************************************************************/

#define PREDICT_16x16_DC(v)\
    for( int i = 0; i < 16; i++ )\
    {\
        MPIXEL_X4( src+ 0 ) = v;\
        MPIXEL_X4( src+ 4 ) = v;\
        MPIXEL_X4( src+ 8 ) = v;\
        MPIXEL_X4( src+12 ) = v;\
        src += FDEC_STRIDE;\
    }

void x264_predict_16x16_dc_c( pixel *src )
{
    int dc = 0;

    for( int i = 0; i < 16; i++ )
    {
        dc += src[-1 + i * FDEC_STRIDE];
        dc += src[i - FDEC_STRIDE];
    }
    pixel4 dcsplat = PIXEL_SPLAT_X4( ( dc + 16 ) >> 5 );

    PREDICT_16x16_DC( dcsplat );
}
static void x264_predict_16x16_dc_left_c( pixel *src )
{
    int dc = 0;

    for( int i = 0; i < 16; i++ )
        dc += src[-1 + i * FDEC_STRIDE];
    pixel4 dcsplat = PIXEL_SPLAT_X4( ( dc + 8 ) >> 4 );

    PREDICT_16x16_DC( dcsplat );
}
static void x264_predict_16x16_dc_top_c( pixel *src )
{
    int dc = 0;

    for( int i = 0; i < 16; i++ )
        dc += src[i - FDEC_STRIDE];
    pixel4 dcsplat = PIXEL_SPLAT_X4( ( dc + 8 ) >> 4 );

    PREDICT_16x16_DC( dcsplat );
}
static void x264_predict_16x16_dc_128_c( pixel *src )
{
    PREDICT_16x16_DC( PIXEL_SPLAT_X4( 1 << (BIT_DEPTH-1) ) );
}
void x264_predict_16x16_h_c( pixel *src )
{
    for( int i = 0; i < 16; i++ )
    {
        const pixel4 v = PIXEL_SPLAT_X4( src[-1] );
        MPIXEL_X4( src+ 0 ) = v;
        MPIXEL_X4( src+ 4 ) = v;
        MPIXEL_X4( src+ 8 ) = v;
        MPIXEL_X4( src+12 ) = v;
        src += FDEC_STRIDE;
    }
}
void x264_predict_16x16_v_c( pixel *src )
{
    pixel4 v0 = MPIXEL_X4( &src[ 0-FDEC_STRIDE] );
    pixel4 v1 = MPIXEL_X4( &src[ 4-FDEC_STRIDE] );
    pixel4 v2 = MPIXEL_X4( &src[ 8-FDEC_STRIDE] );
    pixel4 v3 = MPIXEL_X4( &src[12-FDEC_STRIDE] );

    for( int i = 0; i < 16; i++ )
    {
        MPIXEL_X4( src+ 0 ) = v0;
        MPIXEL_X4( src+ 4 ) = v1;
        MPIXEL_X4( src+ 8 ) = v2;
        MPIXEL_X4( src+12 ) = v3;
        src += FDEC_STRIDE;
    }
}
void x264_predict_16x16_p_c( pixel *src )
{
    int H = 0, V = 0;

    /* calculate H and V */
    for( int i = 0; i <= 7; i++ )
    {
        H += ( i + 1 ) * ( src[ 8 + i - FDEC_STRIDE ] - src[6 -i -FDEC_STRIDE] );
        V += ( i + 1 ) * ( src[-1 + (8+i)*FDEC_STRIDE] - src[-1 + (6-i)*FDEC_STRIDE] );
    }

    int a = 16 * ( src[-1 + 15*FDEC_STRIDE] + src[15 - FDEC_STRIDE] );
    int b = ( 5 * H + 32 ) >> 6;
    int c = ( 5 * V + 32 ) >> 6;

    int i00 = a - b * 7 - c * 7 + 16;

    for( int y = 0; y < 16; y++ )
    {
        int pix = i00;
        for( int x = 0; x < 16; x++ )
        {
            src[x] = x264_clip_pixel( pix>>5 );
            pix += b;
        }
        src += FDEC_STRIDE;
        i00 += c;
    }
}
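/* Equivalently, the incremental loop above evaluates the H.264 plane
 * predictor pred(x,y) = clip( ( a + b*(x-7) + c*(y-7) + 16 ) >> 5 ):
 * i00 starts at a - 7*b - 7*c + 16 and advances by b per column and by
 * c per row. */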


/****************************************************************************
 * 8x8 prediction for intra chroma block (4:2:0)
 ****************************************************************************/

static void x264_predict_8x8c_dc_128_c( pixel *src )
{
    for( int y = 0; y < 8; y++ )
    {
        MPIXEL_X4( src+0 ) = PIXEL_SPLAT_X4( 1 << (BIT_DEPTH-1) );
        MPIXEL_X4( src+4 ) = PIXEL_SPLAT_X4( 1 << (BIT_DEPTH-1) );
        src += FDEC_STRIDE;
    }
}
static void x264_predict_8x8c_dc_left_c( pixel *src )
{
    int dc0 = 0, dc1 = 0;

    for( int y = 0; y < 4; y++ )
    {
        dc0 += src[y * FDEC_STRIDE     - 1];
        dc1 += src[(y+4) * FDEC_STRIDE - 1];
    }
    pixel4 dc0splat = PIXEL_SPLAT_X4( ( dc0 + 2 ) >> 2 );
    pixel4 dc1splat = PIXEL_SPLAT_X4( ( dc1 + 2 ) >> 2 );

    for( int y = 0; y < 4; y++ )
    {
        MPIXEL_X4( src+0 ) = dc0splat;
        MPIXEL_X4( src+4 ) = dc0splat;
        src += FDEC_STRIDE;
    }
    for( int y = 0; y < 4; y++ )
    {
        MPIXEL_X4( src+0 ) = dc1splat;
        MPIXEL_X4( src+4 ) = dc1splat;
        src += FDEC_STRIDE;
    }
}
static void x264_predict_8x8c_dc_top_c( pixel *src )
{
    int dc0 = 0, dc1 = 0;

    for( int x = 0; x < 4; x++ )
    {
        dc0 += src[x     - FDEC_STRIDE];
        dc1 += src[x + 4 - FDEC_STRIDE];
    }
    pixel4 dc0splat = PIXEL_SPLAT_X4( ( dc0 + 2 ) >> 2 );
    pixel4 dc1splat = PIXEL_SPLAT_X4( ( dc1 + 2 ) >> 2 );

    for( int y = 0; y < 8; y++ )
    {
        MPIXEL_X4( src+0 ) = dc0splat;
        MPIXEL_X4( src+4 ) = dc1splat;
        src += FDEC_STRIDE;
    }
}
void x264_predict_8x8c_dc_c( pixel *src )
{
    int s0 = 0, s1 = 0, s2 = 0, s3 = 0;

    /*
          s0 s1
       s2
       s3
    */
    for( int i = 0; i < 4; i++ )
    {
        s0 += src[i - FDEC_STRIDE];
        s1 += src[i + 4 - FDEC_STRIDE];
        s2 += src[-1 + i * FDEC_STRIDE];
        s3 += src[-1 + (i+4)*FDEC_STRIDE];
    }
    /*
       dc0 dc1
       dc2 dc3
     */
    pixel4 dc0 = PIXEL_SPLAT_X4( ( s0 + s2 + 4 ) >> 3 );
    pixel4 dc1 = PIXEL_SPLAT_X4( ( s1 + 2 ) >> 2 );
    pixel4 dc2 = PIXEL_SPLAT_X4( ( s3 + 2 ) >> 2 );
    pixel4 dc3 = PIXEL_SPLAT_X4( ( s1 + s3 + 4 ) >> 3 );

    for( int y = 0; y < 4; y++ )
    {
        MPIXEL_X4( src+0 ) = dc0;
        MPIXEL_X4( src+4 ) = dc1;
        src += FDEC_STRIDE;
    }

    for( int y = 0; y < 4; y++ )
    {
        MPIXEL_X4( src+0 ) = dc2;
        MPIXEL_X4( src+4 ) = dc3;
        src += FDEC_STRIDE;
    }
}
void x264_predict_8x8c_h_c( pixel *src )
{
    for( int i = 0; i < 8; i++ )
    {
        pixel4 v = PIXEL_SPLAT_X4( src[-1] );
        MPIXEL_X4( src+0 ) = v;
        MPIXEL_X4( src+4 ) = v;
        src += FDEC_STRIDE;
    }
}
void x264_predict_8x8c_v_c( pixel *src )
{
    pixel4 v0 = MPIXEL_X4( src+0-FDEC_STRIDE );
    pixel4 v1 = MPIXEL_X4( src+4-FDEC_STRIDE );

    for( int i = 0; i < 8; i++ )
    {
        MPIXEL_X4( src+0 ) = v0;
        MPIXEL_X4( src+4 ) = v1;
        src += FDEC_STRIDE;
    }
}
void x264_predict_8x8c_p_c( pixel *src )
{
    int H = 0, V = 0;

    for( int i = 0; i < 4; i++ )
    {
        H += ( i + 1 ) * ( src[4+i - FDEC_STRIDE] - src[2 - i -FDEC_STRIDE] );
        V += ( i + 1 ) * ( src[-1 +(i+4)*FDEC_STRIDE] - src[-1+(2-i)*FDEC_STRIDE] );
    }

    int a = 16 * ( src[-1+7*FDEC_STRIDE] + src[7 - FDEC_STRIDE] );
    int b = ( 17 * H + 16 ) >> 5;
    int c = ( 17 * V + 16 ) >> 5;
    int i00 = a -3*b -3*c + 16;

    for( int y = 0; y < 8; y++ )
    {
        int pix = i00;
        for( int x = 0; x < 8; x++ )
        {
            src[x] = x264_clip_pixel( pix>>5 );
            pix += b;
        }
        src += FDEC_STRIDE;
        i00 += c;
    }
}

/****************************************************************************
 * 8x16 prediction for intra chroma block (4:2:2)
 ****************************************************************************/

static void x264_predict_8x16c_dc_128_c( pixel *src )
{
    for( int y = 0; y < 16; y++ )
    {
        MPIXEL_X4( src+0 ) = PIXEL_SPLAT_X4( 1 << (BIT_DEPTH-1) );
        MPIXEL_X4( src+4 ) = PIXEL_SPLAT_X4( 1 << (BIT_DEPTH-1) );
        src += FDEC_STRIDE;
    }
}
static void x264_predict_8x16c_dc_left_c( pixel *src )
{
    for( int i = 0; i < 4; i++ )
    {
        int dc = 0;

        for( int y = 0; y < 4; y++ )
            dc += src[y*FDEC_STRIDE - 1];

        pixel4 dcsplat = PIXEL_SPLAT_X4( (dc + 2) >> 2 );

        for( int y = 0; y < 4; y++ )
        {
            MPIXEL_X4( src+0 ) = dcsplat;
            MPIXEL_X4( src+4 ) = dcsplat;
            src += FDEC_STRIDE;
        }
    }
}
static void x264_predict_8x16c_dc_top_c( pixel *src )
{
    int dc0 = 0, dc1 = 0;

    for( int x = 0; x < 4; x++ )
    {
        dc0 += src[x     - FDEC_STRIDE];
        dc1 += src[x + 4 - FDEC_STRIDE];
    }
    pixel4 dc0splat = PIXEL_SPLAT_X4( ( dc0 + 2 ) >> 2 );
    pixel4 dc1splat = PIXEL_SPLAT_X4( ( dc1 + 2 ) >> 2 );

    for( int y = 0; y < 16; y++ )
    {
        MPIXEL_X4( src+0 ) = dc0splat;
        MPIXEL_X4( src+4 ) = dc1splat;
        src += FDEC_STRIDE;
    }
}
void x264_predict_8x16c_dc_c( pixel *src )
{
    int s0 = 0, s1 = 0, s2 = 0, s3 = 0, s4 = 0, s5 = 0;

    /*
          s0 s1
       s2
       s3
       s4
       s5
    */
    for( int i = 0; i < 4; i++ )
    {
        s0 += src[i+0 - FDEC_STRIDE];
        s1 += src[i+4 - FDEC_STRIDE];
        s2 += src[-1 + (i+0)  * FDEC_STRIDE];
        s3 += src[-1 + (i+4)  * FDEC_STRIDE];
        s4 += src[-1 + (i+8)  * FDEC_STRIDE];
        s5 += src[-1 + (i+12) * FDEC_STRIDE];
    }
    /*
       dc0 dc1
       dc2 dc3
       dc4 dc5
       dc6 dc7
    */
    pixel4 dc0 = PIXEL_SPLAT_X4( ( s0 + s2 + 4 ) >> 3 );
    pixel4 dc1 = PIXEL_SPLAT_X4( ( s1 + 2 ) >> 2 );
    pixel4 dc2 = PIXEL_SPLAT_X4( ( s3 + 2 ) >> 2 );
    pixel4 dc3 = PIXEL_SPLAT_X4( ( s1 + s3 + 4 ) >> 3 );
    pixel4 dc4 = PIXEL_SPLAT_X4( ( s4 + 2 ) >> 2 );
    pixel4 dc5 = PIXEL_SPLAT_X4( ( s1 + s4 + 4 ) >> 3 );
    pixel4 dc6 = PIXEL_SPLAT_X4( ( s5 + 2 ) >> 2 );
    pixel4 dc7 = PIXEL_SPLAT_X4( ( s1 + s5 + 4 ) >> 3 );

    for( int y = 0; y < 4; y++ )
    {
        MPIXEL_X4( src+0 ) = dc0;
        MPIXEL_X4( src+4 ) = dc1;
        src += FDEC_STRIDE;
    }
    for( int y = 0; y < 4; y++ )
    {
        MPIXEL_X4( src+0 ) = dc2;
        MPIXEL_X4( src+4 ) = dc3;
        src += FDEC_STRIDE;
    }
    for( int y = 0; y < 4; y++ )
    {
        MPIXEL_X4( src+0 ) = dc4;
        MPIXEL_X4( src+4 ) = dc5;
        src += FDEC_STRIDE;
    }
    for( int y = 0; y < 4; y++ )
    {
        MPIXEL_X4( src+0 ) = dc6;
        MPIXEL_X4( src+4 ) = dc7;
        src += FDEC_STRIDE;
    }
}
void x264_predict_8x16c_h_c( pixel *src )
{
    for( int i = 0; i < 16; i++ )
    {
        pixel4 v = PIXEL_SPLAT_X4( src[-1] );
        MPIXEL_X4( src+0 ) = v;
        MPIXEL_X4( src+4 ) = v;
        src += FDEC_STRIDE;
    }
}
void x264_predict_8x16c_v_c( pixel *src )
{
    pixel4 v0 = MPIXEL_X4( src+0-FDEC_STRIDE );
    pixel4 v1 = MPIXEL_X4( src+4-FDEC_STRIDE );

    for( int i = 0; i < 16; i++ )
    {
        MPIXEL_X4( src+0 ) = v0;
        MPIXEL_X4( src+4 ) = v1;
        src += FDEC_STRIDE;
    }
}
void x264_predict_8x16c_p_c( pixel *src )
{
    int H = 0;
    int V = 0;

    for( int i = 0; i < 4; i++ )
        H += ( i + 1 ) * ( src[4 + i - FDEC_STRIDE] - src[2 - i - FDEC_STRIDE] );
    for( int i = 0; i < 8; i++ )
        V += ( i + 1 ) * ( src[-1 + (i+8)*FDEC_STRIDE] - src[-1 + (6-i)*FDEC_STRIDE] );

    int a = 16 * ( src[-1 + 15*FDEC_STRIDE] + src[7 - FDEC_STRIDE] );
    int b = ( 17 * H + 16 ) >> 5;
    int c = ( 5 * V + 32 ) >> 6;
    int i00 = a -3*b -7*c + 16;

    for( int y = 0; y < 16; y++ )
    {
        int pix = i00;
        for( int x = 0; x < 8; x++ )
        {
            src[x] = x264_clip_pixel( pix>>5 );
            pix += b;
        }
        src += FDEC_STRIDE;
        i00 += c;
    }
}

/****************************************************************************
 * 4x4 prediction for intra luma block
 ****************************************************************************/

#define SRC(x,y) src[(x)+(y)*FDEC_STRIDE]
#define SRC_X4(x,y) MPIXEL_X4( &SRC(x,y) )

#define PREDICT_4x4_DC(v)\
    SRC_X4(0,0) = SRC_X4(0,1) = SRC_X4(0,2) = SRC_X4(0,3) = v;

static void x264_predict_4x4_dc_128_c( pixel *src )
{
    PREDICT_4x4_DC( PIXEL_SPLAT_X4( 1 << (BIT_DEPTH-1) ) );
}
static void x264_predict_4x4_dc_left_c( pixel *src )
{
    pixel4 dc = PIXEL_SPLAT_X4( (SRC(-1,0) + SRC(-1,1) + SRC(-1,2) + SRC(-1,3) + 2) >> 2 );
    PREDICT_4x4_DC( dc );
}
static void x264_predict_4x4_dc_top_c( pixel *src )
{
    pixel4 dc = PIXEL_SPLAT_X4( (SRC(0,-1) + SRC(1,-1) + SRC(2,-1) + SRC(3,-1) + 2) >> 2 );
    PREDICT_4x4_DC( dc );
}
void x264_predict_4x4_dc_c( pixel *src )
{
    pixel4 dc = PIXEL_SPLAT_X4( (SRC(-1,0) + SRC(-1,1) + SRC(-1,2) + SRC(-1,3) +
                                 SRC(0,-1) + SRC(1,-1) + SRC(2,-1) + SRC(3,-1) + 4) >> 3 );
    PREDICT_4x4_DC( dc );
}
void x264_predict_4x4_h_c( pixel *src )
{
    SRC_X4(0,0) = PIXEL_SPLAT_X4( SRC(-1,0) );
    SRC_X4(0,1) = PIXEL_SPLAT_X4( SRC(-1,1) );
    SRC_X4(0,2) = PIXEL_SPLAT_X4( SRC(-1,2) );
    SRC_X4(0,3) = PIXEL_SPLAT_X4( SRC(-1,3) );
}
void x264_predict_4x4_v_c( pixel *src )
{
    PREDICT_4x4_DC(SRC_X4(0,-1));
}

#define PREDICT_4x4_LOAD_LEFT\
    int l0 = SRC(-1,0);\
    int l1 = SRC(-1,1);\
    int l2 = SRC(-1,2);\
    UNUSED int l3 = SRC(-1,3);

#define PREDICT_4x4_LOAD_TOP\
    int t0 = SRC(0,-1);\
    int t1 = SRC(1,-1);\
    int t2 = SRC(2,-1);\
    UNUSED int t3 = SRC(3,-1);

#define PREDICT_4x4_LOAD_TOP_RIGHT\
    int t4 = SRC(4,-1);\
    int t5 = SRC(5,-1);\
    int t6 = SRC(6,-1);\
    UNUSED int t7 = SRC(7,-1);

#define F1(a,b)   (((a)+(b)+1)>>1)
#define F2(a,b,c) (((a)+2*(b)+(c)+2)>>2)
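
/* F1 is the 2-tap average with rounding and F2 the 3-tap [1 2 1]/4 lowpass
 * used by the directional modes, e.g. F2(1,2,3) = (1 + 2*2 + 3 + 2) >> 2 = 2. */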

static void x264_predict_4x4_ddl_c( pixel *src )
{
    PREDICT_4x4_LOAD_TOP
    PREDICT_4x4_LOAD_TOP_RIGHT
    SRC(0,0)= F2(t0,t1,t2);
    SRC(1,0)=SRC(0,1)= F2(t1,t2,t3);
    SRC(2,0)=SRC(1,1)=SRC(0,2)= F2(t2,t3,t4);
    SRC(3,0)=SRC(2,1)=SRC(1,2)=SRC(0,3)= F2(t3,t4,t5);
    SRC(3,1)=SRC(2,2)=SRC(1,3)= F2(t4,t5,t6);
    SRC(3,2)=SRC(2,3)= F2(t5,t6,t7);
    SRC(3,3)= F2(t6,t7,t7);
}
static void x264_predict_4x4_ddr_c( pixel *src )
{
    int lt = SRC(-1,-1);
    PREDICT_4x4_LOAD_LEFT
    PREDICT_4x4_LOAD_TOP
    SRC(3,0)= F2(t3,t2,t1);
    SRC(2,0)=SRC(3,1)= F2(t2,t1,t0);
    SRC(1,0)=SRC(2,1)=SRC(3,2)= F2(t1,t0,lt);
    SRC(0,0)=SRC(1,1)=SRC(2,2)=SRC(3,3)= F2(t0,lt,l0);
    SRC(0,1)=SRC(1,2)=SRC(2,3)= F2(lt,l0,l1);
    SRC(0,2)=SRC(1,3)= F2(l0,l1,l2);
    SRC(0,3)= F2(l1,l2,l3);
}

static void x264_predict_4x4_vr_c( pixel *src )
{
    int lt = SRC(-1,-1);
    PREDICT_4x4_LOAD_LEFT
    PREDICT_4x4_LOAD_TOP
    SRC(0,3)= F2(l2,l1,l0);
    SRC(0,2)= F2(l1,l0,lt);
    SRC(0,1)=SRC(1,3)= F2(l0,lt,t0);
    SRC(0,0)=SRC(1,2)= F1(lt,t0);
    SRC(1,1)=SRC(2,3)= F2(lt,t0,t1);
    SRC(1,0)=SRC(2,2)= F1(t0,t1);
    SRC(2,1)=SRC(3,3)= F2(t0,t1,t2);
    SRC(2,0)=SRC(3,2)= F1(t1,t2);
    SRC(3,1)= F2(t1,t2,t3);
    SRC(3,0)= F1(t2,t3);
}

static void x264_predict_4x4_hd_c( pixel *src )
{
    int lt= SRC(-1,-1);
    PREDICT_4x4_LOAD_LEFT
    PREDICT_4x4_LOAD_TOP
    SRC(0,3)= F1(l2,l3);
    SRC(1,3)= F2(l1,l2,l3);
    SRC(0,2)=SRC(2,3)= F1(l1,l2);
    SRC(1,2)=SRC(3,3)= F2(l0,l1,l2);
    SRC(0,1)=SRC(2,2)= F1(l0,l1);
    SRC(1,1)=SRC(3,2)= F2(lt,l0,l1);
    SRC(0,0)=SRC(2,1)= F1(lt,l0);
    SRC(1,0)=SRC(3,1)= F2(t0,lt,l0);
    SRC(2,0)= F2(t1,t0,lt);
    SRC(3,0)= F2(t2,t1,t0);
}

static void x264_predict_4x4_vl_c( pixel *src )
{
    PREDICT_4x4_LOAD_TOP
    PREDICT_4x4_LOAD_TOP_RIGHT
    SRC(0,0)= F1(t0,t1);
    SRC(0,1)= F2(t0,t1,t2);
    SRC(1,0)=SRC(0,2)= F1(t1,t2);
    SRC(1,1)=SRC(0,3)= F2(t1,t2,t3);
    SRC(2,0)=SRC(1,2)= F1(t2,t3);
    SRC(2,1)=SRC(1,3)= F2(t2,t3,t4);
    SRC(3,0)=SRC(2,2)= F1(t3,t4);
    SRC(3,1)=SRC(2,3)= F2(t3,t4,t5);
    SRC(3,2)= F1(t4,t5);
    SRC(3,3)= F2(t4,t5,t6);
}

static void x264_predict_4x4_hu_c( pixel *src )
{
    PREDICT_4x4_LOAD_LEFT
    SRC(0,0)= F1(l0,l1);
    SRC(1,0)= F2(l0,l1,l2);
    SRC(2,0)=SRC(0,1)= F1(l1,l2);
    SRC(3,0)=SRC(1,1)= F2(l1,l2,l3);
    SRC(2,1)=SRC(0,2)= F1(l2,l3);
    SRC(3,1)=SRC(1,2)= F2(l2,l3,l3);
    SRC(3,2)=SRC(1,3)=SRC(0,3)=
    SRC(2,2)=SRC(2,3)=SRC(3,3)= l3;
}

/****************************************************************************
 * 8x8 prediction for intra luma block
 ****************************************************************************/

#define PL(y) \
    edge[14-y] = F2(SRC(-1,y-1), SRC(-1,y), SRC(-1,y+1));
#define PT(x) \
    edge[16+x] = F2(SRC(x-1,-1), SRC(x,-1), SRC(x+1,-1));

static void x264_predict_8x8_filter_c( pixel *src, pixel edge[36], int i_neighbor, int i_filters )
{
    /* edge[7..14] = l7..l0
     * edge[15] = lt
     * edge[16..31] = t0 .. t15
     * edge[32] = t15 */

    int have_lt = i_neighbor & MB_TOPLEFT;
    if( i_filters & MB_LEFT )
    {
        edge[15] = (SRC(0,-1) + 2*SRC(-1,-1) + SRC(-1,0) + 2) >> 2;
        edge[14] = ((have_lt ? SRC(-1,-1) : SRC(-1,0))
                 + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2;
        PL(1) PL(2) PL(3) PL(4) PL(5) PL(6)
        edge[6] =
        edge[7] = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2;
    }

    if( i_filters & MB_TOP )
    {
        int have_tr = i_neighbor & MB_TOPRIGHT;
        edge[16] = ((have_lt ? SRC(-1,-1) : SRC(0,-1))
                 + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2;
        PT(1) PT(2) PT(3) PT(4) PT(5) PT(6)
        edge[23] = (SRC(6,-1) + 2*SRC(7,-1)
                 + (have_tr ? SRC(8,-1) : SRC(7,-1)) + 2) >> 2;

        if( i_filters & MB_TOPRIGHT )
        {
            if( have_tr )
            {
                PT(8) PT(9) PT(10) PT(11) PT(12) PT(13) PT(14)
                edge[31] =
                edge[32] = (SRC(14,-1) + 3*SRC(15,-1) + 2) >> 2;
            }
            else
            {
                MPIXEL_X4( edge+24 ) = PIXEL_SPLAT_X4( SRC(7,-1) );
                MPIXEL_X4( edge+28 ) = PIXEL_SPLAT_X4( SRC(7,-1) );
                edge[32] = SRC(7,-1);
            }
        }
    }
}
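
/* Unlike the 4x4 and 16x16 modes, High Profile 8x8 intra prediction smooths
 * its reference samples first: the filter above caches the [1 2 1]-filtered
 * neighbors in edge[], so the 8x8 mode functions below read edge[] instead
 * of touching src's borders directly. */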

#undef PL
#undef PT

#define PL(y) \
    UNUSED int l##y = edge[14-y];
#define PT(x) \
    UNUSED int t##x = edge[16+x];
#define PREDICT_8x8_LOAD_TOPLEFT \
    int lt = edge[15];
#define PREDICT_8x8_LOAD_LEFT \
    PL(0) PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) PL(7)
#define PREDICT_8x8_LOAD_TOP \
    PT(0) PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) PT(7)
#define PREDICT_8x8_LOAD_TOPRIGHT \
    PT(8) PT(9) PT(10) PT(11) PT(12) PT(13) PT(14) PT(15)

#define PREDICT_8x8_DC(v) \
    for( int y = 0; y < 8; y++ ) { \
        MPIXEL_X4( src+0 ) = v; \
        MPIXEL_X4( src+4 ) = v; \
        src += FDEC_STRIDE; \
    }

static void x264_predict_8x8_dc_128_c( pixel *src, pixel edge[36] )
{
    PREDICT_8x8_DC( PIXEL_SPLAT_X4( 1 << (BIT_DEPTH-1) ) );
}
static void x264_predict_8x8_dc_left_c( pixel *src, pixel edge[36] )
{
    PREDICT_8x8_LOAD_LEFT
    pixel4 dc = PIXEL_SPLAT_X4( (l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3 );
    PREDICT_8x8_DC( dc );
}
static void x264_predict_8x8_dc_top_c( pixel *src, pixel edge[36] )
{
    PREDICT_8x8_LOAD_TOP
    pixel4 dc = PIXEL_SPLAT_X4( (t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3 );
    PREDICT_8x8_DC( dc );
}
void x264_predict_8x8_dc_c( pixel *src, pixel edge[36] )
{
    PREDICT_8x8_LOAD_LEFT
    PREDICT_8x8_LOAD_TOP
    pixel4 dc = PIXEL_SPLAT_X4( (l0+l1+l2+l3+l4+l5+l6+l7+t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4 );
    PREDICT_8x8_DC( dc );
}
void x264_predict_8x8_h_c( pixel *src, pixel edge[36] )
{
    PREDICT_8x8_LOAD_LEFT
#define ROW(y) MPIXEL_X4( src+y*FDEC_STRIDE+0 ) =\
               MPIXEL_X4( src+y*FDEC_STRIDE+4 ) = PIXEL_SPLAT_X4( l##y );
    ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7);
#undef ROW
}
void x264_predict_8x8_v_c( pixel *src, pixel edge[36] )
{
    pixel4 top[2] = { MPIXEL_X4( edge+16 ),
                      MPIXEL_X4( edge+20 ) };
    for( int y = 0; y < 8; y++ )
    {
        MPIXEL_X4( src+y*FDEC_STRIDE+0 ) = top[0];
        MPIXEL_X4( src+y*FDEC_STRIDE+4 ) = top[1];
    }
}
static void x264_predict_8x8_ddl_c( pixel *src, pixel edge[36] )
{
    PREDICT_8x8_LOAD_TOP
    PREDICT_8x8_LOAD_TOPRIGHT
    SRC(0,0)= F2(t0,t1,t2);
    SRC(0,1)=SRC(1,0)= F2(t1,t2,t3);
    SRC(0,2)=SRC(1,1)=SRC(2,0)= F2(t2,t3,t4);
    SRC(0,3)=SRC(1,2)=SRC(2,1)=SRC(3,0)= F2(t3,t4,t5);
    SRC(0,4)=SRC(1,3)=SRC(2,2)=SRC(3,1)=SRC(4,0)= F2(t4,t5,t6);
    SRC(0,5)=SRC(1,4)=SRC(2,3)=SRC(3,2)=SRC(4,1)=SRC(5,0)= F2(t5,t6,t7);
    SRC(0,6)=SRC(1,5)=SRC(2,4)=SRC(3,3)=SRC(4,2)=SRC(5,1)=SRC(6,0)= F2(t6,t7,t8);
    SRC(0,7)=SRC(1,6)=SRC(2,5)=SRC(3,4)=SRC(4,3)=SRC(5,2)=SRC(6,1)=SRC(7,0)= F2(t7,t8,t9);
    SRC(1,7)=SRC(2,6)=SRC(3,5)=SRC(4,4)=SRC(5,3)=SRC(6,2)=SRC(7,1)= F2(t8,t9,t10);
    SRC(2,7)=SRC(3,6)=SRC(4,5)=SRC(5,4)=SRC(6,3)=SRC(7,2)= F2(t9,t10,t11);
    SRC(3,7)=SRC(4,6)=SRC(5,5)=SRC(6,4)=SRC(7,3)= F2(t10,t11,t12);
    SRC(4,7)=SRC(5,6)=SRC(6,5)=SRC(7,4)= F2(t11,t12,t13);
    SRC(5,7)=SRC(6,6)=SRC(7,5)= F2(t12,t13,t14);
    SRC(6,7)=SRC(7,6)= F2(t13,t14,t15);
    SRC(7,7)= F2(t14,t15,t15);
}
static void x264_predict_8x8_ddr_c( pixel *src, pixel edge[36] )
{
    PREDICT_8x8_LOAD_TOP
    PREDICT_8x8_LOAD_LEFT
    PREDICT_8x8_LOAD_TOPLEFT
    SRC(0,7)= F2(l7,l6,l5);
    SRC(0,6)=SRC(1,7)= F2(l6,l5,l4);
    SRC(0,5)=SRC(1,6)=SRC(2,7)= F2(l5,l4,l3);
    SRC(0,4)=SRC(1,5)=SRC(2,6)=SRC(3,7)= F2(l4,l3,l2);
    SRC(0,3)=SRC(1,4)=SRC(2,5)=SRC(3,6)=SRC(4,7)= F2(l3,l2,l1);
    SRC(0,2)=SRC(1,3)=SRC(2,4)=SRC(3,5)=SRC(4,6)=SRC(5,7)= F2(l2,l1,l0);
    SRC(0,1)=SRC(1,2)=SRC(2,3)=SRC(3,4)=SRC(4,5)=SRC(5,6)=SRC(6,7)= F2(l1,l0,lt);
    SRC(0,0)=SRC(1,1)=SRC(2,2)=SRC(3,3)=SRC(4,4)=SRC(5,5)=SRC(6,6)=SRC(7,7)= F2(l0,lt,t0);
    SRC(1,0)=SRC(2,1)=SRC(3,2)=SRC(4,3)=SRC(5,4)=SRC(6,5)=SRC(7,6)= F2(lt,t0,t1);
    SRC(2,0)=SRC(3,1)=SRC(4,2)=SRC(5,3)=SRC(6,4)=SRC(7,5)= F2(t0,t1,t2);
    SRC(3,0)=SRC(4,1)=SRC(5,2)=SRC(6,3)=SRC(7,4)= F2(t1,t2,t3);
    SRC(4,0)=SRC(5,1)=SRC(6,2)=SRC(7,3)= F2(t2,t3,t4);
    SRC(5,0)=SRC(6,1)=SRC(7,2)= F2(t3,t4,t5);
    SRC(6,0)=SRC(7,1)= F2(t4,t5,t6);
    SRC(7,0)= F2(t5,t6,t7);
}
static void x264_predict_8x8_vr_c( pixel *src, pixel edge[36] )
{
    PREDICT_8x8_LOAD_TOP
    PREDICT_8x8_LOAD_LEFT
    PREDICT_8x8_LOAD_TOPLEFT
    SRC(0,6)= F2(l5,l4,l3);
    SRC(0,7)= F2(l6,l5,l4);
    SRC(0,4)=SRC(1,6)= F2(l3,l2,l1);
    SRC(0,5)=SRC(1,7)= F2(l4,l3,l2);
    SRC(0,2)=SRC(1,4)=SRC(2,6)= F2(l1,l0,lt);
    SRC(0,3)=SRC(1,5)=SRC(2,7)= F2(l2,l1,l0);
    SRC(0,1)=SRC(1,3)=SRC(2,5)=SRC(3,7)= F2(l0,lt,t0);
    SRC(0,0)=SRC(1,2)=SRC(2,4)=SRC(3,6)= F1(lt,t0);
    SRC(1,1)=SRC(2,3)=SRC(3,5)=SRC(4,7)= F2(lt,t0,t1);
    SRC(1,0)=SRC(2,2)=SRC(3,4)=SRC(4,6)= F1(t0,t1);
    SRC(2,1)=SRC(3,3)=SRC(4,5)=SRC(5,7)= F2(t0,t1,t2);
    SRC(2,0)=SRC(3,2)=SRC(4,4)=SRC(5,6)= F1(t1,t2);
    SRC(3,1)=SRC(4,3)=SRC(5,5)=SRC(6,7)= F2(t1,t2,t3);
    SRC(3,0)=SRC(4,2)=SRC(5,4)=SRC(6,6)= F1(t2,t3);
    SRC(4,1)=SRC(5,3)=SRC(6,5)=SRC(7,7)= F2(t2,t3,t4);
    SRC(4,0)=SRC(5,2)=SRC(6,4)=SRC(7,6)= F1(t3,t4);
    SRC(5,1)=SRC(6,3)=SRC(7,5)= F2(t3,t4,t5);
    SRC(5,0)=SRC(6,2)=SRC(7,4)= F1(t4,t5);
    SRC(6,1)=SRC(7,3)= F2(t4,t5,t6);
    SRC(6,0)=SRC(7,2)= F1(t5,t6);
    SRC(7,1)= F2(t5,t6,t7);
    SRC(7,0)= F1(t6,t7);
}
static void x264_predict_8x8_hd_c( pixel *src, pixel edge[36] )
{
    PREDICT_8x8_LOAD_TOP
    PREDICT_8x8_LOAD_LEFT
    PREDICT_8x8_LOAD_TOPLEFT
    int p1 = pack_pixel_1to2(F1(l6,l7), F2(l5,l6,l7));
    int p2 = pack_pixel_1to2(F1(l5,l6), F2(l4,l5,l6));
    int p3 = pack_pixel_1to2(F1(l4,l5), F2(l3,l4,l5));
    int p4 = pack_pixel_1to2(F1(l3,l4), F2(l2,l3,l4));
    int p5 = pack_pixel_1to2(F1(l2,l3), F2(l1,l2,l3));
    int p6 = pack_pixel_1to2(F1(l1,l2), F2(l0,l1,l2));
    int p7 = pack_pixel_1to2(F1(l0,l1), F2(lt,l0,l1));
    int p8 = pack_pixel_1to2(F1(lt,l0), F2(l0,lt,t0));
    int p9 = pack_pixel_1to2(F2(t1,t0,lt), F2(t2,t1,t0));
    int p10 = pack_pixel_1to2(F2(t3,t2,t1), F2(t4,t3,t2));
    int p11 = pack_pixel_1to2(F2(t5,t4,t3), F2(t6,t5,t4));
    SRC_X4(0,7)= pack_pixel_2to4(p1,p2);
    SRC_X4(0,6)= pack_pixel_2to4(p2,p3);
    SRC_X4(4,7)=SRC_X4(0,5)= pack_pixel_2to4(p3,p4);
    SRC_X4(4,6)=SRC_X4(0,4)= pack_pixel_2to4(p4,p5);
    SRC_X4(4,5)=SRC_X4(0,3)= pack_pixel_2to4(p5,p6);
    SRC_X4(4,4)=SRC_X4(0,2)= pack_pixel_2to4(p6,p7);
    SRC_X4(4,3)=SRC_X4(0,1)= pack_pixel_2to4(p7,p8);
    SRC_X4(4,2)=SRC_X4(0,0)= pack_pixel_2to4(p8,p9);
    SRC_X4(4,1)= pack_pixel_2to4(p9,p10);
    SRC_X4(4,0)= pack_pixel_2to4(p10,p11);
}
static void x264_predict_8x8_vl_c( pixel *src, pixel edge[36] )
{
    PREDICT_8x8_LOAD_TOP
    PREDICT_8x8_LOAD_TOPRIGHT
    SRC(0,0)= F1(t0,t1);
    SRC(0,1)= F2(t0,t1,t2);
    SRC(0,2)=SRC(1,0)= F1(t1,t2);
    SRC(0,3)=SRC(1,1)= F2(t1,t2,t3);
    SRC(0,4)=SRC(1,2)=SRC(2,0)= F1(t2,t3);
    SRC(0,5)=SRC(1,3)=SRC(2,1)= F2(t2,t3,t4);
    SRC(0,6)=SRC(1,4)=SRC(2,2)=SRC(3,0)= F1(t3,t4);
    SRC(0,7)=SRC(1,5)=SRC(2,3)=SRC(3,1)= F2(t3,t4,t5);
    SRC(1,6)=SRC(2,4)=SRC(3,2)=SRC(4,0)= F1(t4,t5);
    SRC(1,7)=SRC(2,5)=SRC(3,3)=SRC(4,1)= F2(t4,t5,t6);
    SRC(2,6)=SRC(3,4)=SRC(4,2)=SRC(5,0)= F1(t5,t6);
    SRC(2,7)=SRC(3,5)=SRC(4,3)=SRC(5,1)= F2(t5,t6,t7);
    SRC(3,6)=SRC(4,4)=SRC(5,2)=SRC(6,0)= F1(t6,t7);
    SRC(3,7)=SRC(4,5)=SRC(5,3)=SRC(6,1)= F2(t6,t7,t8);
    SRC(4,6)=SRC(5,4)=SRC(6,2)=SRC(7,0)= F1(t7,t8);
    SRC(4,7)=SRC(5,5)=SRC(6,3)=SRC(7,1)= F2(t7,t8,t9);
    SRC(5,6)=SRC(6,4)=SRC(7,2)= F1(t8,t9);
    SRC(5,7)=SRC(6,5)=SRC(7,3)= F2(t8,t9,t10);
    SRC(6,6)=SRC(7,4)= F1(t9,t10);
    SRC(6,7)=SRC(7,5)= F2(t9,t10,t11);
    SRC(7,6)= F1(t10,t11);
    SRC(7,7)= F2(t10,t11,t12);
}
static void x264_predict_8x8_hu_c( pixel *src, pixel edge[36] )
{
    PREDICT_8x8_LOAD_LEFT
    int p1 = pack_pixel_1to2(F1(l0,l1), F2(l0,l1,l2));
    int p2 = pack_pixel_1to2(F1(l1,l2), F2(l1,l2,l3));
    int p3 = pack_pixel_1to2(F1(l2,l3), F2(l2,l3,l4));
    int p4 = pack_pixel_1to2(F1(l3,l4), F2(l3,l4,l5));
    int p5 = pack_pixel_1to2(F1(l4,l5), F2(l4,l5,l6));
    int p6 = pack_pixel_1to2(F1(l5,l6), F2(l5,l6,l7));
    int p7 = pack_pixel_1to2(F1(l6,l7), F2(l6,l7,l7));
    int p8 = pack_pixel_1to2(l7,l7);
    SRC_X4(0,0)= pack_pixel_2to4(p1,p2);
    SRC_X4(0,1)= pack_pixel_2to4(p2,p3);
    SRC_X4(4,0)=SRC_X4(0,2)= pack_pixel_2to4(p3,p4);
    SRC_X4(4,1)=SRC_X4(0,3)= pack_pixel_2to4(p4,p5);
    SRC_X4(4,2)=SRC_X4(0,4)= pack_pixel_2to4(p5,p6);
    SRC_X4(4,3)=SRC_X4(0,5)= pack_pixel_2to4(p6,p7);
    SRC_X4(4,4)=SRC_X4(0,6)= pack_pixel_2to4(p7,p8);
    SRC_X4(4,5)=SRC_X4(4,6)=SRC_X4(0,7)=SRC_X4(4,7)= pack_pixel_2to4(p8,p8);
}

/****************************************************************************
 * Exported functions:
 ****************************************************************************/
void x264_predict_16x16_init( int cpu, x264_predict_t pf[7] )
{
    pf[I_PRED_16x16_V ]     = x264_predict_16x16_v_c;
    pf[I_PRED_16x16_H ]     = x264_predict_16x16_h_c;
    pf[I_PRED_16x16_DC]     = x264_predict_16x16_dc_c;
    pf[I_PRED_16x16_P ]     = x264_predict_16x16_p_c;
    pf[I_PRED_16x16_DC_LEFT]= x264_predict_16x16_dc_left_c;
    pf[I_PRED_16x16_DC_TOP ]= x264_predict_16x16_dc_top_c;
    pf[I_PRED_16x16_DC_128 ]= x264_predict_16x16_dc_128_c;

#if HAVE_MMX
    x264_predict_16x16_init_mmx( cpu, pf );
#endif

#if HAVE_ALTIVEC
    if( cpu&X264_CPU_ALTIVEC )
        x264_predict_16x16_init_altivec( pf );
#endif

#if HAVE_ARMV6
    x264_predict_16x16_init_arm( cpu, pf );
#endif
}

void x264_predict_8x8c_init( int cpu, x264_predict_t pf[7] )
{
    pf[I_PRED_CHROMA_V ]     = x264_predict_8x8c_v_c;
    pf[I_PRED_CHROMA_H ]     = x264_predict_8x8c_h_c;
    pf[I_PRED_CHROMA_DC]     = x264_predict_8x8c_dc_c;
    pf[I_PRED_CHROMA_P ]     = x264_predict_8x8c_p_c;
    pf[I_PRED_CHROMA_DC_LEFT]= x264_predict_8x8c_dc_left_c;
    pf[I_PRED_CHROMA_DC_TOP ]= x264_predict_8x8c_dc_top_c;
    pf[I_PRED_CHROMA_DC_128 ]= x264_predict_8x8c_dc_128_c;

#if HAVE_MMX
    x264_predict_8x8c_init_mmx( cpu, pf );
#endif

#if HAVE_ALTIVEC
    if( cpu&X264_CPU_ALTIVEC )
        x264_predict_8x8c_init_altivec( pf );
#endif

#if HAVE_ARMV6
    x264_predict_8x8c_init_arm( cpu, pf );
#endif
}

void x264_predict_8x16c_init( int cpu, x264_predict_t pf[7] )
{
    pf[I_PRED_CHROMA_V ]     = x264_predict_8x16c_v_c;
    pf[I_PRED_CHROMA_H ]     = x264_predict_8x16c_h_c;
    pf[I_PRED_CHROMA_DC]     = x264_predict_8x16c_dc_c;
    pf[I_PRED_CHROMA_P ]     = x264_predict_8x16c_p_c;
    pf[I_PRED_CHROMA_DC_LEFT]= x264_predict_8x16c_dc_left_c;
    pf[I_PRED_CHROMA_DC_TOP ]= x264_predict_8x16c_dc_top_c;
    pf[I_PRED_CHROMA_DC_128 ]= x264_predict_8x16c_dc_128_c;

#if HAVE_MMX
    x264_predict_8x16c_init_mmx( cpu, pf );
#endif
}

void x264_predict_8x8_init( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter )
{
    pf[I_PRED_8x8_V]      = x264_predict_8x8_v_c;
    pf[I_PRED_8x8_H]      = x264_predict_8x8_h_c;
    pf[I_PRED_8x8_DC]     = x264_predict_8x8_dc_c;
    pf[I_PRED_8x8_DDL]    = x264_predict_8x8_ddl_c;
    pf[I_PRED_8x8_DDR]    = x264_predict_8x8_ddr_c;
    pf[I_PRED_8x8_VR]     = x264_predict_8x8_vr_c;
    pf[I_PRED_8x8_HD]     = x264_predict_8x8_hd_c;
    pf[I_PRED_8x8_VL]     = x264_predict_8x8_vl_c;
    pf[I_PRED_8x8_HU]     = x264_predict_8x8_hu_c;
    pf[I_PRED_8x8_DC_LEFT]= x264_predict_8x8_dc_left_c;
    pf[I_PRED_8x8_DC_TOP] = x264_predict_8x8_dc_top_c;
    pf[I_PRED_8x8_DC_128] = x264_predict_8x8_dc_128_c;
    *predict_filter       = x264_predict_8x8_filter_c;

#if HAVE_MMX
    x264_predict_8x8_init_mmx( cpu, pf, predict_filter );
#endif

#if HAVE_ARMV6
    x264_predict_8x8_init_arm( cpu, pf, predict_filter );
#endif
}

void x264_predict_4x4_init( int cpu, x264_predict_t pf[12] )
{
    pf[I_PRED_4x4_V]      = x264_predict_4x4_v_c;
    pf[I_PRED_4x4_H]      = x264_predict_4x4_h_c;
    pf[I_PRED_4x4_DC]     = x264_predict_4x4_dc_c;
    pf[I_PRED_4x4_DDL]    = x264_predict_4x4_ddl_c;
    pf[I_PRED_4x4_DDR]    = x264_predict_4x4_ddr_c;
    pf[I_PRED_4x4_VR]     = x264_predict_4x4_vr_c;
    pf[I_PRED_4x4_HD]     = x264_predict_4x4_hd_c;
    pf[I_PRED_4x4_VL]     = x264_predict_4x4_vl_c;
    pf[I_PRED_4x4_HU]     = x264_predict_4x4_hu_c;
    pf[I_PRED_4x4_DC_LEFT]= x264_predict_4x4_dc_left_c;
    pf[I_PRED_4x4_DC_TOP] = x264_predict_4x4_dc_top_c;
    pf[I_PRED_4x4_DC_128] = x264_predict_4x4_dc_128_c;

#if HAVE_MMX
    x264_predict_4x4_init_mmx( cpu, pf );
#endif

#if HAVE_ARMV6
    x264_predict_4x4_init_arm( cpu, pf );
#endif
}

x264-snapshot-20120103-2245-stable/common/ppc/0000755000175000001440000000000011700673342017743 5ustar  videolanusersx264-snapshot-20120103-2245-stable/common/ppc/quant.h0000644000175000001440000000332111700673342021243 0ustar  videolanusers/*****************************************************************************
 * quant.h: ppc quantization
 *****************************************************************************
 * Copyright (C) 2007-2011 x264 project
 *
 * Authors: Guillaume Poirier <gpoirier@mplayerhq.hu>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#ifndef X264_PPC_QUANT_H
#define X264_PPC_QUANT_H

int x264_quant_4x4_altivec( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] );
int x264_quant_8x8_altivec( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] );

int x264_quant_4x4_dc_altivec( int16_t dct[16], int mf, int bias );
int x264_quant_2x2_dc_altivec( int16_t dct[4], int mf, int bias );

void x264_dequant_4x4_altivec( int16_t dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_8x8_altivec( int16_t dct[64], int dequant_mf[6][64], int i_qp );
#endif
x264-snapshot-20120103-2245-stable/common/ppc/quant.c0000644000175000001440000003347511700673342021253 0ustar  videolanusers/*****************************************************************************
 * quant.c: ppc quantization
 *****************************************************************************
 * Copyright (C) 2007-2011 x264 project
 *
 * Authors: Guillaume Poirier <gpoirier@mplayerhq.hu>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "common/common.h"
#include "ppccommon.h"
#include "quant.h"

#if !HIGH_BIT_DEPTH
// quant of a whole 4x4 block, unrolled 2x and "pre-scheduled"
#define QUANT_16_U( idx0, idx1 )                                    \
{                                                                   \
    temp1v = vec_ld((idx0), dct);                                   \
    temp2v = vec_ld((idx1), dct);                                   \
    mfvA = vec_ld((idx0), mf);                                      \
    mfvB = vec_ld((idx1), mf);                                      \
    biasvA = vec_ld((idx0), bias);                                  \
    biasvB = vec_ld((idx1), bias);                                  \
    mskA = vec_cmplt(temp1v, zero_s16v);                            \
    mskB = vec_cmplt(temp2v, zero_s16v);                            \
    coefvA = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp1v), temp1v);\
    coefvB = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp2v), temp2v);\
    coefvA = vec_adds(coefvA, biasvA);                              \
    coefvB = vec_adds(coefvB, biasvB);                              \
    multEvenvA = vec_mule(coefvA, mfvA);                            \
    multOddvA = vec_mulo(coefvA, mfvA);                             \
    multEvenvB = vec_mule(coefvB, mfvB);                            \
    multOddvB = vec_mulo(coefvB, mfvB);                             \
    multEvenvA = vec_sr(multEvenvA, i_qbitsv);                      \
    multOddvA = vec_sr(multOddvA, i_qbitsv);                        \
    multEvenvB = vec_sr(multEvenvB, i_qbitsv);                      \
    multOddvB = vec_sr(multOddvB, i_qbitsv);                        \
    temp1v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(multEvenvA, multOddvA)); \
    temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvB, multOddvB), vec_mergel(multEvenvB, multOddvB)); \
    temp1v = vec_xor(temp1v, mskA);                                 \
    temp2v = vec_xor(temp2v, mskB);                                 \
    temp1v = vec_adds(temp1v, vec_and(mskA, one));                  \
    vec_st(temp1v, (idx0), dct);                                    \
    temp2v = vec_adds(temp2v, vec_and(mskB, one));                  \
    nz = vec_or(nz, vec_or(temp1v, temp2v));                        \
    vec_st(temp2v, (idx1), dct);                                    \
}
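
/* For reference, a scalar sketch of what one QUANT_16_U lane computes.
 * This block is an illustrative sketch only (quant_one_scalar is a
 * hypothetical helper, not used by the build) and assumes the same 16-bit
 * requantization shift the callers splat into i_qbitsv. */
#if 0
static int quant_one_scalar( int16_t *coef, uint16_t mf, uint16_t bias )
{
    int a = *coef < 0 ? -*coef : *coef;   /* |coef|, as vec_max(-v, v) above */
    int v = ( a + bias ) * mf >> 16;      /* scale, then drop 16 fractional bits */
    *coef = *coef < 0 ? -v : v;           /* the xor/add pair restores the sign */
    return v;                             /* OR-ing all lanes yields the nz flag */
}
#endif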

int x264_quant_4x4_altivec( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
{
    LOAD_ZERO;
    vector bool short mskA;
    vec_u32_t i_qbitsv;
    vec_u16_t coefvA;
    vec_u32_t multEvenvA, multOddvA;
    vec_u16_t mfvA;
    vec_u16_t biasvA;
    vec_s16_t one = vec_splat_s16(1);
    vec_s16_t nz = zero_s16v;

    vector bool short mskB;
    vec_u16_t coefvB;
    vec_u32_t multEvenvB, multOddvB;
    vec_u16_t mfvB;
    vec_u16_t biasvB;

    vec_s16_t temp1v, temp2v;

    vec_u32_u qbits_u;
    qbits_u.s[0]=16;
    i_qbitsv = vec_splat(qbits_u.v, 0);

    QUANT_16_U( 0, 16 );
    return vec_any_ne(nz, zero_s16v);
}

// DC quant of a whole 4x4 block, unrolled 2x and "pre-scheduled"
#define QUANT_16_U_DC( idx0, idx1 )                                 \
{                                                                   \
    temp1v = vec_ld((idx0), dct);                                   \
    temp2v = vec_ld((idx1), dct);                                   \
    mskA = vec_cmplt(temp1v, zero_s16v);                            \
    mskB = vec_cmplt(temp2v, zero_s16v);                            \
    coefvA = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp1v), temp1v);\
    coefvB = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp2v), temp2v);\
    coefvA = vec_add(coefvA, biasv);                                \
    coefvB = vec_add(coefvB, biasv);                                \
    multEvenvA = vec_mule(coefvA, mfv);                             \
    multOddvA = vec_mulo(coefvA, mfv);                              \
    multEvenvB = vec_mule(coefvB, mfv);                             \
    multOddvB = vec_mulo(coefvB, mfv);                              \
    multEvenvA = vec_sr(multEvenvA, i_qbitsv);                      \
    multOddvA = vec_sr(multOddvA, i_qbitsv);                        \
    multEvenvB = vec_sr(multEvenvB, i_qbitsv);                      \
    multOddvB = vec_sr(multOddvB, i_qbitsv);                        \
    temp1v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(multEvenvA, multOddvA)); \
    temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvB, multOddvB), vec_mergel(multEvenvB, multOddvB)); \
    temp1v = vec_xor(temp1v, mskA);                                 \
    temp2v = vec_xor(temp2v, mskB);                                 \
    temp1v = vec_add(temp1v, vec_and(mskA, one));                   \
    vec_st(temp1v, (idx0), dct);                                    \
    temp2v = vec_add(temp2v, vec_and(mskB, one));                   \
    nz = vec_or(nz, vec_or(temp1v, temp2v));                        \
    vec_st(temp2v, (idx1), dct);                                    \
}

int x264_quant_4x4_dc_altivec( int16_t dct[16], int mf, int bias )
{
    LOAD_ZERO;
    vector bool short mskA;
    vec_u32_t i_qbitsv;
    vec_u16_t coefvA;
    vec_u32_t multEvenvA, multOddvA;
    vec_s16_t one = vec_splat_s16(1);
    vec_s16_t nz = zero_s16v;

    vector bool short mskB;
    vec_u16_t coefvB;
    vec_u32_t multEvenvB, multOddvB;

    vec_s16_t temp1v, temp2v;

    vec_u16_t mfv;
    vec_u16_t biasv;

    vec_u16_u mf_u;
    mf_u.s[0]=mf;
    mfv = vec_splat( mf_u.v, 0 );

    vec_u32_u qbits_u;
    qbits_u.s[0]=16;
    i_qbitsv = vec_splat(qbits_u.v, 0);

    vec_u16_u bias_u;
    bias_u.s[0]=bias;
    biasv = vec_splat(bias_u.v, 0);

    QUANT_16_U_DC( 0, 16 );
    return vec_any_ne(nz, zero_s16v);
}

// DC quant of a whole 2x2 block
#define QUANT_4_U_DC( idx0 )                                        \
{                                                                   \
    const vec_u16_t sel = (vec_u16_t) CV(-1,-1,-1,-1,0,0,0,0);      \
    temp1v = vec_ld((idx0), dct);                                   \
    mskA = vec_cmplt(temp1v, zero_s16v);                            \
    coefvA = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp1v), temp1v);\
    coefvA = vec_add(coefvA, biasv);                                \
    multEvenvA = vec_mule(coefvA, mfv);                             \
    multOddvA = vec_mulo(coefvA, mfv);                              \
    multEvenvA = vec_sr(multEvenvA, i_qbitsv);                      \
    multOddvA = vec_sr(multOddvA, i_qbitsv);                        \
    temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(multEvenvA, multOddvA)); \
    temp2v = vec_xor(temp2v, mskA);                                 \
    temp2v = vec_add(temp2v, vec_and(mskA, one));                   \
    temp1v = vec_sel(temp1v, temp2v, sel);                          \
    nz = vec_or(nz, temp1v);                                        \
    vec_st(temp1v, (idx0), dct);                                    \
}

int x264_quant_2x2_dc_altivec( int16_t dct[4], int mf, int bias )
{
    LOAD_ZERO;
    vector bool short mskA;
    vec_u32_t i_qbitsv;
    vec_u16_t coefvA;
    vec_u32_t multEvenvA, multOddvA;
    vec_s16_t one = vec_splat_s16(1);
    vec_s16_t nz = zero_s16v;

    vec_s16_t temp1v, temp2v;

    vec_u16_t mfv;
    vec_u16_t biasv;

    vec_u16_u mf_u;
    mf_u.s[0]=mf;
    mfv = vec_splat( mf_u.v, 0 );

    vec_u32_u qbits_u;
    qbits_u.s[0]=16;
    i_qbitsv = vec_splat(qbits_u.v, 0);

    vec_u16_u bias_u;
    bias_u.s[0]=bias;
    biasv = vec_splat(bias_u.v, 0);

    static const vec_s16_t mask2 = CV(-1, -1, -1, -1,  0, 0, 0, 0);
    QUANT_4_U_DC(0);
    return vec_any_ne(vec_and(nz, mask2), zero_s16v);
}

int x264_quant_8x8_altivec( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
{
    LOAD_ZERO;
    vector bool short mskA;
    vec_u32_t i_qbitsv;
    vec_u16_t coefvA;
    vec_u32_t multEvenvA, multOddvA;
    vec_u16_t mfvA;
    vec_u16_t biasvA;
    vec_s16_t one = vec_splat_s16(1);
    vec_s16_t nz = zero_s16v;

    vector bool short mskB;
    vec_u16_t coefvB;
    vec_u32_t multEvenvB, multOddvB;
    vec_u16_t mfvB;
    vec_u16_t biasvB;

    vec_s16_t temp1v, temp2v;

    vec_u32_u qbits_u;
    qbits_u.s[0]=16;
    i_qbitsv = vec_splat(qbits_u.v, 0);

    for( int i = 0; i < 4; i++ )
        QUANT_16_U( i*2*16, i*2*16+16 );
    return vec_any_ne(nz, zero_s16v);
}

#define DEQUANT_SHL()                                                \
{                                                                    \
    dctv = vec_ld(8*y, dct);                                         \
    mf1v = vec_ld(16*y, dequant_mf[i_mf]);                           \
    mf2v = vec_ld(16+16*y, dequant_mf[i_mf]);                        \
    mfv  = vec_packs(mf1v, mf2v);                                    \
                                                                     \
    multEvenvA = vec_mule(dctv, mfv);                                \
    multOddvA = vec_mulo(dctv, mfv);                                 \
    dctv = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA),  \
                                 vec_mergel(multEvenvA, multOddvA)); \
    dctv = vec_sl(dctv, i_qbitsv);                                   \
    vec_st(dctv, 8*y, dct);                                          \
}

#define DEQUANT_SHR()                                          \
{                                                              \
    dctv = vec_ld(8*y, dct);                                   \
    dct1v = vec_mergeh(dctv, dctv);                            \
    dct2v = vec_mergel(dctv, dctv);                            \
    mf1v = vec_ld(16*y, dequant_mf[i_mf]);                     \
    mf2v = vec_ld(16+16*y, dequant_mf[i_mf]);                  \
                                                               \
    multEvenvA = vec_mule(dct1v, (vec_s16_t)mf1v);             \
    multOddvA = vec_mulo(dct1v, (vec_s16_t)mf1v);              \
    temp1v = vec_add(vec_sl(multEvenvA, sixteenv), multOddvA); \
    temp1v = vec_add(temp1v, fv);                              \
    temp1v = vec_sra(temp1v, i_qbitsv);                        \
                                                               \
    multEvenvA = vec_mule(dct2v, (vec_s16_t)mf2v);             \
    multOddvA = vec_mulo(dct2v, (vec_s16_t)mf2v);              \
    temp2v = vec_add(vec_sl(multEvenvA, sixteenv), multOddvA); \
    temp2v = vec_add(temp2v, fv);                              \
    temp2v = vec_sra(temp2v, i_qbitsv);                        \
                                                               \
    dctv = (vec_s16_t)vec_packs(temp1v, temp2v);               \
    vec_st(dctv, y*8, dct);                                    \
}
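
/* Scalar sketch of the two dequant paths (illustrative only): when
 * i_qbits >= 0, DEQUANT_SHL computes dct[i] = dct[i] * mf << i_qbits;
 * otherwise DEQUANT_SHR computes dct[i] = ( dct[i] * mf + f ) >> -i_qbits
 * with f = 1 << (-i_qbits-1), in effect rebuilding the full 32-bit product
 * from the 16-bit even/odd halves as (even << 16) + odd before the
 * rounding shift. */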

void x264_dequant_4x4_altivec( int16_t dct[16], int dequant_mf[6][16], int i_qp )
{
    int i_mf = i_qp%6;
    int i_qbits = i_qp/6 - 4;

    vec_s16_t dctv;
    vec_s16_t dct1v, dct2v;
    vec_s32_t mf1v, mf2v;
    vec_s16_t mfv;
    vec_s32_t multEvenvA, multOddvA;
    vec_s32_t temp1v, temp2v;

    if( i_qbits >= 0 )
    {
        vec_u16_t i_qbitsv;
        vec_u16_u qbits_u;
        qbits_u.s[0]=i_qbits;
        i_qbitsv = vec_splat(qbits_u.v, 0);

        for( int y = 0; y < 4; y+=2 )
            DEQUANT_SHL();
    }
    else
    {
        const int f = 1 << (-i_qbits-1);

        vec_s32_t fv;
        vec_u32_u f_u;
        f_u.s[0]=f;
        fv = (vec_s32_t)vec_splat(f_u.v, 0);

        vec_u32_t i_qbitsv;
        vec_u32_u qbits_u;
        qbits_u.s[0]=-i_qbits;
        i_qbitsv = vec_splat(qbits_u.v, 0);

        vec_u32_t sixteenv;
        vec_u32_u sixteen_u;
        sixteen_u.s[0]=16;
        sixteenv = vec_splat(sixteen_u.v, 0);

        for( int y = 0; y < 4; y+=2 )
            DEQUANT_SHR();
    }
}

void x264_dequant_8x8_altivec( int16_t dct[64], int dequant_mf[6][64], int i_qp )
{
    int i_mf = i_qp%6;
    int i_qbits = i_qp/6 - 6;

    vec_s16_t dctv;
    vec_s16_t dct1v, dct2v;
    vec_s32_t mf1v, mf2v;
    vec_s16_t mfv;
    vec_s32_t multEvenvA, multOddvA;
    vec_s32_t temp1v, temp2v;

    if( i_qbits >= 0 )
    {
        vec_u16_t i_qbitsv;
        vec_u16_u qbits_u;
        qbits_u.s[0]=i_qbits;
        i_qbitsv = vec_splat(qbits_u.v, 0);

        for( int y = 0; y < 16; y+=2 )
            DEQUANT_SHL();
    }
    else
    {
        const int f = 1 << (-i_qbits-1);

        vec_s32_t fv;
        vec_u32_u f_u;
        f_u.s[0]=f;
        fv = (vec_s32_t)vec_splat(f_u.v, 0);

        vec_u32_t i_qbitsv;
        vec_u32_u qbits_u;
        qbits_u.s[0]=-i_qbits;
        i_qbitsv = vec_splat(qbits_u.v, 0);

        vec_u32_t sixteenv;
        vec_u32_u sixteen_u;
        sixteen_u.s[0]=16;
        sixteenv = vec_splat(sixteen_u.v, 0);

        for( int y = 0; y < 16; y+=2 )
            DEQUANT_SHR();
    }
}
#endif // !HIGH_BIT_DEPTH

x264-snapshot-20120103-2245-stable/common/ppc/predict.h0000644000175000001440000000263411700673342021553 0ustar  videolanusers/*****************************************************************************
 * predict.h: ppc intra prediction
 *****************************************************************************
 * Copyright (C) 2007-2011 x264 project
 *
 * Authors: Guillaume Poirier <gpoirier@mplayerhq.hu>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#ifndef X264_PPC_PREDICT_H
#define X264_PPC_PREDICT_H

void x264_predict_16x16_init_altivec( x264_predict_t pf[7] );
void x264_predict_8x8c_init_altivec( x264_predict_t pf[7] );

#endif /* X264_PPC_PREDICT_H */
x264-snapshot-20120103-2245-stable/common/ppc/predict.c0000644000175000001440000001555411700673342021553 0ustar  videolanusers/*****************************************************************************
 * predict.c: ppc intra prediction
 *****************************************************************************
 * Copyright (C) 2007-2011 x264 project
 *
 * Authors: Guillaume Poirier <gpoirier@mplayerhq.hu>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "common/common.h"
#include "predict.h"
#include "pixel.h"
#include "ppccommon.h"

#if !HIGH_BIT_DEPTH
static void predict_8x8c_p_altivec( uint8_t *src )
{
    int H = 0, V = 0;

    for( int i = 0; i < 4; i++ )
    {
        H += ( i + 1 ) * ( src[4+i - FDEC_STRIDE] - src[2 - i -FDEC_STRIDE] );
        V += ( i + 1 ) * ( src[-1 +(i+4)*FDEC_STRIDE] - src[-1+(2-i)*FDEC_STRIDE] );
    }

    int a = 16 * ( src[-1+7*FDEC_STRIDE] + src[7 - FDEC_STRIDE] );
    int b = ( 17 * H + 16 ) >> 5;
    int c = ( 17 * V + 16 ) >> 5;
    int i00 = a -3*b -3*c + 16;

    vec_s16_u i00_u, b_u, c_u;
    i00_u.s[0] = i00;
    b_u.s[0]   = b;
    c_u.s[0]   = c;

    vec_u16_t val5_v = vec_splat_u16(5);
    vec_s16_t i00_v, b_v, c_v;
    i00_v = vec_splat(i00_u.v, 0);
    b_v = vec_splat(b_u.v, 0);
    c_v = vec_splat(c_u.v, 0);

    vec_s16_t induc_v  = (vec_s16_t) CV(0, 1, 2, 3, 4, 5, 6, 7);
    vec_s16_t add_i0_b_0v = vec_mladd(induc_v, b_v, i00_v);

    PREP_STORE8;

    for( int i = 0; i < 8; ++i )
    {
        vec_s16_t shift_0_v = vec_sra(add_i0_b_0v, val5_v);
        vec_u8_t com_sat_v = vec_packsu(shift_0_v, shift_0_v);
        VEC_STORE8(com_sat_v, &src[0]);
        src += FDEC_STRIDE;
        add_i0_b_0v = vec_adds(add_i0_b_0v, c_v);
    }
}


/****************************************************************************
 * 16x16 prediction for intra luma block
 ****************************************************************************/

static void predict_16x16_p_altivec( uint8_t *src )
{
    int H = 0, V = 0;

    for( int i = 1; i <= 8; i++ )
    {
        H += i * ( src[7+i - FDEC_STRIDE ]  - src[7-i - FDEC_STRIDE ] );
        V += i * ( src[(7+i)*FDEC_STRIDE -1] - src[(7-i)*FDEC_STRIDE -1] );
    }

    int a = 16 * ( src[15*FDEC_STRIDE -1] + src[15 - FDEC_STRIDE] );
    int b = ( 5 * H + 32 ) >> 6;
    int c = ( 5 * V + 32 ) >> 6;
    int i00 = a - b * 7 - c * 7 + 16;

    vec_s16_u i00_u, b_u, c_u;
    i00_u.s[0] = i00;
    b_u.s[0]   = b;
    c_u.s[0]   = c;

    vec_u16_t val5_v = vec_splat_u16(5);
    vec_s16_t i00_v, b_v, c_v;
    i00_v = vec_splat(i00_u.v, 0);
    b_v = vec_splat(b_u.v, 0);
    c_v = vec_splat(c_u.v, 0);
    vec_s16_t induc_v  = (vec_s16_t) CV(0,  1,  2,  3,  4,  5,  6,  7);
    vec_s16_t b8_v = vec_sl(b_v, vec_splat_u16(3));
    vec_s16_t add_i0_b_0v = vec_mladd(induc_v, b_v, i00_v);
    vec_s16_t add_i0_b_8v = vec_adds(b8_v, add_i0_b_0v);

    for( int y = 0; y < 16; y++ )
    {
        vec_s16_t shift_0_v = vec_sra(add_i0_b_0v, val5_v);
        vec_s16_t shift_8_v = vec_sra(add_i0_b_8v, val5_v);
        vec_u8_t com_sat_v = vec_packsu(shift_0_v, shift_8_v);
        vec_st( com_sat_v, 0, &src[0]);
        src += FDEC_STRIDE;
        add_i0_b_0v = vec_adds(add_i0_b_0v, c_v);
        add_i0_b_8v = vec_adds(add_i0_b_8v, c_v);
    }
}

#define PREDICT_16x16_DC_ALTIVEC(v) \
for( int i = 0; i < 16; i += 2)     \
{                                   \
    vec_st(v, 0, src);              \
    vec_st(v, FDEC_STRIDE, src);    \
    src += FDEC_STRIDE*2;           \
}

static void predict_16x16_dc_altivec( uint8_t *src )
{
    uint32_t dc = 0;

    for( int i = 0; i < 16; i++ )
    {
        dc += src[-1 + i * FDEC_STRIDE];
        dc += src[i - FDEC_STRIDE];
    }
    vec_u8_u v; v.s[0] = ( dc + 16 ) >> 5;
    vec_u8_t bc_v = vec_splat(v.v, 0);

    PREDICT_16x16_DC_ALTIVEC(bc_v);
}

static void predict_16x16_dc_left_altivec( uint8_t *src )
{
    uint32_t dc = 0;

    for( int i = 0; i < 16; i++ )
        dc += src[-1 + i * FDEC_STRIDE];
    vec_u8_u v; v.s[0] = ( dc + 8 ) >> 4;
    vec_u8_t bc_v = vec_splat(v.v, 0);

    PREDICT_16x16_DC_ALTIVEC(bc_v);
}

static void predict_16x16_dc_top_altivec( uint8_t *src )
{
    uint32_t dc = 0;

    for( int i = 0; i < 16; i++ )
        dc += src[i - FDEC_STRIDE];
    vec_u8_u v; v.s[0] = ( dc + 8 ) >> 4;
    vec_u8_t bc_v = vec_splat(v.v, 0);

    PREDICT_16x16_DC_ALTIVEC(bc_v);
}

static void predict_16x16_dc_128_altivec( uint8_t *src )
{
    /* test if generating the constant is faster than loading it.
    vector unsigned int bc_v = (vector unsigned int)CV(0x80808080, 0x80808080, 0x80808080, 0x80808080);
    */
    vec_u8_t bc_v = vec_vslb((vec_u8_t)vec_splat_u8(1),(vec_u8_t)vec_splat_u8(7));
    PREDICT_16x16_DC_ALTIVEC(bc_v);
}

static void predict_16x16_h_altivec( uint8_t *src )
{
    for( int i = 0; i < 16; i++ )
    {
        vec_u8_t v = vec_ld(-1, src);
        vec_u8_t v_v = vec_splat(v, 15);
        vec_st(v_v, 0, src);

        src += FDEC_STRIDE;
    }
}

static void predict_16x16_v_altivec( uint8_t *src )
{
    vec_u32_u v;
    v.s[0] = *(uint32_t*)&src[ 0-FDEC_STRIDE];
    v.s[1] = *(uint32_t*)&src[ 4-FDEC_STRIDE];
    v.s[2] = *(uint32_t*)&src[ 8-FDEC_STRIDE];
    v.s[3] = *(uint32_t*)&src[12-FDEC_STRIDE];

    for( int i = 0; i < 16; i++ )
    {
        vec_st(v.v, 0, (uint32_t*)src);
        src += FDEC_STRIDE;
    }
}
#endif // !HIGH_BIT_DEPTH


/****************************************************************************
 * Exported functions:
 ****************************************************************************/
void x264_predict_16x16_init_altivec( x264_predict_t pf[7] )
{
#if !HIGH_BIT_DEPTH
    pf[I_PRED_16x16_V ]      = predict_16x16_v_altivec;
    pf[I_PRED_16x16_H ]      = predict_16x16_h_altivec;
    pf[I_PRED_16x16_DC]      = predict_16x16_dc_altivec;
    pf[I_PRED_16x16_P ]      = predict_16x16_p_altivec;
    pf[I_PRED_16x16_DC_LEFT] = predict_16x16_dc_left_altivec;
    pf[I_PRED_16x16_DC_TOP ] = predict_16x16_dc_top_altivec;
    pf[I_PRED_16x16_DC_128 ] = predict_16x16_dc_128_altivec;
#endif // !HIGH_BIT_DEPTH
}

void x264_predict_8x8c_init_altivec( x264_predict_t pf[7] )
{
#if !HIGH_BIT_DEPTH
    pf[I_PRED_CHROMA_P]       = predict_8x8c_p_altivec;
#endif // !HIGH_BIT_DEPTH
}
x264-snapshot-20120103-2245-stable/common/ppc/ppccommon.h0000644000175000001440000003131311700673342022110 0ustar  videolanusers/*****************************************************************************
 * ppccommon.h: ppc utility macros
 *****************************************************************************
 * Copyright (C) 2003-2011 x264 project
 *
 * Authors: Eric Petit <eric.petit@lapsus.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#if HAVE_ALTIVEC_H
#include <altivec.h>
#endif

/***********************************************************************
 * For constant vectors, use parentheses on OS X and braces on Linux
 **********************************************************************/
#if defined(__APPLE__) && __GNUC__ < 4
#define CV(a...) (a)
#else
#define CV(a...) {a}
#endif

/***********************************************************************
 * Vector types
 **********************************************************************/
#define vec_u8_t  vector unsigned char
#define vec_s8_t  vector signed char
#define vec_u16_t vector unsigned short
#define vec_s16_t vector signed short
#define vec_u32_t vector unsigned int
#define vec_s32_t vector signed int

typedef union {
  uint32_t s[4];
  vec_u32_t v;
} vec_u32_u;

typedef union {
  uint16_t s[8];
  vec_u16_t v;
} vec_u16_u;

typedef union {
  int16_t s[8];
  vec_s16_t v;
} vec_s16_u;

typedef union {
  uint8_t s[16];
  vec_u8_t v;
} vec_u8_u;

/***********************************************************************
 * Null vector
 **********************************************************************/
#define LOAD_ZERO const vec_u8_t zerov = vec_splat_u8( 0 )

#define zero_u8v  (vec_u8_t)  zerov
#define zero_s8v  (vec_s8_t)  zerov
#define zero_u16v (vec_u16_t) zerov
#define zero_s16v (vec_s16_t) zerov
#define zero_u32v (vec_u32_t) zerov
#define zero_s32v (vec_s32_t) zerov

/***********************************************************************
 * 8 <-> 16 bits conversions
 **********************************************************************/
#define vec_u8_to_u16_h(v) (vec_u16_t) vec_mergeh( zero_u8v, (vec_u8_t) v )
#define vec_u8_to_u16_l(v) (vec_u16_t) vec_mergel( zero_u8v, (vec_u8_t) v )
#define vec_u8_to_s16_h(v) (vec_s16_t) vec_mergeh( zero_u8v, (vec_u8_t) v )
#define vec_u8_to_s16_l(v) (vec_s16_t) vec_mergel( zero_u8v, (vec_u8_t) v )

#define vec_u8_to_u16(v) vec_u8_to_u16_h(v)
#define vec_u8_to_s16(v) vec_u8_to_s16_h(v)

#define vec_u16_to_u8(v) vec_pack( v, zero_u16v )
#define vec_s16_to_u8(v) vec_packsu( v, zero_s16v )


/***********************************************************************
 * 16 <-> 32 bits conversions
 **********************************************************************/
#define vec_u16_to_u32_h(v) (vec_u32_t) vec_mergeh( zero_u16v, (vec_u16_t) v )
#define vec_u16_to_u32_l(v) (vec_u32_t) vec_mergel( zero_u16v, (vec_u16_t) v )
#define vec_u16_to_s32_h(v) (vec_s32_t) vec_mergeh( zero_u16v, (vec_u16_t) v )
#define vec_u16_to_s32_l(v) (vec_s32_t) vec_mergel( zero_u16v, (vec_u16_t) v )

#define vec_u16_to_u32(v) vec_u16_to_u32_h(v)
#define vec_u16_to_s32(v) vec_u16_to_s32_h(v)

#define vec_u32_to_u16(v) vec_pack( v, zero_u32v )
#define vec_s32_to_u16(v) vec_packsu( v, zero_s32v )


/***********************************************************************
 * PREP_LOAD: declares two vectors required to perform unaligned loads
 * VEC_LOAD:   loads n bytes from u8 * p into vector v of type t, using the
 *             permute vector prepared from the original src offset g
 * VEC_LOAD_G: loads n bytes from u8 * p into vector v of type t - use when the offset is not known
 * VEC_LOAD_OFFSET: as above, but with offset vector known in advance
 **********************************************************************/
#define PREP_LOAD     \
    vec_u8_t _hv, _lv

#define PREP_LOAD_SRC( src )              \
    vec_u8_t _##src##_ = vec_lvsl(0, src)

#define VEC_LOAD_G( p, v, n, t )                 \
    _hv = vec_ld( 0, p );                        \
    v   = (t) vec_lvsl( 0, p );                  \
    _lv = vec_ld( n - 1, p );                    \
    v   = (t) vec_perm( _hv, _lv, (vec_u8_t) v )

#define VEC_LOAD( p, v, n, t, g )                   \
    _hv = vec_ld( 0, p );                           \
    _lv = vec_ld( n - 1, p );                       \
    v = (t) vec_perm( _hv, _lv, (vec_u8_t) _##g##_ )

#define VEC_LOAD_OFFSET( p, v, n, t, o )         \
    _hv = vec_ld( 0, p);                         \
    _lv = vec_ld( n - 1, p );                    \
    v   = (t) vec_perm( _hv, _lv, (vec_u8_t) o )

#define VEC_LOAD_PARTIAL( p, v, n, t, g)               \
    _hv = vec_ld( 0, p);                               \
    v   = (t) vec_perm( _hv, _hv, (vec_u8_t) _##g##_ )
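
/* Usage sketch for the unaligned-load helpers ('buf' is a hypothetical
 * uint8_t pointer):
 *
 *     PREP_LOAD;                      // _hv/_lv scratch vectors
 *     PREP_LOAD_SRC( buf );           // permute mask for buf's alignment
 *     vec_u8_t v;
 *     VEC_LOAD( buf, v, 16, vec_u8_t, buf );
 *
 * vec_lvsl() encodes the pointer's misalignment and vec_perm() stitches
 * the two aligned vec_ld() halves back into the intended 16 bytes. */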


/***********************************************************************
 * PREP_STORE##n: declares required vectors to store n bytes to a
 *                potentially unaligned address
 * VEC_STORE##n:  stores n bytes from vector v to address p
 **********************************************************************/
#define PREP_STORE16 \
    vec_u8_t _tmp1v

#define PREP_STORE16_DST( dst )             \
    vec_u8_t _##dst##l_ = vec_lvsl(0, dst); \
    vec_u8_t _##dst##r_ = vec_lvsr(0, dst);

#define VEC_STORE16( v, p, o )                           \
    _hv    = vec_ld( 0, p );                             \
    _lv    = vec_ld( 15, p );                            \
    _tmp1v = vec_perm( _lv, _hv, _##o##l_ );             \
    _lv    = vec_perm( (vec_u8_t) v, _tmp1v, _##o##r_ ); \
    vec_st( _lv, 15, (uint8_t *) p );                    \
    _hv    = vec_perm( _tmp1v, (vec_u8_t) v, _##o##r_ ); \
    vec_st( _hv, 0, (uint8_t *) p )


#define PREP_STORE8 \
    vec_u8_t _tmp3v

#define VEC_STORE8( v, p )                \
    _tmp3v = vec_lvsl(0, p);              \
    v = vec_perm(v, v, _tmp3v);           \
    vec_ste((vec_u32_t)v,0,(uint32_t*)p); \
    vec_ste((vec_u32_t)v,4,(uint32_t*)p)


#define PREP_STORE4                                        \
    PREP_STORE16;                                          \
    vec_u8_t _tmp2v, _tmp3v;                               \
    const vec_u8_t sel =                                   \
        (vec_u8_t) CV(-1,-1,-1,-1,0,0,0,0,0,0,0,0,0,0,0,0)

#define VEC_STORE4( v, p )                      \
    _tmp3v = vec_lvsr( 0, p );                  \
    v      = vec_perm( v, v, _tmp3v );          \
    _lv    = vec_ld( 3, p );                    \
    _tmp1v = vec_perm( sel, zero_u8v, _tmp3v ); \
    _lv    = vec_sel( _lv, v, _tmp1v );         \
    vec_st( _lv, 3, p );                        \
    _hv    = vec_ld( 0, p );                    \
    _tmp2v = vec_perm( zero_u8v, sel, _tmp3v ); \
    _hv    = vec_sel( _hv, v, _tmp2v );         \
    vec_st( _hv, 0, p )

/***********************************************************************
 * VEC_TRANSPOSE_8
 ***********************************************************************
 * Transposes a 8x8 matrix of s16 vectors
 **********************************************************************/
#define VEC_TRANSPOSE_8(a0,a1,a2,a3,a4,a5,a6,a7,b0,b1,b2,b3,b4,b5,b6,b7) \
    b0 = vec_mergeh( a0, a4 ); \
    b1 = vec_mergel( a0, a4 ); \
    b2 = vec_mergeh( a1, a5 ); \
    b3 = vec_mergel( a1, a5 ); \
    b4 = vec_mergeh( a2, a6 ); \
    b5 = vec_mergel( a2, a6 ); \
    b6 = vec_mergeh( a3, a7 ); \
    b7 = vec_mergel( a3, a7 ); \
    a0 = vec_mergeh( b0, b4 ); \
    a1 = vec_mergel( b0, b4 ); \
    a2 = vec_mergeh( b1, b5 ); \
    a3 = vec_mergel( b1, b5 ); \
    a4 = vec_mergeh( b2, b6 ); \
    a5 = vec_mergel( b2, b6 ); \
    a6 = vec_mergeh( b3, b7 ); \
    a7 = vec_mergel( b3, b7 ); \
    b0 = vec_mergeh( a0, a4 ); \
    b1 = vec_mergel( a0, a4 ); \
    b2 = vec_mergeh( a1, a5 ); \
    b3 = vec_mergel( a1, a5 ); \
    b4 = vec_mergeh( a2, a6 ); \
    b5 = vec_mergel( a2, a6 ); \
    b6 = vec_mergeh( a3, a7 ); \
    b7 = vec_mergel( a3, a7 )
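
/* Usage sketch: with rows a0..a7 already loaded as vec_s16_t,
 *     VEC_TRANSPOSE_8( a0,a1,a2,a3,a4,a5,a6,a7, b0,b1,b2,b3,b4,b5,b6,b7 );
 * leaves the transpose in b0..b7; note that a0..a7 are clobbered as
 * scratch by the intermediate merge passes. */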

/***********************************************************************
 * VEC_TRANSPOSE_4
 ***********************************************************************
 * Transposes a 4x4 matrix of s16 vectors.
 * Actually source and destination are 8x4. The low elements of the
 * source are discarded and the low elements of the destination mustn't
 * be used.
 **********************************************************************/
#define VEC_TRANSPOSE_4(a0,a1,a2,a3,b0,b1,b2,b3) \
    b0 = vec_mergeh( a0, a0 ); \
    b1 = vec_mergeh( a1, a0 ); \
    b2 = vec_mergeh( a2, a0 ); \
    b3 = vec_mergeh( a3, a0 ); \
    a0 = vec_mergeh( b0, b2 ); \
    a1 = vec_mergel( b0, b2 ); \
    a2 = vec_mergeh( b1, b3 ); \
    a3 = vec_mergel( b1, b3 ); \
    b0 = vec_mergeh( a0, a2 ); \
    b1 = vec_mergel( a0, a2 ); \
    b2 = vec_mergeh( a1, a3 ); \
    b3 = vec_mergel( a1, a3 )

/***********************************************************************
 * VEC_DIFF_H
 ***********************************************************************
 * p1, p2:    u8 *
 * i1, i2, n: int
 * d:         s16v
 *
 * Loads n bytes from p1 and p2, takes the diff of the high elements into
 * d, and increments p1 and p2 by i1 and i2; g names the permute vector
 * prepared for the original src offset
 **********************************************************************/
#define PREP_DIFF           \
    LOAD_ZERO;              \
    PREP_LOAD;              \
    vec_s16_t pix1v, pix2v;


#define VEC_DIFF_H(p1,i1,p2,i2,n,d,g)               \
    VEC_LOAD_PARTIAL( p1, pix1v, n, vec_s16_t, p1); \
    pix1v = vec_u8_to_s16( pix1v );                 \
    VEC_LOAD( p2, pix2v, n, vec_s16_t, g);          \
    pix2v = vec_u8_to_s16( pix2v );                 \
    d     = vec_sub( pix1v, pix2v );                \
    p1   += i1;                                     \
    p2   += i2
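
/* Stripped of the alignment machinery, VEC_DIFF_H is a widening byte
 * difference: each u8 is zero-extended to s16 before the subtract so the
 * result can go negative.  Scalar sketch (illustrative only; the i1/i2
 * pointer advance happens at the end of the macro as above):
 */
#if 0
static void diff_h_ref( const uint8_t *p1, const uint8_t *p2,
                        int n, int16_t *d )
{
    for( int x = 0; x < n; x++ )
        d[x] = (int16_t)p1[x] - (int16_t)p2[x]; /* widen, then subtract */
}
#endif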

#define VEC_DIFF_H_OFFSET(p1,i1,p2,i2,n,d,g1,g2)    \
    pix1v = (vec_s16_t)vec_perm( vec_ld( 0, p1 ), zero_u8v, _##g1##_ );\
    pix1v = vec_u8_to_s16( pix1v );                 \
    VEC_LOAD( p2, pix2v, n, vec_s16_t, g2);         \
    pix2v = vec_u8_to_s16( pix2v );                 \
    d     = vec_sub( pix1v, pix2v );                \
    p1   += i1;                                     \
    p2   += i2


/***********************************************************************
 * VEC_DIFF_HL
 ***********************************************************************
 * p1, p2: u8 *
 * i1, i2: int
 * dh, dl: s16v
 *
 * Loads 16 bytes from p1 and p2, computes the difference of the high
 * elements into dh and of the low elements into dl, and increments p1
 * and p2 by i1 and i2.
 **********************************************************************/
#define VEC_DIFF_HL(p1,i1,p2,i2,dh,dl)       \
    pix1v = (vec_s16_t)vec_ld(0, p1);        \
    temp0v = vec_u8_to_s16_h( pix1v );       \
    temp1v = vec_u8_to_s16_l( pix1v );       \
    VEC_LOAD( p2, pix2v, 16, vec_s16_t, p2); \
    temp2v = vec_u8_to_s16_h( pix2v );       \
    temp3v = vec_u8_to_s16_l( pix2v );       \
    dh     = vec_sub( temp0v, temp2v );      \
    dl     = vec_sub( temp1v, temp3v );      \
    p1    += i1;                             \
    p2    += i2
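
/* The 16 byte differences of VEC_DIFF_HL do not fit in one s16 vector, so
 * bytes 0..7 are widened into dh and bytes 8..15 into dl.  Scalar sketch
 * of the split (illustrative only, not part of x264):
 */
#if 0
static void diff_hl_ref( int16_t dh[8], int16_t dl[8],
                         const uint8_t *p1, const uint8_t *p2 )
{
    for( int i = 0; i < 8; i++ )
    {
        dh[i] = (int16_t)p1[i]   - (int16_t)p2[i];   /* high half: bytes 0..7  */
        dl[i] = (int16_t)p1[i+8] - (int16_t)p2[i+8]; /* low half:  bytes 8..15 */
    }
}
#endif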

/***********************************************************************
* VEC_DIFF_H_8BYTE_ALIGNED
***********************************************************************
* p1, p2:    u8 *
* i1, i2, n: int
* d:         s16v
*
* Loads n bytes from p1 and p2, computes the difference of the high
* elements into d, and increments p1 and p2 by i1 and i2.
* Slightly faster when we know we are loading/diffing 8 bytes that
* are 8-byte aligned: it avoids two extra loads and two vec_lvsl()s.
**********************************************************************/
#define PREP_DIFF_8BYTEALIGNED \
LOAD_ZERO;                     \
vec_s16_t pix1v, pix2v;        \
vec_u8_t pix1v8, pix2v8;       \
vec_u8_t permPix1, permPix2;   \
permPix1 = vec_lvsl(0, pix1);  \
permPix2 = vec_lvsl(0, pix2);

#define VEC_DIFF_H_8BYTE_ALIGNED(p1,i1,p2,i2,n,d)     \
pix1v8 = vec_perm(vec_ld(0,p1), zero_u8v, permPix1);  \
pix2v8 = vec_perm(vec_ld(0, p2), zero_u8v, permPix2); \
pix1v = vec_u8_to_s16( pix1v8 );                      \
pix2v = vec_u8_to_s16( pix2v8 );                      \
d = vec_sub( pix1v, pix2v);                           \
p1 += i1;                                             \
p2 += i2;
x264-snapshot-20120103-2245-stable/common/ppc/pixel.h
/*****************************************************************************
 * pixel.h: ppc pixel metrics
 *****************************************************************************
 * Copyright (C) 2003-2011 x264 project
 *
 * Authors: Eric Petit <eric.petit@lapsus.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#ifndef X264_PPC_PIXEL_H
#define X264_PPC_PIXEL_H

void x264_pixel_altivec_init( x264_pixel_function_t *pixf );

#endif
x264-snapshot-20120103-2245-stable/common/ppc/pixel.c
/*****************************************************************************
 * pixel.c: ppc pixel metrics
 *****************************************************************************
 * Copyright (C) 2003-2011 x264 project
 *
 * Authors: Eric Petit <eric.petit@lapsus.org>
 *          Guillaume Poirier <gpoirier@mplayerhq.hu>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "common/common.h"
#include "ppccommon.h"
#include "../predict.h"

#if !HIGH_BIT_DEPTH
/***********************************************************************
 * SAD routines
 **********************************************************************/

#define PIXEL_SAD_ALTIVEC( name, lx, ly, a, b )        \
static int name( uint8_t *pix1, int i_pix1,            \
                 uint8_t *pix2, int i_pix2 )           \
{                                                      \
    ALIGNED_16( int sum );                             \
                                                       \
    LOAD_ZERO;                                         \
    PREP_LOAD;                                         \
    vec_u8_t  pix1v, pix2v;                            \
    vec_s32_t sumv = zero_s32v;                        \
    for( int y = 0; y < ly; y++ )                      \
    {                                                  \
        VEC_LOAD_G( pix1, pix1v, lx, vec_u8_t );       \
        VEC_LOAD_G( pix2, pix2v, lx, vec_u8_t );       \
        sumv = (vec_s32_t) vec_sum4s(                  \
                   vec_sub( vec_max( pix1v, pix2v ),   \
                            vec_min( pix1v, pix2v ) ), \
                   (vec_u32_t) sumv );                 \
        pix1 += i_pix1;                                \
        pix2 += i_pix2;                                \
    }                                                  \
    sumv = vec_sum##a( sumv, zero_s32v );              \
    sumv = vec_splat( sumv, b );                       \
    vec_ste( sumv, 0, &sum );                          \
    return sum;                                        \
}

PIXEL_SAD_ALTIVEC( pixel_sad_16x16_altivec, 16, 16, s,  3 )
PIXEL_SAD_ALTIVEC( pixel_sad_8x16_altivec,  8,  16, 2s, 1 )
PIXEL_SAD_ALTIVEC( pixel_sad_16x8_altivec,  16, 8,  s,  3 )
PIXEL_SAD_ALTIVEC( pixel_sad_8x8_altivec,   8,  8,  2s, 1 )
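
/* The kernel above relies on the unsigned identity
 * |a - b| == max(a,b) - min(a,b), which keeps the whole SAD in u8 lanes
 * (no widening before the subtract); vec_sum4s() then folds each row's
 * byte differences into four 32-bit partial sums.  A scalar reference for
 * what each instantiation computes (sketch, not part of x264):
 */
#if 0
static int sad_ref( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2,
                    int lx, int ly )
{
    int sum = 0;
    for( int y = 0; y < ly; y++ )
    {
        for( int x = 0; x < lx; x++ )
        {
            int a = pix1[x], b = pix2[x];
            sum += a > b ? a - b : b - a; /* == max(a,b) - min(a,b) */
        }
        pix1 += i_pix1;
        pix2 += i_pix2;
    }
    return sum;
}
#endif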



/***********************************************************************
 * SATD routines
 **********************************************************************/

/***********************************************************************
 * VEC_HADAMAR
 ***********************************************************************
 * b[0] = a[0] + a[1] + a[2] + a[3]
 * b[1] = a[0] + a[1] - a[2] - a[3]
 * b[2] = a[0] - a[1] - a[2] + a[3]
 * b[3] = a[0] - a[1] + a[2] - a[3]
 **********************************************************************/
#define VEC_HADAMAR(a0,a1,a2,a3,b0,b1,b2,b3) \
    b2 = vec_add( a0, a1 ); \
    b3 = vec_add( a2, a3 ); \
    a0 = vec_sub( a0, a1 ); \
    a2 = vec_sub( a2, a3 ); \
    b0 = vec_add( b2, b3 ); \
    b1 = vec_sub( b2, b3 ); \
    b2 = vec_sub( a0, a2 ); \
    b3 = vec_add( a0, a2 )
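
/* VEC_HADAMAR is a two-stage add/sub butterfly: it produces the four
 * sums/differences listed in the banner with 8 vector ops, one whole row
 * per operand.  One butterfly in scalar form (illustrative sketch; safe
 * to call in place since all reads happen before the writes):
 */
#if 0
static void hadamard4_ref( int16_t b[4], const int16_t a[4] )
{
    int16_t s01 = a[0] + a[1], d01 = a[0] - a[1];
    int16_t s23 = a[2] + a[3], d23 = a[2] - a[3];
    b[0] = s01 + s23; /* a0 + a1 + a2 + a3 */
    b[1] = s01 - s23; /* a0 + a1 - a2 - a3 */
    b[2] = d01 - d23; /* a0 - a1 - a2 + a3 */
    b[3] = d01 + d23; /* a0 - a1 + a2 - a3 */
}
#endif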

/***********************************************************************
 * VEC_ABS
 ***********************************************************************
 * a: s16v
 *
 * a = abs(a)
 *
 * Use vec_sub()/vec_max() instead of vec_abs(): vec_abs() would build
 * its own zero vector with vec_splat(0), and we already have one.
 **********************************************************************/
#define VEC_ABS(a)                            \
    a = vec_max( a, vec_sub( zero_s16v, a ) );

#define VEC_ABSOLUTE(a) (vec_u16_t)vec_max( a, vec_sub( zero_s16v, a ) )

/***********************************************************************
 * VEC_ADD_ABS
 ***********************************************************************
 * a:    s16v
 * b, c: s32v
 *
 * c[i] = abs(a[2*i]) + abs(a[2*i+1]) + b[i]
 **********************************************************************/
#define VEC_ADD_ABS(a,b,c) \
    VEC_ABS( a );          \
    c = vec_sum4s( a, b )

/***********************************************************************
 * SATD 4x4
 **********************************************************************/
static int pixel_satd_4x4_altivec( uint8_t *pix1, int i_pix1,
                                   uint8_t *pix2, int i_pix2 )
{
    ALIGNED_16( int i_satd );

    PREP_DIFF;
    PREP_LOAD_SRC( pix1 );
    vec_s16_t diff0v, diff1v, diff2v, diff3v;
    vec_s16_t temp0v, temp1v, temp2v, temp3v;
    vec_s32_t satdv;

    vec_u8_t _offset1v_ = vec_lvsl(0, pix2);
    vec_u8_t _offset2v_ = vec_lvsl(0, pix2 + i_pix2);


    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v, offset1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v, offset2v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff2v, offset1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff3v, offset2v );

    /* Hadamard, horizontal pass */
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );

    VEC_TRANSPOSE_4( temp0v, temp1v, temp2v, temp3v,
                     diff0v, diff1v, diff2v, diff3v );
    /* Hadamard, vertical pass */
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );

    VEC_ADD_ABS( temp0v, zero_s32v, satdv );
    VEC_ADD_ABS( temp1v, satdv,     satdv );
    VEC_ADD_ABS( temp2v, satdv,     satdv );
    VEC_ADD_ABS( temp3v, satdv,     satdv );

    satdv = vec_sum2s( satdv, zero_s32v );
    satdv = vec_splat( satdv, 1 );
    vec_ste( satdv, 0, &i_satd );

    return i_satd >> 1;
}
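
/* Put together, the SATD is: widen the 4x4 difference block, Hadamard the
 * rows, transpose, Hadamard the columns, sum the absolute transform
 * coefficients, and halve to match x264's SATD scale.  A scalar reference
 * reusing the hadamard4_ref() sketch above (illustrative only):
 */
#if 0
static int satd4x4_ref( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
{
    int16_t d[4][4];
    int sum = 0;
    for( int y = 0; y < 4; y++, pix1 += i_pix1, pix2 += i_pix2 )
        for( int x = 0; x < 4; x++ )
            d[y][x] = pix1[x] - pix2[x];
    for( int y = 0; y < 4; y++ )  /* horizontal pass */
        hadamard4_ref( d[y], d[y] );
    for( int x = 0; x < 4; x++ )  /* vertical pass, column by column */
    {
        int16_t col[4] = { d[0][x], d[1][x], d[2][x], d[3][x] };
        hadamard4_ref( col, col );
        for( int i = 0; i < 4; i++ )
            sum += col[i] < 0 ? -col[i] : col[i];
    }
    return sum >> 1;
}
#endif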

/***********************************************************************
 * SATD 4x8
 **********************************************************************/
static int pixel_satd_4x8_altivec( uint8_t *pix1, int i_pix1,
                                   uint8_t *pix2, int i_pix2 )
{
    ALIGNED_16( int i_satd );

    PREP_DIFF;
    vec_s16_t diff0v, diff1v, diff2v, diff3v;
    vec_s16_t temp0v, temp1v, temp2v, temp3v;
    vec_s32_t satdv;

    PREP_LOAD_SRC( pix1 );
    vec_u8_t _offset1v_ = vec_lvsl(0, pix2);
    vec_u8_t _offset2v_ = vec_lvsl(0, pix2 + i_pix2);

    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v, offset1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v, offset2v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff2v, offset1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff3v, offset2v );
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_TRANSPOSE_4( temp0v, temp1v, temp2v, temp3v,
                     diff0v, diff1v, diff2v, diff3v );
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_ADD_ABS( temp0v, zero_s32v, satdv );
    VEC_ADD_ABS( temp1v, satdv,     satdv );
    VEC_ADD_ABS( temp2v, satdv,     satdv );
    VEC_ADD_ABS( temp3v, satdv,     satdv );

    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v, offset1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v, offset2v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff2v, offset1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff3v, offset2v );
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_TRANSPOSE_4( temp0v, temp1v, temp2v, temp3v,
                     diff0v, diff1v, diff2v, diff3v );
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_ADD_ABS( temp0v, satdv,     satdv );
    VEC_ADD_ABS( temp1v, satdv,     satdv );
    VEC_ADD_ABS( temp2v, satdv,     satdv );
    VEC_ADD_ABS( temp3v, satdv,     satdv );

    satdv = vec_sum2s( satdv, zero_s32v );
    satdv = vec_splat( satdv, 1 );
    vec_ste( satdv, 0, &i_satd );

    return i_satd >> 1;
}

/***********************************************************************
 * SATD 8x4
 **********************************************************************/
static int pixel_satd_8x4_altivec( uint8_t *pix1, int i_pix1,
                                   uint8_t *pix2, int i_pix2 )
{
    ALIGNED_16( int i_satd );

    PREP_DIFF;
    vec_s16_t diff0v, diff1v, diff2v, diff3v,
              diff4v, diff5v, diff6v, diff7v;
    vec_s16_t temp0v, temp1v, temp2v, temp3v,
              temp4v, temp5v, temp6v, temp7v;
    vec_s32_t satdv;


    PREP_LOAD_SRC( pix1 );
    vec_u8_t _offset1v_ = vec_lvsl(0, pix2);
    vec_u8_t _offset2v_ = vec_lvsl(0, pix2 + i_pix2);

    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v, offset1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v, offset2v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v, offset1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v, offset2v );

    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    /* This causes warnings because temp4v...temp7v haven't been set,
       but we don't care */
    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diff0v, diff1v, diff2v, diff3v,
                     diff4v, diff5v, diff6v, diff7v );
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
                 temp4v, temp5v, temp6v, temp7v );

    VEC_ADD_ABS( temp0v, zero_s32v, satdv );
    VEC_ADD_ABS( temp1v, satdv,     satdv );
    VEC_ADD_ABS( temp2v, satdv,     satdv );
    VEC_ADD_ABS( temp3v, satdv,     satdv );
    VEC_ADD_ABS( temp4v, satdv,     satdv );
    VEC_ADD_ABS( temp5v, satdv,     satdv );
    VEC_ADD_ABS( temp6v, satdv,     satdv );
    VEC_ADD_ABS( temp7v, satdv,     satdv );

    satdv = vec_sum2s( satdv, zero_s32v );
    satdv = vec_splat( satdv, 1 );
    vec_ste( satdv, 0, &i_satd );

    return i_satd >> 1;
}

/***********************************************************************
 * SATD 8x8
 **********************************************************************/
static int pixel_satd_8x8_altivec( uint8_t *pix1, int i_pix1,
                                   uint8_t *pix2, int i_pix2 )
{
    ALIGNED_16( int i_satd );

    PREP_DIFF;
    vec_s16_t diff0v, diff1v, diff2v, diff3v,
              diff4v, diff5v, diff6v, diff7v;
    vec_s16_t temp0v, temp1v, temp2v, temp3v,
              temp4v, temp5v, temp6v, temp7v;
    vec_s32_t satdv;

    vec_u8_t _offset1_1v_ = vec_lvsl(0, pix1);
    vec_u8_t _offset1_2v_ = vec_lvsl(0, pix1 + i_pix1);
    vec_u8_t _offset2_1v_ = vec_lvsl(0, pix2);
    vec_u8_t _offset2_2v_ = vec_lvsl(0, pix2 + i_pix2);

    VEC_DIFF_H_OFFSET( pix1, i_pix1, pix2, i_pix2, 8, diff0v, offset1_1v, offset2_1v );
    VEC_DIFF_H_OFFSET( pix1, i_pix1, pix2, i_pix2, 8, diff1v, offset1_2v, offset2_2v );
    VEC_DIFF_H_OFFSET( pix1, i_pix1, pix2, i_pix2, 8, diff2v, offset1_1v, offset2_1v );
    VEC_DIFF_H_OFFSET( pix1, i_pix1, pix2, i_pix2, 8, diff3v, offset1_2v, offset2_2v );
    VEC_DIFF_H_OFFSET( pix1, i_pix1, pix2, i_pix2, 8, diff4v, offset1_1v, offset2_1v );
    VEC_DIFF_H_OFFSET( pix1, i_pix1, pix2, i_pix2, 8, diff5v, offset1_2v, offset2_2v );
    VEC_DIFF_H_OFFSET( pix1, i_pix1, pix2, i_pix2, 8, diff6v, offset1_1v, offset2_1v );
    VEC_DIFF_H_OFFSET( pix1, i_pix1, pix2, i_pix2, 8, diff7v, offset1_2v, offset2_2v );

    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
                 temp4v, temp5v, temp6v, temp7v );

    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diff0v, diff1v, diff2v, diff3v,
                     diff4v, diff5v, diff6v, diff7v );

    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
                 temp4v, temp5v, temp6v, temp7v );

    VEC_ADD_ABS( temp0v, zero_s32v, satdv );
    VEC_ADD_ABS( temp1v, satdv,     satdv );
    VEC_ADD_ABS( temp2v, satdv,     satdv );
    VEC_ADD_ABS( temp3v, satdv,     satdv );
    VEC_ADD_ABS( temp4v, satdv,     satdv );
    VEC_ADD_ABS( temp5v, satdv,     satdv );
    VEC_ADD_ABS( temp6v, satdv,     satdv );
    VEC_ADD_ABS( temp7v, satdv,     satdv );

    satdv = vec_sums( satdv, zero_s32v );
    satdv = vec_splat( satdv, 3 );
    vec_ste( satdv, 0, &i_satd );

    return i_satd >> 1;
}
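
/* Why vec_sums() here but vec_sum2s() in the 4-wide SATDs: VEC_ADD_ABS
 * leaves four 32-bit partial sums per vector.  The 4-wide variants carry
 * data only in the first four s16 lanes, so just partials 0 and 1 are
 * live and vec_sum2s()/vec_splat(,1) suffices; the 8-wide variants
 * populate all four, hence vec_sums()/vec_splat(,3).  The scalar
 * equivalent of either reduction (sketch; n_live is 2 or 4 accordingly):
 */
#if 0
static int reduce_partials_ref( const int32_t part[4], int n_live )
{
    int sum = 0;
    for( int i = 0; i < n_live; i++ )
        sum += part[i];
    return sum;
}
#endif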

/***********************************************************************
 * SATD 8x16
 **********************************************************************/
static int pixel_satd_8x16_altivec( uint8_t *pix1, int i_pix1,
                                    uint8_t *pix2, int i_pix2 )
{
    ALIGNED_16( int i_satd );

    PREP_DIFF;
    vec_s16_t diff0v, diff1v, diff2v, diff3v,
              diff4v, diff5v, diff6v, diff7v;
    vec_s16_t temp0v, temp1v, temp2v, temp3v,
              temp4v, temp5v, temp6v, temp7v;
    vec_s32_t satdv;

    PREP_LOAD_SRC( pix1 );
    vec_u8_t _offset1v_ = vec_lvsl(0, pix2);
    vec_u8_t _offset2v_ = vec_lvsl(0, pix2 + i_pix2);

    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v, offset1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v, offset2v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v, offset1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v, offset2v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v, offset1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v, offset2v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v, offset1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v, offset2v );
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diff0v, diff1v, diff2v, diff3v,
                     diff4v, diff5v, diff6v, diff7v );
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_ADD_ABS( temp0v, zero_s32v, satdv );
    VEC_ADD_ABS( temp1v, satdv,     satdv );
    VEC_ADD_ABS( temp2v, satdv,     satdv );
    VEC_ADD_ABS( temp3v, satdv,     satdv );
    VEC_ADD_ABS( temp4v, satdv,     satdv );
    VEC_ADD_ABS( temp5v, satdv,     satdv );
    VEC_ADD_ABS( temp6v, satdv,     satdv );
    VEC_ADD_ABS( temp7v, satdv,     satdv );

    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v, offset1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v, offset2v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v, offset1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v, offset2v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v, offset1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v, offset2v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v, offset1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v, offset2v );
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diff0v, diff1v, diff2v, diff3v,
                     diff4v, diff5v, diff6v, diff7v );
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_ADD_ABS( temp0v, satdv,     satdv );
    VEC_ADD_ABS( temp1v, satdv,     satdv );
    VEC_ADD_ABS( temp2v, satdv,     satdv );
    VEC_ADD_ABS( temp3v, satdv,     satdv );
    VEC_ADD_ABS( temp4v, satdv,     satdv );
    VEC_ADD_ABS( temp5v, satdv,     satdv );
    VEC_ADD_ABS( temp6v, satdv,     satdv );
    VEC_ADD_ABS( temp7v, satdv,     satdv );

    satdv = vec_sums( satdv, zero_s32v );
    satdv = vec_splat( satdv, 3 );
    vec_ste( satdv, 0, &i_satd );

    return i_satd >> 1;
}

/***********************************************************************
 * SATD 16x8
 **********************************************************************/
static int pixel_satd_16x8_altivec( uint8_t *pix1, int i_pix1,
                                    uint8_t *pix2, int i_pix2 )
{
    ALIGNED_16( int i_satd );

    LOAD_ZERO;
    PREP_LOAD;
    PREP_LOAD_SRC( pix2 );
    vec_s32_t satdv;
    vec_s16_t pix1v, pix2v;
    vec_s16_t diffh0v, diffh1v, diffh2v, diffh3v,
              diffh4v, diffh5v, diffh6v, diffh7v;
    vec_s16_t diffl0v, diffl1v, diffl2v, diffl3v,
              diffl4v, diffl5v, diffl6v, diffl7v;
    vec_s16_t temp0v, temp1v, temp2v, temp3v,
              temp4v, temp5v, temp6v, temp7v;

    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh0v, diffl0v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh1v, diffl1v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh2v, diffl2v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh3v, diffl3v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh4v, diffl4v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh5v, diffl5v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh6v, diffl6v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh7v, diffl7v );

    VEC_HADAMAR( diffh0v, diffh1v, diffh2v, diffh3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffh4v, diffh5v, diffh6v, diffh7v,
                 temp4v, temp5v, temp6v, temp7v );

    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diffh0v, diffh1v, diffh2v, diffh3v,
                     diffh4v, diffh5v, diffh6v, diffh7v );

    VEC_HADAMAR( diffh0v, diffh1v, diffh2v, diffh3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffh4v, diffh5v, diffh6v, diffh7v,
                 temp4v, temp5v, temp6v, temp7v );

    VEC_ADD_ABS( temp0v, zero_s32v, satdv );
    VEC_ADD_ABS( temp1v, satdv,     satdv );
    VEC_ADD_ABS( temp2v, satdv,     satdv );
    VEC_ADD_ABS( temp3v, satdv,     satdv );
    VEC_ADD_ABS( temp4v, satdv,     satdv );
    VEC_ADD_ABS( temp5v, satdv,     satdv );
    VEC_ADD_ABS( temp6v, satdv,     satdv );
    VEC_ADD_ABS( temp7v, satdv,     satdv );

    VEC_HADAMAR( diffl0v, diffl1v, diffl2v, diffl3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffl4v, diffl5v, diffl6v, diffl7v,
                 temp4v, temp5v, temp6v, temp7v );

    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diffl0v, diffl1v, diffl2v, diffl3v,
                     diffl4v, diffl5v, diffl6v, diffl7v );

    VEC_HADAMAR( diffl0v, diffl1v, diffl2v, diffl3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffl4v, diffl5v, diffl6v, diffl7v,
                 temp4v, temp5v, temp6v, temp7v );

    VEC_ADD_ABS( temp0v, satdv,     satdv );
    VEC_ADD_ABS( temp1v, satdv,     satdv );
    VEC_ADD_ABS( temp2v, satdv,     satdv );
    VEC_ADD_ABS( temp3v, satdv,     satdv );
    VEC_ADD_ABS( temp4v, satdv,     satdv );
    VEC_ADD_ABS( temp5v, satdv,     satdv );
    VEC_ADD_ABS( temp6v, satdv,     satdv );
    VEC_ADD_ABS( temp7v, satdv,     satdv );

    satdv = vec_sums( satdv, zero_s32v );
    satdv = vec_splat( satdv, 3 );
    vec_ste( satdv, 0, &i_satd );

    return i_satd >> 1;
}

/***********************************************************************
 * SATD 16x16
 **********************************************************************/
static int pixel_satd_16x16_altivec( uint8_t *pix1, int i_pix1,
                                     uint8_t *pix2, int i_pix2 )
{
    ALIGNED_16( int i_satd );

    LOAD_ZERO;
    PREP_LOAD;
    vec_s32_t satdv;
    vec_s16_t pix1v, pix2v;
    vec_s16_t diffh0v, diffh1v, diffh2v, diffh3v,
              diffh4v, diffh5v, diffh6v, diffh7v;
    vec_s16_t diffl0v, diffl1v, diffl2v, diffl3v,
              diffl4v, diffl5v, diffl6v, diffl7v;
    vec_s16_t temp0v, temp1v, temp2v, temp3v,
              temp4v, temp5v, temp6v, temp7v;
    PREP_LOAD_SRC( pix2 );


    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh0v, diffl0v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh1v, diffl1v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh2v, diffl2v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh3v, diffl3v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh4v, diffl4v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh5v, diffl5v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh6v, diffl6v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh7v, diffl7v );
    VEC_HADAMAR( diffh0v, diffh1v, diffh2v, diffh3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffh4v, diffh5v, diffh6v, diffh7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diffh0v, diffh1v, diffh2v, diffh3v,
                     diffh4v, diffh5v, diffh6v, diffh7v );
    VEC_HADAMAR( diffh0v, diffh1v, diffh2v, diffh3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffh4v, diffh5v, diffh6v, diffh7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_ADD_ABS( temp0v, zero_s32v, satdv );
    VEC_ADD_ABS( temp1v, satdv,     satdv );
    VEC_ADD_ABS( temp2v, satdv,     satdv );
    VEC_ADD_ABS( temp3v, satdv,     satdv );
    VEC_ADD_ABS( temp4v, satdv,     satdv );
    VEC_ADD_ABS( temp5v, satdv,     satdv );
    VEC_ADD_ABS( temp6v, satdv,     satdv );
    VEC_ADD_ABS( temp7v, satdv,     satdv );
    VEC_HADAMAR( diffl0v, diffl1v, diffl2v, diffl3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffl4v, diffl5v, diffl6v, diffl7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diffl0v, diffl1v, diffl2v, diffl3v,
                     diffl4v, diffl5v, diffl6v, diffl7v );
    VEC_HADAMAR( diffl0v, diffl1v, diffl2v, diffl3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffl4v, diffl5v, diffl6v, diffl7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_ADD_ABS( temp0v, satdv,     satdv );
    VEC_ADD_ABS( temp1v, satdv,     satdv );
    VEC_ADD_ABS( temp2v, satdv,     satdv );
    VEC_ADD_ABS( temp3v, satdv,     satdv );
    VEC_ADD_ABS( temp4v, satdv,     satdv );
    VEC_ADD_ABS( temp5v, satdv,     satdv );
    VEC_ADD_ABS( temp6v, satdv,     satdv );
    VEC_ADD_ABS( temp7v, satdv,     satdv );

    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh0v, diffl0v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh1v, diffl1v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh2v, diffl2v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh3v, diffl3v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh4v, diffl4v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh5v, diffl5v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh6v, diffl6v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh7v, diffl7v );
    VEC_HADAMAR( diffh0v, diffh1v, diffh2v, diffh3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffh4v, diffh5v, diffh6v, diffh7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diffh0v, diffh1v, diffh2v, diffh3v,
                     diffh4v, diffh5v, diffh6v, diffh7v );
    VEC_HADAMAR( diffh0v, diffh1v, diffh2v, diffh3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffh4v, diffh5v, diffh6v, diffh7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_ADD_ABS( temp0v, satdv,     satdv );
    VEC_ADD_ABS( temp1v, satdv,     satdv );
    VEC_ADD_ABS( temp2v, satdv,     satdv );
    VEC_ADD_ABS( temp3v, satdv,     satdv );
    VEC_ADD_ABS( temp4v, satdv,     satdv );
    VEC_ADD_ABS( temp5v, satdv,     satdv );
    VEC_ADD_ABS( temp6v, satdv,     satdv );
    VEC_ADD_ABS( temp7v, satdv,     satdv );
    VEC_HADAMAR( diffl0v, diffl1v, diffl2v, diffl3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffl4v, diffl5v, diffl6v, diffl7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diffl0v, diffl1v, diffl2v, diffl3v,
                     diffl4v, diffl5v, diffl6v, diffl7v );
    VEC_HADAMAR( diffl0v, diffl1v, diffl2v, diffl3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffl4v, diffl5v, diffl6v, diffl7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_ADD_ABS( temp0v, satdv,     satdv );
    VEC_ADD_ABS( temp1v, satdv,     satdv );
    VEC_ADD_ABS( temp2v, satdv,     satdv );
    VEC_ADD_ABS( temp3v, satdv,     satdv );
    VEC_ADD_ABS( temp4v, satdv,     satdv );
    VEC_ADD_ABS( temp5v, satdv,     satdv );
    VEC_ADD_ABS( temp6v, satdv,     satdv );
    VEC_ADD_ABS( temp7v, satdv,     satdv );

    satdv = vec_sums( satdv, zero_s32v );
    satdv = vec_splat( satdv, 3 );
    vec_ste( satdv, 0, &i_satd );

    return i_satd >> 1;
}



/***********************************************************************
* Interleaved SAD routines
**********************************************************************/

static void pixel_sad_x4_16x16_altivec( uint8_t *fenc,
                                        uint8_t *pix0, uint8_t *pix1,
                                        uint8_t *pix2, uint8_t *pix3,
                                        int i_stride, int scores[4] )
{
    ALIGNED_16( int sum0 );
    ALIGNED_16( int sum1 );
    ALIGNED_16( int sum2 );
    ALIGNED_16( int sum3 );

    LOAD_ZERO;
    vec_u8_t temp_lv, temp_hv;
    vec_u8_t fencv, pix0v, pix1v, pix2v, pix3v;
    vec_u8_t perm0vA, perm1vA, perm2vA, perm3vA, perm0vB, perm1vB, perm2vB, perm3vB;

    vec_s32_t sum0v, sum1v, sum2v, sum3v;

    sum0v = vec_splat_s32(0);
    sum1v = vec_splat_s32(0);
    sum2v = vec_splat_s32(0);
    sum3v = vec_splat_s32(0);

    perm0vA = vec_lvsl(0, pix0);
    perm1vA = vec_lvsl(0, pix1);
    perm2vA = vec_lvsl(0, pix2);
    perm3vA = vec_lvsl(0, pix3);

    perm0vB = vec_lvsl(0, pix0 + i_stride);
    perm1vB = vec_lvsl(0, pix1 + i_stride);
    perm2vB = vec_lvsl(0, pix2 + i_stride);
    perm3vB = vec_lvsl(0, pix3 + i_stride);

    for( int y = 0; y < 8; y++ )
    {
        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
        pix1 += i_stride;

        fencv = vec_ld(0, fenc);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
        pix2 += i_stride;

        temp_lv = vec_ld(0, pix3);
        temp_hv = vec_ld(16, pix3);
        pix3v = vec_perm(temp_lv, temp_hv, perm3vA);
        pix3 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
        sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );

        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
        pix1 += i_stride;

        fencv = vec_ld(0, fenc);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
        pix2 += i_stride;

        temp_lv = vec_ld(0, pix3);
        temp_hv = vec_ld(16, pix3);
        pix3v = vec_perm(temp_lv, temp_hv, perm3vB);
        pix3 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
        sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
    }

    sum0v = vec_sums( sum0v, zero_s32v );
    sum1v = vec_sums( sum1v, zero_s32v );
    sum2v = vec_sums( sum2v, zero_s32v );
    sum3v = vec_sums( sum3v, zero_s32v );

    sum0v = vec_splat( sum0v, 3 );
    sum1v = vec_splat( sum1v, 3 );
    sum2v = vec_splat( sum2v, 3 );
    sum3v = vec_splat( sum3v, 3 );

    vec_ste( sum0v, 0, &sum0);
    vec_ste( sum1v, 0, &sum1);
    vec_ste( sum2v, 0, &sum2);
    vec_ste( sum3v, 0, &sum3);

    scores[0] = sum0;
    scores[1] = sum1;
    scores[2] = sum2;
    scores[3] = sum3;
}
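
/* The x4 interleaving amortizes work across candidates: each fenc row is
 * loaded once and SAD'd against four reference rows, and separate A/B
 * permute vectors cover even and odd rows, whose misalignment differs
 * whenever i_stride is not a multiple of 16.  The call shape, in terms of
 * the sad_ref() sketch above (illustrative only):
 */
#if 0
static void sad_x4_16x16_ref( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
                              uint8_t *pix2, uint8_t *pix3,
                              int i_stride, int scores[4] )
{
    scores[0] = sad_ref( fenc, FENC_STRIDE, pix0, i_stride, 16, 16 );
    scores[1] = sad_ref( fenc, FENC_STRIDE, pix1, i_stride, 16, 16 );
    scores[2] = sad_ref( fenc, FENC_STRIDE, pix2, i_stride, 16, 16 );
    scores[3] = sad_ref( fenc, FENC_STRIDE, pix3, i_stride, 16, 16 );
}
#endif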

static void pixel_sad_x3_16x16_altivec( uint8_t *fenc, uint8_t *pix0,
                                        uint8_t *pix1, uint8_t *pix2,
                                        int i_stride, int scores[3] )
{
    ALIGNED_16( int sum0 );
    ALIGNED_16( int sum1 );
    ALIGNED_16( int sum2 );

    LOAD_ZERO;
    vec_u8_t temp_lv, temp_hv; // temporary load vectors
    vec_u8_t fencv, pix0v, pix1v, pix2v;
    vec_u8_t perm0vA, perm1vA, perm2vA, perm0vB, perm1vB, perm2vB;

    vec_s32_t sum0v, sum1v, sum2v;

    sum0v = vec_splat_s32(0);
    sum1v = vec_splat_s32(0);
    sum2v = vec_splat_s32(0);

    perm0vA = vec_lvsl(0, pix0);
    perm1vA = vec_lvsl(0, pix1);
    perm2vA = vec_lvsl(0, pix2);

    perm0vB = vec_lvsl(0, pix0 + i_stride);
    perm1vB = vec_lvsl(0, pix1 + i_stride);
    perm2vB = vec_lvsl(0, pix2 + i_stride);

    for( int y = 0; y < 8; y++ )
    {
        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
        pix1 += i_stride;

        fencv = vec_ld(0, fenc);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
        pix2 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );

        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
        pix0 += i_stride;


        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
        pix1 += i_stride;

        fencv = vec_ld(0, fenc);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
        pix2 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
    }

    sum0v = vec_sums( sum0v, zero_s32v );
    sum1v = vec_sums( sum1v, zero_s32v );
    sum2v = vec_sums( sum2v, zero_s32v );

    sum0v = vec_splat( sum0v, 3 );
    sum1v = vec_splat( sum1v, 3 );
    sum2v = vec_splat( sum2v, 3 );

    vec_ste( sum0v, 0, &sum0);
    vec_ste( sum1v, 0, &sum1);
    vec_ste( sum2v, 0, &sum2);

    scores[0] = sum0;
    scores[1] = sum1;
    scores[2] = sum2;
}

static void pixel_sad_x4_16x8_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, uint8_t *pix3, int i_stride, int scores[4] )
{
    ALIGNED_16( int sum0 );
    ALIGNED_16( int sum1 );
    ALIGNED_16( int sum2 );
    ALIGNED_16( int sum3 );

    LOAD_ZERO;
    vec_u8_t temp_lv, temp_hv;
    vec_u8_t fencv, pix0v, pix1v, pix2v, pix3v;
    vec_u8_t perm0vA, perm1vA, perm2vA, perm3vA, perm0vB, perm1vB, perm2vB, perm3vB;

    vec_s32_t sum0v, sum1v, sum2v, sum3v;

    sum0v = vec_splat_s32(0);
    sum1v = vec_splat_s32(0);
    sum2v = vec_splat_s32(0);
    sum3v = vec_splat_s32(0);

    perm0vA = vec_lvsl(0, pix0);
    perm1vA = vec_lvsl(0, pix1);
    perm2vA = vec_lvsl(0, pix2);
    perm3vA = vec_lvsl(0, pix3);

    perm0vB = vec_lvsl(0, pix0 + i_stride);
    perm1vB = vec_lvsl(0, pix1 + i_stride);
    perm2vB = vec_lvsl(0, pix2 + i_stride);
    perm3vB = vec_lvsl(0, pix3 + i_stride);

    for( int y = 0; y < 4; y++ )
    {
        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
        pix1 += i_stride;

        fencv = vec_ld(0, fenc);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
        pix2 += i_stride;

        temp_lv = vec_ld(0, pix3);
        temp_hv = vec_ld(16, pix3);
        pix3v = vec_perm(temp_lv, temp_hv, perm3vA);
        pix3 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
        sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );

        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
        pix1 += i_stride;

        fencv = vec_ld(0, fenc);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
        pix2 += i_stride;

        temp_lv = vec_ld(0, pix3);
        temp_hv = vec_ld(16, pix3);
        pix3v = vec_perm(temp_lv, temp_hv, perm3vB);
        pix3 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
        sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
    }

    sum0v = vec_sums( sum0v, zero_s32v );
    sum1v = vec_sums( sum1v, zero_s32v );
    sum2v = vec_sums( sum2v, zero_s32v );
    sum3v = vec_sums( sum3v, zero_s32v );

    sum0v = vec_splat( sum0v, 3 );
    sum1v = vec_splat( sum1v, 3 );
    sum2v = vec_splat( sum2v, 3 );
    sum3v = vec_splat( sum3v, 3 );

    vec_ste( sum0v, 0, &sum0);
    vec_ste( sum1v, 0, &sum1);
    vec_ste( sum2v, 0, &sum2);
    vec_ste( sum3v, 0, &sum3);

    scores[0] = sum0;
    scores[1] = sum1;
    scores[2] = sum2;
    scores[3] = sum3;
}

static void pixel_sad_x3_16x8_altivec( uint8_t *fenc, uint8_t *pix0,
                                       uint8_t *pix1, uint8_t *pix2,
                                       int i_stride, int scores[3] )
{
    ALIGNED_16( int sum0 );
    ALIGNED_16( int sum1 );
    ALIGNED_16( int sum2 );

    LOAD_ZERO;
    vec_u8_t temp_lv, temp_hv;
    vec_u8_t fencv, pix0v, pix1v, pix2v;
    vec_u8_t perm0vA, perm1vA, perm2vA, perm0vB, perm1vB, perm2vB;

    vec_s32_t sum0v, sum1v, sum2v;

    sum0v = vec_splat_s32(0);
    sum1v = vec_splat_s32(0);
    sum2v = vec_splat_s32(0);

    perm0vA = vec_lvsl(0, pix0);
    perm1vA = vec_lvsl(0, pix1);
    perm2vA = vec_lvsl(0, pix2);

    perm0vB = vec_lvsl(0, pix0 + i_stride);
    perm1vB = vec_lvsl(0, pix1 + i_stride);
    perm2vB = vec_lvsl(0, pix2 + i_stride);

    for( int y = 0; y < 4; y++ )
    {
        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
        pix1 += i_stride;

        fencv = vec_ld(0, fenc);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
        pix2 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );

        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
        pix1 += i_stride;

        fencv = vec_ld(0, fenc);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
        pix2 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
    }

    sum0v = vec_sums( sum0v, zero_s32v );
    sum1v = vec_sums( sum1v, zero_s32v );
    sum2v = vec_sums( sum2v, zero_s32v );

    sum0v = vec_splat( sum0v, 3 );
    sum1v = vec_splat( sum1v, 3 );
    sum2v = vec_splat( sum2v, 3 );

    vec_ste( sum0v, 0, &sum0);
    vec_ste( sum1v, 0, &sum1);
    vec_ste( sum2v, 0, &sum2);

    scores[0] = sum0;
    scores[1] = sum1;
    scores[2] = sum2;
}


static void pixel_sad_x4_8x16_altivec( uint8_t *fenc,
                                       uint8_t *pix0, uint8_t *pix1,
                                       uint8_t *pix2, uint8_t *pix3,
                                       int i_stride, int scores[4] )
{
    ALIGNED_16( int sum0 );
    ALIGNED_16( int sum1 );
    ALIGNED_16( int sum2 );
    ALIGNED_16( int sum3 );

    LOAD_ZERO;
    vec_u8_t temp_lv, temp_hv;
    vec_u8_t fencv, pix0v, pix1v, pix2v, pix3v;
    vec_u8_t perm0vA, perm1vA, perm2vA, perm3vA, perm0vB, perm1vB, perm2vB, perm3vB, permEncv;

    vec_s32_t sum0v, sum1v, sum2v, sum3v;

    sum0v = vec_splat_s32(0);
    sum1v = vec_splat_s32(0);
    sum2v = vec_splat_s32(0);
    sum3v = vec_splat_s32(0);

    permEncv = vec_lvsl(0, fenc);
    perm0vA = vec_lvsl(0, pix0);
    perm1vA = vec_lvsl(0, pix1);
    perm2vA = vec_lvsl(0, pix2);
    perm3vA = vec_lvsl(0, pix3);

    perm0vB = vec_lvsl(0, pix0 + i_stride);
    perm1vB = vec_lvsl(0, pix1 + i_stride);
    perm2vB = vec_lvsl(0, pix2 + i_stride);
    perm3vB = vec_lvsl(0, pix3 + i_stride);

    for( int y = 0; y < 8; y++ )
    {
        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
        pix1 += i_stride;

        temp_lv = vec_ld(0, fenc);
        /* fenc is 16-byte aligned, so permEncv selects bytes from temp_lv
           only; the stale temp_hv operand is never actually read */
        fencv = vec_perm(temp_lv, temp_hv, permEncv);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
        pix2 += i_stride;

        temp_lv = vec_ld(0, pix3);
        temp_hv = vec_ld(16, pix3);
        pix3v = vec_perm(temp_lv, temp_hv, perm3vA);
        pix3 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
        sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );

        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
        pix1 += i_stride;

        temp_lv = vec_ld(0, fenc);
        fencv = vec_perm(temp_lv, temp_hv, permEncv);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
        pix2 += i_stride;

        temp_lv = vec_ld(0, pix3);
        temp_hv = vec_ld(16, pix3);
        pix3v = vec_perm(temp_lv, temp_hv, perm3vB);
        pix3 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
        sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
    }

    sum0v = vec_sum2s( sum0v, zero_s32v );
    sum1v = vec_sum2s( sum1v, zero_s32v );
    sum2v = vec_sum2s( sum2v, zero_s32v );
    sum3v = vec_sum2s( sum3v, zero_s32v );

    sum0v = vec_splat( sum0v, 1 );
    sum1v = vec_splat( sum1v, 1 );
    sum2v = vec_splat( sum2v, 1 );
    sum3v = vec_splat( sum3v, 1 );

    vec_ste( sum0v, 0, &sum0);
    vec_ste( sum1v, 0, &sum1);
    vec_ste( sum2v, 0, &sum2);
    vec_ste( sum3v, 0, &sum3);

    scores[0] = sum0;
    scores[1] = sum1;
    scores[2] = sum2;
    scores[3] = sum3;
}

static void pixel_sad_x3_8x16_altivec( uint8_t *fenc, uint8_t *pix0,
                                       uint8_t *pix1, uint8_t *pix2,
                                       int i_stride, int scores[3] )
{
    ALIGNED_16( int sum0 );
    ALIGNED_16( int sum1 );
    ALIGNED_16( int sum2 );

    LOAD_ZERO;
    vec_u8_t temp_lv, temp_hv;
    vec_u8_t fencv, pix0v, pix1v, pix2v;
    vec_u8_t perm0vA, perm1vA, perm2vA, perm0vB, perm1vB, perm2vB, permEncv;

    vec_s32_t sum0v, sum1v, sum2v;

    sum0v = vec_splat_s32(0);
    sum1v = vec_splat_s32(0);
    sum2v = vec_splat_s32(0);

    permEncv = vec_lvsl(0, fenc);
    perm0vA = vec_lvsl(0, pix0);
    perm1vA = vec_lvsl(0, pix1);
    perm2vA = vec_lvsl(0, pix2);

    perm0vB = vec_lvsl(0, pix0 + i_stride);
    perm1vB = vec_lvsl(0, pix1 + i_stride);
    perm2vB = vec_lvsl(0, pix2 + i_stride);

    for( int y = 0; y < 8; y++ )
    {
        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
        pix1 += i_stride;

        temp_lv = vec_ld(0, fenc);
        fencv = vec_perm(temp_lv, temp_hv, permEncv);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
        pix2 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );

        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
        pix1 += i_stride;

        temp_lv = vec_ld(0, fenc);
        fencv = vec_perm(temp_lv, temp_hv, permEncv);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
        pix2 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
    }

    sum0v = vec_sum2s( sum0v, zero_s32v );
    sum1v = vec_sum2s( sum1v, zero_s32v );
    sum2v = vec_sum2s( sum2v, zero_s32v );

    sum0v = vec_splat( sum0v, 1 );
    sum1v = vec_splat( sum1v, 1 );
    sum2v = vec_splat( sum2v, 1 );

    vec_ste( sum0v, 0, &sum0);
    vec_ste( sum1v, 0, &sum1);
    vec_ste( sum2v, 0, &sum2);

    scores[0] = sum0;
    scores[1] = sum1;
    scores[2] = sum2;
}

static void pixel_sad_x4_8x8_altivec( uint8_t *fenc,
                                      uint8_t *pix0, uint8_t *pix1,
                                      uint8_t *pix2, uint8_t *pix3,
                                      int i_stride, int scores[4] )
{
    ALIGNED_16( int sum0 );
    ALIGNED_16( int sum1 );
    ALIGNED_16( int sum2 );
    ALIGNED_16( int sum3 );

    LOAD_ZERO;
    vec_u8_t temp_lv, temp_hv;
    vec_u8_t fencv, pix0v, pix1v, pix2v, pix3v;
    vec_u8_t perm0vA, perm1vA, perm2vA, perm3vA, perm0vB, perm1vB, perm2vB, perm3vB, permEncv;

    vec_s32_t sum0v, sum1v, sum2v, sum3v;

    sum0v = vec_splat_s32(0);
    sum1v = vec_splat_s32(0);
    sum2v = vec_splat_s32(0);
    sum3v = vec_splat_s32(0);

    permEncv = vec_lvsl(0, fenc);
    perm0vA = vec_lvsl(0, pix0);
    perm1vA = vec_lvsl(0, pix1);
    perm2vA = vec_lvsl(0, pix2);
    perm3vA = vec_lvsl(0, pix3);

    perm0vB = vec_lvsl(0, pix0 + i_stride);
    perm1vB = vec_lvsl(0, pix1 + i_stride);
    perm2vB = vec_lvsl(0, pix2 + i_stride);
    perm3vB = vec_lvsl(0, pix3 + i_stride);

    for( int y = 0; y < 4; y++ )
    {
        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
        pix1 += i_stride;

        temp_lv = vec_ld(0, fenc);
        fencv = vec_perm(temp_lv, temp_hv, permEncv);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
        pix2 += i_stride;

        temp_lv = vec_ld(0, pix3);
        temp_hv = vec_ld(16, pix3);
        pix3v = vec_perm(temp_lv, temp_hv, perm3vA);
        pix3 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
        sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );

        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
        pix1 += i_stride;

        temp_lv = vec_ld(0, fenc);
        fencv = vec_perm(temp_lv, temp_hv, permEncv);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
        pix2 += i_stride;

        temp_lv = vec_ld(0, pix3);
        temp_hv = vec_ld(16, pix3);
        pix3v = vec_perm(temp_lv, temp_hv, perm3vB);
        pix3 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
        sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
    }

    sum0v = vec_sum2s( sum0v, zero_s32v );
    sum1v = vec_sum2s( sum1v, zero_s32v );
    sum2v = vec_sum2s( sum2v, zero_s32v );
    sum3v = vec_sum2s( sum3v, zero_s32v );

    sum0v = vec_splat( sum0v, 1 );
    sum1v = vec_splat( sum1v, 1 );
    sum2v = vec_splat( sum2v, 1 );
    sum3v = vec_splat( sum3v, 1 );

    vec_ste( sum0v, 0, &sum0);
    vec_ste( sum1v, 0, &sum1);
    vec_ste( sum2v, 0, &sum2);
    vec_ste( sum3v, 0, &sum3);

    scores[0] = sum0;
    scores[1] = sum1;
    scores[2] = sum2;
    scores[3] = sum3;
}

static void pixel_sad_x3_8x8_altivec( uint8_t *fenc, uint8_t *pix0,
                                      uint8_t *pix1, uint8_t *pix2,
                                      int i_stride, int scores[3] )
{
    ALIGNED_16( int sum0 );
    ALIGNED_16( int sum1 );
    ALIGNED_16( int sum2 );

    LOAD_ZERO;
    vec_u8_t temp_lv, temp_hv;
    vec_u8_t fencv, pix0v, pix1v, pix2v;
    vec_u8_t perm0vA, perm1vA, perm2vA, perm0vB, perm1vB, perm2vB, permEncv;

    vec_s32_t sum0v, sum1v, sum2v;

    sum0v = vec_splat_s32(0);
    sum1v = vec_splat_s32(0);
    sum2v = vec_splat_s32(0);

    permEncv = vec_lvsl(0, fenc);
    perm0vA = vec_lvsl(0, pix0);
    perm1vA = vec_lvsl(0, pix1);
    perm2vA = vec_lvsl(0, pix2);

    perm0vB = vec_lvsl(0, pix0 + i_stride);
    perm1vB = vec_lvsl(0, pix1 + i_stride);
    perm2vB = vec_lvsl(0, pix2 + i_stride);

    for( int y = 0; y < 4; y++ )
    {
        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
        pix1 += i_stride;

        temp_lv = vec_ld(0, fenc);
        fencv = vec_perm(temp_lv, temp_hv, permEncv);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
        pix2 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );

        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
        pix1 += i_stride;

        temp_lv = vec_ld(0, fenc);
        fencv = vec_perm(temp_lv, temp_hv, permEncv);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
        pix2 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
    }

    sum0v = vec_sum2s( sum0v, zero_s32v );
    sum1v = vec_sum2s( sum1v, zero_s32v );
    sum2v = vec_sum2s( sum2v, zero_s32v );

    sum0v = vec_splat( sum0v, 1 );
    sum1v = vec_splat( sum1v, 1 );
    sum2v = vec_splat( sum2v, 1 );

    vec_ste( sum0v, 0, &sum0);
    vec_ste( sum1v, 0, &sum1);
    vec_ste( sum2v, 0, &sum2);

    scores[0] = sum0;
    scores[1] = sum1;
    scores[2] = sum2;
}

/****************************************************************************
 * SSD routines
 ****************************************************************************/
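/* Scalar reference for what these routines compute (a sketch, not part of
 * the build):
 *
 *     int ssd = 0;
 *     for( int y = 0; y < h; y++ )
 *         for( int x = 0; x < w; x++ )
 *         {
 *             int d = pix1[y*i_stride_pix1+x] - pix2[y*i_stride_pix2+x];
 *             ssd += d * d;
 *         }
 *
 * The vector versions form |a-b| as vec_sub(vec_max,vec_min) on unsigned
 * bytes and let vec_msum square and accumulate 16 differences at a time. */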

static int pixel_ssd_16x16_altivec ( uint8_t *pix1, int i_stride_pix1,
                                     uint8_t *pix2, int i_stride_pix2)
{
    ALIGNED_16( int sum );

    LOAD_ZERO;
    vec_u8_t  pix1vA, pix2vA, pix1vB, pix2vB;
    vec_u32_t sumv;
    vec_u8_t maxA, minA, diffA, maxB, minB, diffB;
    vec_u8_t temp_lv, temp_hv;
    vec_u8_t permA, permB;

    sumv = vec_splat_u32(0);

    permA = vec_lvsl(0, pix2);
    permB = vec_lvsl(0, pix2 + i_stride_pix2);

    temp_lv = vec_ld(0, pix2);
    temp_hv = vec_ld(16, pix2);
    pix2vA = vec_perm(temp_lv, temp_hv, permA);
    pix1vA = vec_ld(0, pix1);

    for( int y = 0; y < 7; y++ )
    {
        pix1 += i_stride_pix1;
        pix2 += i_stride_pix2;

        maxA = vec_max(pix1vA, pix2vA);
        minA = vec_min(pix1vA, pix2vA);

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2vB = vec_perm(temp_lv, temp_hv, permB);
        pix1vB = vec_ld(0, pix1);

        diffA = vec_sub(maxA, minA);
        sumv = vec_msum(diffA, diffA, sumv);

        pix1 += i_stride_pix1;
        pix2 += i_stride_pix2;

        maxB = vec_max(pix1vB, pix2vB);
        minB = vec_min(pix1vB, pix2vB);

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2vA = vec_perm(temp_lv, temp_hv, permA);
        pix1vA = vec_ld(0, pix1);

        diffB = vec_sub(maxB, minB);
        sumv = vec_msum(diffB, diffB, sumv);
    }

    pix1 += i_stride_pix1;
    pix2 += i_stride_pix2;

    temp_lv = vec_ld(0, pix2);
    temp_hv = vec_ld(16, pix2);
    pix2vB = vec_perm(temp_lv, temp_hv, permB);
    pix1vB = vec_ld(0, pix1);

    maxA = vec_max(pix1vA, pix2vA);
    minA = vec_min(pix1vA, pix2vA);

    maxB = vec_max(pix1vB, pix2vB);
    minB = vec_min(pix1vB, pix2vB);

    diffA = vec_sub(maxA, minA);
    sumv = vec_msum(diffA, diffA, sumv);

    diffB = vec_sub(maxB, minB);
    sumv = vec_msum(diffB, diffB, sumv);

    sumv = (vec_u32_t) vec_sums((vec_s32_t) sumv, zero_s32v);
    sumv = vec_splat(sumv, 3);
    vec_ste((vec_s32_t) sumv, 0, &sum);
    return sum;
}

static int pixel_ssd_8x8_altivec ( uint8_t *pix1, int i_stride_pix1,
                                   uint8_t *pix2, int i_stride_pix2)
{
    ALIGNED_16( int sum );

    LOAD_ZERO;
    vec_u8_t  pix1v, pix2v;
    vec_u32_t sumv;
    vec_u8_t maxv, minv, diffv;
    vec_u8_t temp_lv, temp_hv;
    vec_u8_t perm1v, perm2v;

    const vec_u32_t sel = (vec_u32_t)CV(-1,-1,0,0);
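    /* Each 16-byte load covers the 8 block pixels plus 8 neighbours, and
     * vec_msum accumulates all of them; `sel` is used after the loop to keep
     * only the two 32-bit partial sums fed by bytes 0-7 and zero the rest. */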

    sumv = vec_splat_u32(0);

    perm1v = vec_lvsl(0, pix1);
    perm2v = vec_lvsl(0, pix2);

    for( int y = 0; y < 8; y++ )
    {
        temp_hv = vec_ld(0, pix1);
        temp_lv = vec_ld(7, pix1);
        pix1v = vec_perm(temp_hv, temp_lv, perm1v);

        temp_hv = vec_ld(0, pix2);
        temp_lv = vec_ld(7, pix2);
        pix2v = vec_perm(temp_hv, temp_lv, perm2v);

        maxv = vec_max(pix1v, pix2v);
        minv = vec_min(pix1v, pix2v);

        diffv = vec_sub(maxv, minv);
        sumv = vec_msum(diffv, diffv, sumv);

        pix1 += i_stride_pix1;
        pix2 += i_stride_pix2;
    }

    sumv = vec_sel( zero_u32v, sumv, sel );

    sumv = (vec_u32_t) vec_sums((vec_s32_t) sumv, zero_s32v);
    sumv = vec_splat(sumv, 3);
    vec_ste((vec_s32_t) sumv, 0, &sum);

    return sum;
}


/****************************************************************************
 * variance
 ****************************************************************************/
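/* Both helpers return two statistics packed into one uint64_t: the pixel sum
 * in the low 32 bits and the sum of squares in the high 32 bits.  A scalar
 * sketch of the same contract (not part of the build):
 *
 *     uint32_t sum = 0, sqr = 0;
 *     for( each of the N pixels p ) { sum += p; sqr += p*p; }
 *     return sum + ((uint64_t)sqr << 32);
 *
 * from which a caller can recover the variance as sqr - sum*sum/N. */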
static uint64_t x264_pixel_var_16x16_altivec( uint8_t *pix, int i_stride )
{
    ALIGNED_16(uint32_t sum_tab[4]);
    ALIGNED_16(uint32_t sqr_tab[4]);

    LOAD_ZERO;
    vec_u32_t sqr_v = zero_u32v;
    vec_u32_t sum_v = zero_u32v;

    for( int y = 0; y < 16; y++ )
    {
        vec_u8_t pix0_v = vec_ld(0, pix);
        sum_v = vec_sum4s(pix0_v, sum_v);
        sqr_v = vec_msum(pix0_v, pix0_v, sqr_v);

        pix += i_stride;
    }
    sum_v = (vec_u32_t)vec_sums( (vec_s32_t)sum_v, zero_s32v );
    sqr_v = (vec_u32_t)vec_sums( (vec_s32_t)sqr_v, zero_s32v );
    vec_ste(sum_v, 12, sum_tab);
    vec_ste(sqr_v, 12, sqr_tab);

    uint32_t sum = sum_tab[3];
    uint32_t sqr = sqr_tab[3];
    return sum + ((uint64_t)sqr<<32);
}

static uint64_t x264_pixel_var_8x8_altivec( uint8_t *pix, int i_stride )
{
    ALIGNED_16(uint32_t sum_tab[4]);
    ALIGNED_16(uint32_t sqr_tab[4]);

    LOAD_ZERO;
    vec_u32_t sqr_v = zero_u32v;
    vec_u32_t sum_v = zero_u32v;

    static const vec_u8_t perm_tab[] =
    {
        CV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,  /* pix=mod16, i_stride=mod16 */
           0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17),
        CV(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,  /* pix=mod8, i_stride=mod16  */
           0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F),
    };
    vec_u8_t perm = perm_tab[ ((uintptr_t)pix & 8) >> 3 ];

    for( int y = 0; y < 4; y++ )
    {
        vec_u8_t pix0_v = vec_ld(0, pix);
        vec_u8_t pix1_v = vec_ld(i_stride, pix);
        vec_u8_t pix_v = vec_perm(pix0_v, pix1_v, perm);
        sum_v = vec_sum4s(pix_v, sum_v);
        sqr_v = vec_msum(pix_v, pix_v, sqr_v);

        pix += i_stride<<1;
    }
    sum_v = (vec_u32_t)vec_sums( (vec_s32_t)sum_v, zero_s32v );
    sqr_v = (vec_u32_t)vec_sums( (vec_s32_t)sqr_v, zero_s32v );
    vec_ste(sum_v, 12, sum_tab);
    vec_ste(sqr_v, 12, sqr_tab);

    uint32_t sum = sum_tab[3];
    uint32_t sqr = sqr_tab[3];
    return sum + ((uint64_t)sqr<<32);
}


/**********************************************************************
 * SA8D routines: sum of 8x8 Hadamard transformed differences
 **********************************************************************/
/* SA8D_1D unrolled by 8 in Altivec */
#define SA8D_1D_ALTIVEC( sa8d0v, sa8d1v, sa8d2v, sa8d3v,  \
                         sa8d4v, sa8d5v, sa8d6v, sa8d7v ) \
{                                                         \
    /* int    a0  =        SRC(0) + SRC(4) */             \
    vec_s16_t a0v = vec_add(sa8d0v, sa8d4v);              \
    /* int    a4  =        SRC(0) - SRC(4) */             \
    vec_s16_t a4v = vec_sub(sa8d0v, sa8d4v);              \
    /* int    a1  =        SRC(1) + SRC(5) */             \
    vec_s16_t a1v = vec_add(sa8d1v, sa8d5v);              \
    /* int    a5  =        SRC(1) - SRC(5) */             \
    vec_s16_t a5v = vec_sub(sa8d1v, sa8d5v);              \
    /* int    a2  =        SRC(2) + SRC(6) */             \
    vec_s16_t a2v = vec_add(sa8d2v, sa8d6v);              \
    /* int    a6  =        SRC(2) - SRC(6) */             \
    vec_s16_t a6v = vec_sub(sa8d2v, sa8d6v);              \
    /* int    a3  =        SRC(3) + SRC(7) */             \
    vec_s16_t a3v = vec_add(sa8d3v, sa8d7v);              \
    /* int    a7  =        SRC(3) - SRC(7) */             \
    vec_s16_t a7v = vec_sub(sa8d3v, sa8d7v);              \
                                                          \
    /* int    b0  =         a0 + a2  */                   \
    vec_s16_t b0v = vec_add(a0v, a2v);                    \
    /* int    b2  =         a0 - a2; */                   \
    vec_s16_t  b2v = vec_sub(a0v, a2v);                   \
    /* int    b1  =         a1 + a3; */                   \
    vec_s16_t b1v = vec_add(a1v, a3v);                    \
    /* int    b3  =         a1 - a3; */                   \
    vec_s16_t b3v = vec_sub(a1v, a3v);                    \
    /* int    b4  =         a4 + a6; */                   \
    vec_s16_t b4v = vec_add(a4v, a6v);                    \
    /* int    b6  =         a4 - a6; */                   \
    vec_s16_t b6v = vec_sub(a4v, a6v);                    \
    /* int    b5  =         a5 + a7; */                   \
    vec_s16_t b5v = vec_add(a5v, a7v);                    \
    /* int    b7  =         a5 - a7; */                   \
    vec_s16_t b7v = vec_sub(a5v, a7v);                    \
                                                          \
    /* DST(0,        b0 + b1) */                          \
    sa8d0v = vec_add(b0v, b1v);                           \
    /* DST(1,        b0 - b1) */                          \
    sa8d1v = vec_sub(b0v, b1v);                           \
    /* DST(2,        b2 + b3) */                          \
    sa8d2v = vec_add(b2v, b3v);                           \
    /* DST(3,        b2 - b3) */                          \
    sa8d3v = vec_sub(b2v, b3v);                           \
    /* DST(4,        b4 + b5) */                          \
    sa8d4v = vec_add(b4v, b5v);                           \
    /* DST(5,        b4 - b5) */                          \
    sa8d5v = vec_sub(b4v, b5v);                           \
    /* DST(6,        b6 + b7) */                          \
    sa8d6v = vec_add(b6v, b7v);                           \
    /* DST(7,        b6 - b7) */                          \
    sa8d7v = vec_sub(b6v, b7v);                           \
}

static int pixel_sa8d_8x8_core_altivec( uint8_t *pix1, int i_pix1,
                                        uint8_t *pix2, int i_pix2 )
{
    int32_t i_satd=0;

    PREP_DIFF;
    PREP_LOAD_SRC( pix1 );
    PREP_LOAD_SRC( pix2 );

    vec_s16_t diff0v, diff1v, diff2v, diff3v, diff4v, diff5v, diff6v, diff7v;

    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v, pix2 );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v, pix2 );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v, pix2 );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v, pix2 );

    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v, pix2 );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v, pix2 );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v, pix2 );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v, pix2 );

    vec_s16_t sa8d0v, sa8d1v, sa8d2v, sa8d3v, sa8d4v, sa8d5v, sa8d6v, sa8d7v;

    SA8D_1D_ALTIVEC(diff0v, diff1v, diff2v, diff3v,
                    diff4v, diff5v, diff6v, diff7v);

    VEC_TRANSPOSE_8(diff0v, diff1v, diff2v, diff3v,
                    diff4v, diff5v, diff6v, diff7v,
                    sa8d0v, sa8d1v, sa8d2v, sa8d3v,
                    sa8d4v, sa8d5v, sa8d6v, sa8d7v );

    SA8D_1D_ALTIVEC(sa8d0v, sa8d1v, sa8d2v, sa8d3v,
                    sa8d4v, sa8d5v, sa8d6v, sa8d7v );

    /* accumulate the absolute values of all elements of the resulting block */
    vec_s16_t abs0v = VEC_ABS(sa8d0v);
    vec_s16_t abs1v = VEC_ABS(sa8d1v);
    vec_s16_t sum01v = vec_add(abs0v, abs1v);

    vec_s16_t abs2v = VEC_ABS(sa8d2v);
    vec_s16_t abs3v = VEC_ABS(sa8d3v);
    vec_s16_t sum23v = vec_add(abs2v, abs3v);

    vec_s16_t abs4v = VEC_ABS(sa8d4v);
    vec_s16_t abs5v = VEC_ABS(sa8d5v);
    vec_s16_t sum45v = vec_add(abs4v, abs5v);

    vec_s16_t abs6v = VEC_ABS(sa8d6v);
    vec_s16_t abs7v = VEC_ABS(sa8d7v);
    vec_s16_t sum67v = vec_add(abs6v, abs7v);

    vec_s16_t sum0123v = vec_add(sum01v, sum23v);
    vec_s16_t sum4567v = vec_add(sum45v, sum67v);

    vec_s32_t sumblocv;

    sumblocv = vec_sum4s(sum0123v, (vec_s32_t)zerov );
    sumblocv = vec_sum4s(sum4567v, sumblocv );

    sumblocv = vec_sums(sumblocv, (vec_s32_t)zerov );

    sumblocv = vec_splat(sumblocv, 3);

    vec_ste(sumblocv, 0, &i_satd);

    return i_satd;
}
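
/* The core above returns the raw sum of absolute coefficients of an
 * un-normalized 8x8 Hadamard transform of the difference block; the
 * (x+2)>>2 in the wrappers below is the rounded division by 4 that brings
 * the result onto the usual sa8d scale. */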

static int pixel_sa8d_8x8_altivec( uint8_t *pix1, int i_pix1,
                                   uint8_t *pix2, int i_pix2 )
{
    int32_t i_satd;
    i_satd = (pixel_sa8d_8x8_core_altivec( pix1, i_pix1, pix2, i_pix2 )+2)>>2;
    return i_satd;
}

static int pixel_sa8d_16x16_altivec( uint8_t *pix1, int i_pix1,
                                     uint8_t *pix2, int i_pix2 )
{
    int32_t i_satd;

    i_satd = (pixel_sa8d_8x8_core_altivec( &pix1[0],     i_pix1, &pix2[0],     i_pix2 )
            + pixel_sa8d_8x8_core_altivec( &pix1[8],     i_pix1, &pix2[8],     i_pix2 )
            + pixel_sa8d_8x8_core_altivec( &pix1[8*i_pix1],   i_pix1, &pix2[8*i_pix2],   i_pix2 )
            + pixel_sa8d_8x8_core_altivec( &pix1[8*i_pix1+8], i_pix1, &pix2[8*i_pix2+8], i_pix2 ) +2)>>2;
    return i_satd;
}

#define HADAMARD4_ALTIVEC(d0,d1,d2,d3,s0,s1,s2,s3) {\
    vec_s16_t t0 = vec_add(s0, s1);                 \
    vec_s16_t t1 = vec_sub(s0, s1);                 \
    vec_s16_t t2 = vec_add(s2, s3);                 \
    vec_s16_t t3 = vec_sub(s2, s3);                 \
    d0 = vec_add(t0, t2);                           \
    d2 = vec_sub(t0, t2);                           \
    d1 = vec_add(t1, t3);                           \
    d3 = vec_sub(t1, t3);                           \
}

#define VEC_LOAD_HIGH( p, num )                                    \
    vec_u8_t pix8_##num = vec_ld( stride*num, p );                 \
    vec_s16_t pix16_s##num = (vec_s16_t)vec_perm(pix8_##num, zero_u8v, perm); \
    vec_s16_t pix16_d##num;

static uint64_t pixel_hadamard_ac_altivec( uint8_t *pix, int stride, const vec_u8_t perm )
{
    ALIGNED_16( int32_t sum4_tab[4] );
    ALIGNED_16( int32_t sum8_tab[4] );
    LOAD_ZERO;

    VEC_LOAD_HIGH( pix, 0 );
    VEC_LOAD_HIGH( pix, 1 );
    VEC_LOAD_HIGH( pix, 2 );
    VEC_LOAD_HIGH( pix, 3 );
    HADAMARD4_ALTIVEC(pix16_d0,pix16_d1,pix16_d2,pix16_d3,
                      pix16_s0,pix16_s1,pix16_s2,pix16_s3);

    VEC_LOAD_HIGH( pix, 4 );
    VEC_LOAD_HIGH( pix, 5 );
    VEC_LOAD_HIGH( pix, 6 );
    VEC_LOAD_HIGH( pix, 7 );
    HADAMARD4_ALTIVEC(pix16_d4,pix16_d5,pix16_d6,pix16_d7,
                      pix16_s4,pix16_s5,pix16_s6,pix16_s7);

    VEC_TRANSPOSE_8(pix16_d0, pix16_d1, pix16_d2, pix16_d3,
                    pix16_d4, pix16_d5, pix16_d6, pix16_d7,
                    pix16_s0, pix16_s1, pix16_s2, pix16_s3,
                    pix16_s4, pix16_s5, pix16_s6, pix16_s7);

    HADAMARD4_ALTIVEC(pix16_d0,pix16_d1,pix16_d2,pix16_d3,
                      pix16_s0,pix16_s1,pix16_s2,pix16_s3);

    HADAMARD4_ALTIVEC(pix16_d4,pix16_d5,pix16_d6,pix16_d7,
                      pix16_s4,pix16_s5,pix16_s6,pix16_s7);

    vec_u16_t addabs01 = vec_add( VEC_ABSOLUTE(pix16_d0), VEC_ABSOLUTE(pix16_d1) );
    vec_u16_t addabs23 = vec_add( VEC_ABSOLUTE(pix16_d2), VEC_ABSOLUTE(pix16_d3) );
    vec_u16_t addabs45 = vec_add( VEC_ABSOLUTE(pix16_d4), VEC_ABSOLUTE(pix16_d5) );
    vec_u16_t addabs67 = vec_add( VEC_ABSOLUTE(pix16_d6), VEC_ABSOLUTE(pix16_d7) );

    vec_u16_t sum4_v = vec_add(vec_add(addabs01, addabs23), vec_add(addabs45, addabs67));
    vec_ste(vec_sums(vec_sum4s((vec_s16_t)sum4_v, zero_s32v), zero_s32v), 12, sum4_tab);

    vec_s16_t tmpi0 = vec_add(pix16_d0, pix16_d4);
    vec_s16_t tmpi4 = vec_sub(pix16_d0, pix16_d4);
    vec_s16_t tmpi1 = vec_add(pix16_d1, pix16_d5);
    vec_s16_t tmpi5 = vec_sub(pix16_d1, pix16_d5);
    vec_s16_t tmpi2 = vec_add(pix16_d2, pix16_d6);
    vec_s16_t tmpi6 = vec_sub(pix16_d2, pix16_d6);
    vec_s16_t tmpi3 = vec_add(pix16_d3, pix16_d7);
    vec_s16_t tmpi7 = vec_sub(pix16_d3, pix16_d7);

    int sum4 = sum4_tab[3];

    VEC_TRANSPOSE_8(tmpi0, tmpi1, tmpi2, tmpi3,
                    tmpi4, tmpi5, tmpi6, tmpi7,
                    pix16_d0, pix16_d1, pix16_d2, pix16_d3,
                    pix16_d4, pix16_d5, pix16_d6, pix16_d7);

    vec_u16_t addsum04 = vec_add( VEC_ABSOLUTE( vec_add(pix16_d0, pix16_d4) ),
                                  VEC_ABSOLUTE( vec_sub(pix16_d0, pix16_d4) ) );
    vec_u16_t addsum15 = vec_add( VEC_ABSOLUTE( vec_add(pix16_d1, pix16_d5) ),
                                  VEC_ABSOLUTE( vec_sub(pix16_d1, pix16_d5) ) );
    vec_u16_t addsum26 = vec_add( VEC_ABSOLUTE( vec_add(pix16_d2, pix16_d6) ),
                                  VEC_ABSOLUTE( vec_sub(pix16_d2, pix16_d6) ) );
    vec_u16_t addsum37 = vec_add( VEC_ABSOLUTE( vec_add(pix16_d3, pix16_d7) ),
                                  VEC_ABSOLUTE( vec_sub(pix16_d3, pix16_d7) ) );

    vec_u16_t sum8_v = vec_add( vec_add(addsum04, addsum15), vec_add(addsum26, addsum37) );
    vec_ste(vec_sums(vec_sum4s((vec_s16_t)sum8_v, zero_s32v), zero_s32v), 12, sum8_tab);

    int sum8 = sum8_tab[3];

    ALIGNED_16( int16_t tmp0_4_tab[8] );
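    /* hadamard_ac is an AC-only metric: the first element of (d0 + d4)
     * carries the DC contribution, so it is subtracted from both packed
     * sums below. */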
    vec_ste(vec_add(pix16_d0, pix16_d4), 0, tmp0_4_tab);

    sum4 -= tmp0_4_tab[0];
    sum8 -= tmp0_4_tab[0];
    return ((uint64_t)sum8<<32) + sum4;
}


static const vec_u8_t hadamard_permtab[] =
{
    CV(0x10,0x00,0x11,0x01, 0x12,0x02,0x13,0x03,     /* pix = mod16 */
       0x14,0x04,0x15,0x05, 0x16,0x06,0x17,0x07 ),
    CV(0x18,0x08,0x19,0x09, 0x1A,0x0A,0x1B,0x0B,     /* pix = mod8 */
       0x1C,0x0C,0x1D,0x0D, 0x1E,0x0E,0x1F,0x0F )
 };
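
/* pixel_hadamard_ac_altivec() packs its two results as (sum8<<32) | sum4.
 * The wrappers below accumulate up to four such values and then normalize
 * both halves at once: (sum>>34)<<32 divides the 8x8 total by 4 and
 * (uint32_t)sum>>1 divides the 4x4 total by 2, the same scaling the scalar
 * hadamard_ac applies. */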

static uint64_t x264_pixel_hadamard_ac_16x16_altivec( uint8_t *pix, int stride )
{
    int idx =  ((uintptr_t)pix & 8) >> 3;
    vec_u8_t permh = hadamard_permtab[idx];
    vec_u8_t perml = hadamard_permtab[!idx];
    uint64_t sum = pixel_hadamard_ac_altivec( pix, stride, permh );
    sum += pixel_hadamard_ac_altivec( pix+8, stride, perml );
    sum += pixel_hadamard_ac_altivec( pix+8*stride, stride, permh );
    sum += pixel_hadamard_ac_altivec( pix+8*stride+8, stride, perml );
    return ((sum>>34)<<32) + ((uint32_t)sum>>1);
}

static uint64_t x264_pixel_hadamard_ac_16x8_altivec( uint8_t *pix, int stride )
{
    int idx =  ((uintptr_t)pix & 8) >> 3;
    vec_u8_t permh = hadamard_permtab[idx];
    vec_u8_t perml = hadamard_permtab[!idx];
    uint64_t sum = pixel_hadamard_ac_altivec( pix, stride, permh );
    sum += pixel_hadamard_ac_altivec( pix+8, stride, perml );
    return ((sum>>34)<<32) + ((uint32_t)sum>>1);
}

static uint64_t x264_pixel_hadamard_ac_8x16_altivec( uint8_t *pix, int stride )
{
    vec_u8_t perm = hadamard_permtab[ (((uintptr_t)pix & 8) >> 3) ];
    uint64_t sum = pixel_hadamard_ac_altivec( pix, stride, perm );
    sum += pixel_hadamard_ac_altivec( pix+8*stride, stride, perm );
    return ((sum>>34)<<32) + ((uint32_t)sum>>1);
}

static uint64_t x264_pixel_hadamard_ac_8x8_altivec( uint8_t *pix, int stride )
{
    vec_u8_t perm = hadamard_permtab[ (((uintptr_t)pix & 8) >> 3) ];
    uint64_t sum = pixel_hadamard_ac_altivec( pix, stride, perm );
    return ((sum>>34)<<32) + ((uint32_t)sum>>1);
}


/****************************************************************************
 * structural similarity metric
 ****************************************************************************/
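/* Gathers, for two horizontally adjacent 4x4 blocks at once, the four sums
 * SSIM needs per block: s1 = sum(pix1), s2 = sum(pix2), ss = sum(pix1^2) +
 * sum(pix2^2) and s12 = sum(pix1*pix2).  sums[0][*] receives the left
 * block's statistics and sums[1][*] the right block's; the shared SSIM end
 * code combines them into the final luminance/contrast/structure product. */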
static void ssim_4x4x2_core_altivec( const uint8_t *pix1, int stride1,
                                     const uint8_t *pix2, int stride2,
                                     int sums[2][4] )
{
    ALIGNED_16( int temp[4] );

    vec_u8_t pix1v, pix2v;
    vec_u32_t s1v, s2v, ssv, s12v;
    PREP_LOAD;
    PREP_LOAD_SRC (pix1);
    PREP_LOAD_SRC (pix2);
    LOAD_ZERO;

    s1v = s2v = ssv = s12v = zero_u32v;

    for( int y = 0; y < 4; y++ )
    {
        VEC_LOAD( &pix1[y*stride1], pix1v, 16, vec_u8_t, pix1 );
        VEC_LOAD( &pix2[y*stride2], pix2v, 16, vec_u8_t, pix2 );

        s1v = vec_sum4s( pix1v, s1v );
        s2v = vec_sum4s( pix2v, s2v );
        ssv = vec_msum( pix1v, pix1v, ssv );
        ssv = vec_msum( pix2v, pix2v, ssv );
        s12v = vec_msum( pix1v, pix2v, s12v );
    }

    vec_st( (vec_s32_t)s1v, 0, temp );
    sums[0][0] = temp[0];
    sums[1][0] = temp[1];
    vec_st( (vec_s32_t)s2v, 0, temp );
    sums[0][1] = temp[0];
    sums[1][1] = temp[1];
    vec_st( (vec_s32_t)ssv, 0, temp );
    sums[0][2] = temp[0];
    sums[1][2] = temp[1];
    vec_st( (vec_s32_t)s12v, 0, temp );
    sums[0][3] = temp[0];
    sums[1][3] = temp[1];
}

#define SATD_X( size ) \
static void pixel_satd_x3_##size##_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, int i_stride, int scores[3] )\
{\
    scores[0] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix0, i_stride );\
    scores[1] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix1, i_stride );\
    scores[2] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix2, i_stride );\
}\
static void pixel_satd_x4_##size##_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, uint8_t *pix3, int i_stride, int scores[4] )\
{\
    scores[0] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix0, i_stride );\
    scores[1] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix1, i_stride );\
    scores[2] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix2, i_stride );\
    scores[3] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix3, i_stride );\
}
SATD_X( 16x16 )
SATD_X( 16x8 )
SATD_X( 8x16 )
SATD_X( 8x8 )
SATD_X( 8x4 )
SATD_X( 4x8 )
SATD_X( 4x4 )


#define INTRA_MBCMP_8x8( mbcmp )\
void intra_##mbcmp##_x3_8x8_altivec( uint8_t *fenc, uint8_t edge[36], int res[3] )\
{\
    ALIGNED_8( uint8_t pix[8*FDEC_STRIDE] );\
    x264_predict_8x8_v_c( pix, edge );\
    res[0] = pixel_##mbcmp##_8x8_altivec( pix, FDEC_STRIDE, fenc, FENC_STRIDE );\
    x264_predict_8x8_h_c( pix, edge );\
    res[1] = pixel_##mbcmp##_8x8_altivec( pix, FDEC_STRIDE, fenc, FENC_STRIDE );\
    x264_predict_8x8_dc_c( pix, edge );\
    res[2] = pixel_##mbcmp##_8x8_altivec( pix, FDEC_STRIDE, fenc, FENC_STRIDE );\
}

INTRA_MBCMP_8x8(sad)
INTRA_MBCMP_8x8(sa8d)

#define INTRA_MBCMP( mbcmp, size, pred1, pred2, pred3, chroma )\
void intra_##mbcmp##_x3_##size##x##size##chroma##_altivec( uint8_t *fenc, uint8_t *fdec, int res[3] )\
{\
    x264_predict_##size##x##size##chroma##_##pred1##_c( fdec );\
    res[0] = pixel_##mbcmp##_##size##x##size##_altivec( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );\
    x264_predict_##size##x##size##chroma##_##pred2##_c( fdec );\
    res[1] = pixel_##mbcmp##_##size##x##size##_altivec( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );\
    x264_predict_##size##x##size##chroma##_##pred3##_c( fdec );\
    res[2] = pixel_##mbcmp##_##size##x##size##_altivec( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );\
}

INTRA_MBCMP(satd, 4, v, h, dc, )
INTRA_MBCMP(sad, 8, dc, h, v, c )
INTRA_MBCMP(satd, 8, dc, h, v, c )
INTRA_MBCMP(sad, 16, v, h, dc, )
INTRA_MBCMP(satd, 16, v, h, dc, )
#endif // !HIGH_BIT_DEPTH

/****************************************************************************
 * x264_pixel_altivec_init:
 ****************************************************************************/
void x264_pixel_altivec_init( x264_pixel_function_t *pixf )
{
#if !HIGH_BIT_DEPTH
    pixf->sad[PIXEL_16x16]  = pixel_sad_16x16_altivec;
    pixf->sad[PIXEL_8x16]   = pixel_sad_8x16_altivec;
    pixf->sad[PIXEL_16x8]   = pixel_sad_16x8_altivec;
    pixf->sad[PIXEL_8x8]    = pixel_sad_8x8_altivec;

    pixf->sad_x3[PIXEL_16x16] = pixel_sad_x3_16x16_altivec;
    pixf->sad_x3[PIXEL_8x16]  = pixel_sad_x3_8x16_altivec;
    pixf->sad_x3[PIXEL_16x8]  = pixel_sad_x3_16x8_altivec;
    pixf->sad_x3[PIXEL_8x8]   = pixel_sad_x3_8x8_altivec;

    pixf->sad_x4[PIXEL_16x16] = pixel_sad_x4_16x16_altivec;
    pixf->sad_x4[PIXEL_8x16]  = pixel_sad_x4_8x16_altivec;
    pixf->sad_x4[PIXEL_16x8]  = pixel_sad_x4_16x8_altivec;
    pixf->sad_x4[PIXEL_8x8]   = pixel_sad_x4_8x8_altivec;

    pixf->satd[PIXEL_16x16] = pixel_satd_16x16_altivec;
    pixf->satd[PIXEL_8x16]  = pixel_satd_8x16_altivec;
    pixf->satd[PIXEL_16x8]  = pixel_satd_16x8_altivec;
    pixf->satd[PIXEL_8x8]   = pixel_satd_8x8_altivec;
    pixf->satd[PIXEL_8x4]   = pixel_satd_8x4_altivec;
    pixf->satd[PIXEL_4x8]   = pixel_satd_4x8_altivec;
    pixf->satd[PIXEL_4x4]   = pixel_satd_4x4_altivec;

    pixf->satd_x3[PIXEL_16x16] = pixel_satd_x3_16x16_altivec;
    pixf->satd_x3[PIXEL_8x16]  = pixel_satd_x3_8x16_altivec;
    pixf->satd_x3[PIXEL_16x8]  = pixel_satd_x3_16x8_altivec;
    pixf->satd_x3[PIXEL_8x8]   = pixel_satd_x3_8x8_altivec;
    pixf->satd_x3[PIXEL_8x4]   = pixel_satd_x3_8x4_altivec;
    pixf->satd_x3[PIXEL_4x8]   = pixel_satd_x3_4x8_altivec;
    pixf->satd_x3[PIXEL_4x4]   = pixel_satd_x3_4x4_altivec;

    pixf->satd_x4[PIXEL_16x16] = pixel_satd_x4_16x16_altivec;
    pixf->satd_x4[PIXEL_8x16]  = pixel_satd_x4_8x16_altivec;
    pixf->satd_x4[PIXEL_16x8]  = pixel_satd_x4_16x8_altivec;
    pixf->satd_x4[PIXEL_8x8]   = pixel_satd_x4_8x8_altivec;
    pixf->satd_x4[PIXEL_8x4]   = pixel_satd_x4_8x4_altivec;
    pixf->satd_x4[PIXEL_4x8]   = pixel_satd_x4_4x8_altivec;
    pixf->satd_x4[PIXEL_4x4]   = pixel_satd_x4_4x4_altivec;

    pixf->intra_sad_x3_8x8    = intra_sad_x3_8x8_altivec;
    pixf->intra_sad_x3_8x8c   = intra_sad_x3_8x8c_altivec;
    pixf->intra_sad_x3_16x16  = intra_sad_x3_16x16_altivec;

    pixf->intra_satd_x3_4x4   = intra_satd_x3_4x4_altivec;
    pixf->intra_satd_x3_8x8c  = intra_satd_x3_8x8c_altivec;
    pixf->intra_satd_x3_16x16 = intra_satd_x3_16x16_altivec;

    pixf->ssd[PIXEL_16x16] = pixel_ssd_16x16_altivec;
    pixf->ssd[PIXEL_8x8]   = pixel_ssd_8x8_altivec;

    pixf->sa8d[PIXEL_16x16] = pixel_sa8d_16x16_altivec;
    pixf->sa8d[PIXEL_8x8]   = pixel_sa8d_8x8_altivec;

    pixf->intra_sa8d_x3_8x8   = intra_sa8d_x3_8x8_altivec;

    pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_altivec;
    pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8_altivec;

    pixf->hadamard_ac[PIXEL_16x16] = x264_pixel_hadamard_ac_16x16_altivec;
    pixf->hadamard_ac[PIXEL_16x8]  = x264_pixel_hadamard_ac_16x8_altivec;
    pixf->hadamard_ac[PIXEL_8x16]  = x264_pixel_hadamard_ac_8x16_altivec;
    pixf->hadamard_ac[PIXEL_8x8]   = x264_pixel_hadamard_ac_8x8_altivec;

    pixf->ssim_4x4x2_core = ssim_4x4x2_core_altivec;
#endif // !HIGH_BIT_DEPTH
}
x264-snapshot-20120103-2245-stable/common/ppc/mc.h
/*****************************************************************************
 * mc.h: ppc motion compensation
 *****************************************************************************
 * Copyright (C) 2003-2011 x264 project
 *
 * Authors: Eric Petit <eric.petit@lapsus.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#ifndef X264_PPC_MC_H
#define X264_PPC_MC_H

void x264_mc_altivec_init( x264_mc_functions_t *pf );

#endif
x264-snapshot-20120103-2245-stable/common/ppc/mc.c
/*****************************************************************************
 * mc.c: ppc motion compensation
 *****************************************************************************
 * Copyright (C) 2003-2011 x264 project
 *
 * Authors: Eric Petit <eric.petit@lapsus.org>
 *          Guillaume Poirier <gpoirier@mplayerhq.hu>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <stdarg.h>

#include "x264.h"
#include "common/common.h"
#include "common/mc.h"
#include "mc.h"
#include "ppccommon.h"

#if !HIGH_BIT_DEPTH
typedef void (*pf_mc_t)( uint8_t *src, int i_src,
                         uint8_t *dst, int i_dst, int i_height );


static const uint8_t hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
static const uint8_t hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
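/* For the quarter-pel phase qpel_idx = ((mvy&3)<<2) + (mvx&3), these tables
 * select which of the four half-pel planes in src[4] (0 = full-pel,
 * 1 = horizontal, 2 = vertical, 3 = centre) to read; when qpel_idx & 5, the
 * final pixel is the rounded average of the two selected planes. */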


static inline int x264_tapfilter( uint8_t *pix, int i_pix_next )
{
    return pix[-2*i_pix_next] - 5*pix[-1*i_pix_next] + 20*(pix[0] +
           pix[1*i_pix_next]) - 5*pix[ 2*i_pix_next] +
           pix[ 3*i_pix_next];
}
static inline int x264_tapfilter1( uint8_t *pix )
{
    return pix[-2] - 5*pix[-1] + 20*(pix[0] + pix[1]) - 5*pix[ 2] +
           pix[ 3];
}


static inline void x264_pixel_avg2_w4_altivec( uint8_t *dst,  int i_dst,
                                               uint8_t *src1, int i_src1,
                                               uint8_t *src2, int i_height )
{
    for( int y = 0; y < i_height; y++ )
    {
        for( int x = 0; x < 4; x++ )
            dst[x] = ( src1[x] + src2[x] + 1 ) >> 1;
        dst  += i_dst;
        src1 += i_src1;
        src2 += i_src1;
    }
}

static inline void x264_pixel_avg2_w8_altivec( uint8_t *dst,  int i_dst,
                                               uint8_t *src1, int i_src1,
                                               uint8_t *src2, int i_height )
{
    vec_u8_t src1v, src2v;
    PREP_LOAD;
    PREP_STORE8;
    PREP_LOAD_SRC( src1 );
    PREP_LOAD_SRC( src2 );

    for( int y = 0; y < i_height; y++ )
    {
        VEC_LOAD( src1, src1v, 8, vec_u8_t, src1 );
        VEC_LOAD( src2, src2v, 8, vec_u8_t, src2 );
        src1v = vec_avg( src1v, src2v );
        VEC_STORE8( src1v, dst );

        dst  += i_dst;
        src1 += i_src1;
        src2 += i_src1;
    }
}

static inline void x264_pixel_avg2_w16_altivec( uint8_t *dst,  int i_dst,
                                                uint8_t *src1, int i_src1,
                                                uint8_t *src2, int i_height )
{
    vec_u8_t src1v, src2v;
    PREP_LOAD;
    PREP_LOAD_SRC( src1 );
    PREP_LOAD_SRC( src2 );

    for( int y = 0; y < i_height; y++ )
    {
        VEC_LOAD( src1, src1v, 16, vec_u8_t, src1 );
        VEC_LOAD( src2, src2v, 16, vec_u8_t, src2 );
        src1v = vec_avg( src1v, src2v );
        vec_st(src1v, 0, dst);

        dst  += i_dst;
        src1 += i_src1;
        src2 += i_src1;
    }
}

static inline void x264_pixel_avg2_w20_altivec( uint8_t *dst,  int i_dst,
                                                uint8_t *src1, int i_src1,
                                                uint8_t *src2, int i_height )
{
    x264_pixel_avg2_w16_altivec(dst, i_dst, src1, i_src1, src2, i_height);
    x264_pixel_avg2_w4_altivec(dst+16, i_dst, src1+16, i_src1, src2+16, i_height);
}

/* mc_copy: plain c */

#define MC_COPY( name, a )                                \
static void name( uint8_t *dst, int i_dst,                \
                  uint8_t *src, int i_src, int i_height ) \
{                                                         \
    int y;                                                \
    for( y = 0; y < i_height; y++ )                       \
    {                                                     \
        memcpy( dst, src, a );                            \
        src += i_src;                                     \
        dst += i_dst;                                     \
    }                                                     \
}
MC_COPY( x264_mc_copy_w4_altivec,  4  )
MC_COPY( x264_mc_copy_w8_altivec,  8  )

static void x264_mc_copy_w16_altivec( uint8_t *dst, int i_dst,
                                      uint8_t *src, int i_src, int i_height )
{
    vec_u8_t cpyV;
    PREP_LOAD;
    PREP_LOAD_SRC( src );

    for( int y = 0; y < i_height; y++)
    {
        VEC_LOAD( src, cpyV, 16, vec_u8_t, src );
        vec_st(cpyV, 0, dst);

        src += i_src;
        dst += i_dst;
    }
}


static void x264_mc_copy_w16_aligned_altivec( uint8_t *dst, int i_dst,
                                              uint8_t *src, int i_src, int i_height )
{
    for( int y = 0; y < i_height; ++y)
    {
        vec_u8_t cpyV = vec_ld( 0, src);
        vec_st(cpyV, 0, dst);

        src += i_src;
        dst += i_dst;
    }
}


static void mc_luma_altivec( uint8_t *dst,    int i_dst_stride,
                             uint8_t *src[4], int i_src_stride,
                             int mvx, int mvy,
                             int i_width, int i_height, const x264_weight_t *weight )
{
    int qpel_idx = ((mvy&3)<<2) + (mvx&3);
    int offset = (mvy>>2)*i_src_stride + (mvx>>2);
    uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;
    if( qpel_idx & 5 ) /* qpel interpolation needed */
    {
        uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);

        switch( i_width )
        {
            case 4:
                x264_pixel_avg2_w4_altivec( dst, i_dst_stride, src1, i_src_stride, src2, i_height );
                break;
            case 8:
                x264_pixel_avg2_w8_altivec( dst, i_dst_stride, src1, i_src_stride, src2, i_height );
                break;
            case 16:
            default:
                x264_pixel_avg2_w16_altivec( dst, i_dst_stride, src1, i_src_stride, src2, i_height );
        }
        if( weight->weightfn )
            weight->weightfn[i_width>>2]( dst, i_dst_stride, dst, i_dst_stride, weight, i_height );
    }
    else if( weight->weightfn )
        weight->weightfn[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, weight, i_height );
    else
    {
        switch( i_width )
        {
            case 4:
                x264_mc_copy_w4_altivec( dst, i_dst_stride, src1, i_src_stride, i_height );
                break;
            case 8:
                x264_mc_copy_w8_altivec( dst, i_dst_stride, src1, i_src_stride, i_height );
                break;
            case 16:
                x264_mc_copy_w16_altivec( dst, i_dst_stride, src1, i_src_stride, i_height );
                break;
        }
    }
}



static uint8_t *get_ref_altivec( uint8_t *dst,   int *i_dst_stride,
                                 uint8_t *src[4], int i_src_stride,
                                 int mvx, int mvy,
                                 int i_width, int i_height, const x264_weight_t *weight )
{
    int qpel_idx = ((mvy&3)<<2) + (mvx&3);
    int offset = (mvy>>2)*i_src_stride + (mvx>>2);
    uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;
    if( qpel_idx & 5 ) /* qpel interpolation needed */
    {
        uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
        switch( i_width )
        {
            case 4:
                x264_pixel_avg2_w4_altivec( dst, *i_dst_stride, src1, i_src_stride, src2, i_height );
                break;
            case 8:
                x264_pixel_avg2_w8_altivec( dst, *i_dst_stride, src1, i_src_stride, src2, i_height );
                break;
            case 12:
            case 16:
            default:
                x264_pixel_avg2_w16_altivec( dst, *i_dst_stride, src1, i_src_stride, src2, i_height );
                break;
            case 20:
                x264_pixel_avg2_w20_altivec( dst, *i_dst_stride, src1, i_src_stride, src2, i_height );
                break;
        }
        if( weight->weightfn )
            weight->weightfn[i_width>>2]( dst, *i_dst_stride, dst, *i_dst_stride, weight, i_height );
        return dst;
    }
    else if( weight->weightfn )
    {
        weight->weightfn[i_width>>2]( dst, *i_dst_stride, src1, i_src_stride, weight, i_height );
        return dst;
    }
    else
    {
        *i_dst_stride = i_src_stride;
        return src1;
    }
}

static void mc_chroma_2xh( uint8_t *dstu, uint8_t *dstv, int i_dst_stride,
                           uint8_t *src, int i_src_stride,
                           int mvx, int mvy,
                           int i_height )
{
    uint8_t *srcp;
    int d8x = mvx&0x07;
    int d8y = mvy&0x07;

    int cA = (8-d8x)*(8-d8y);
    int cB = d8x    *(8-d8y);
    int cC = (8-d8x)*d8y;
    int cD = d8x    *d8y;
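    /* Bilinear weights of the four neighbouring chroma samples; they always
     * sum to 64, so (cA*A + cB*B + cC*C + cD*D + 32) >> 6 below is a
     * correctly rounded weighted average. */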

    src += (mvy >> 3) * i_src_stride + (mvx >> 3)*2;
    srcp = &src[i_src_stride];

    for( int y = 0; y < i_height; y++ )
    {
        dstu[0] = ( cA*src[0] + cB*src[2] + cC*srcp[0] + cD*srcp[2] + 32 ) >> 6;
        dstv[0] = ( cA*src[1] + cB*src[3] + cC*srcp[1] + cD*srcp[3] + 32 ) >> 6;
        dstu[1] = ( cA*src[2] + cB*src[4] + cC*srcp[2] + cD*srcp[4] + 32 ) >> 6;
        dstv[1] = ( cA*src[3] + cB*src[5] + cC*srcp[3] + cD*srcp[5] + 32 ) >> 6;

        src  += i_src_stride;
        srcp += i_src_stride;
        dstu += i_dst_stride;
        dstv += i_dst_stride;
    }
}

static void mc_chroma_altivec_4xh( uint8_t *dstu, uint8_t *dstv, int i_dst_stride,
                                   uint8_t *src, int i_src_stride,
                                   int mvx, int mvy,
                                   int i_height )
{
    uint8_t *srcp;
    int d8x = mvx & 0x07;
    int d8y = mvy & 0x07;

    ALIGNED_16( uint16_t coeff[4] );
    coeff[0] = (8-d8x)*(8-d8y);
    coeff[1] = d8x    *(8-d8y);
    coeff[2] = (8-d8x)*d8y;
    coeff[3] = d8x    *d8y;

    src += (mvy >> 3) * i_src_stride + (mvx >> 3)*2;
    srcp = &src[i_src_stride];

    LOAD_ZERO;
    PREP_LOAD;
    PREP_LOAD_SRC( src );
    vec_u16_t   coeff0v, coeff1v, coeff2v, coeff3v;
    vec_u8_t    src2v_8, dstuv, dstvv;
    vec_u16_t   src0v_16, src1v_16, src2v_16, src3v_16, dstv16;
    vec_u16_t   shiftv, k32v;

    static const vec_u8_t perm0v = CV(1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13);
    static const vec_u8_t perm1v = CV(3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15);

    coeff0v = vec_ld( 0, coeff );
    coeff3v = vec_splat( coeff0v, 3 );
    coeff2v = vec_splat( coeff0v, 2 );
    coeff1v = vec_splat( coeff0v, 1 );
    coeff0v = vec_splat( coeff0v, 0 );
    k32v    = vec_sl( vec_splat_u16( 1 ), vec_splat_u16( 5 ) );
    shiftv  = vec_splat_u16( 6 );
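    /* k32v = 1<<5 = 32 is the rounding term and shiftv = 6 the final divide
     * by 64: the vector counterpart of the scalar (... + 32) >> 6 used in
     * mc_chroma_2xh. */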

    VEC_LOAD( src, src2v_8, 9, vec_u8_t, src );
    src2v_16 = vec_u8_to_u16( src2v_8 );
    src3v_16 = vec_u8_to_u16( vec_sld( src2v_8, src2v_8, 2 ) );

    for( int y = 0; y < i_height; y += 2 )
    {
        src0v_16 = src2v_16;
        src1v_16 = src3v_16;
        VEC_LOAD( srcp, src2v_8, 9, vec_u8_t, src );
        src2v_16 = vec_u8_to_u16( src2v_8 );
        src3v_16 = vec_u8_to_u16( vec_sld( src2v_8, src2v_8, 2 ) );

        dstv16 = vec_mladd( coeff0v, src0v_16, k32v );
        dstv16 = vec_mladd( coeff1v, src1v_16, dstv16 );
        dstv16 = vec_mladd( coeff2v, src2v_16, dstv16 );
        dstv16 = vec_mladd( coeff3v, src3v_16, dstv16 );

        dstv16 = vec_sr( dstv16, shiftv );

        dstuv = (vec_u8_t)vec_perm( dstv16, dstv16, perm0v );
        dstvv = (vec_u8_t)vec_perm( dstv16, dstv16, perm1v );
        vec_ste( (vec_u32_t)dstuv, 0, (uint32_t*) dstu );
        vec_ste( (vec_u32_t)dstvv, 0, (uint32_t*) dstv );

        srcp += i_src_stride;
        dstu += i_dst_stride;
        dstv += i_dst_stride;

        src0v_16 = src2v_16;
        src1v_16 = src3v_16;
        VEC_LOAD( srcp, src2v_8, 9, vec_u8_t, src );
        src2v_16 = vec_u8_to_u16( src2v_8 );
        src3v_16 = vec_u8_to_u16( vec_sld( src2v_8, src2v_8, 2 ) );

        dstv16 = vec_mladd( coeff0v, src0v_16, k32v );
        dstv16 = vec_mladd( coeff1v, src1v_16, dstv16 );
        dstv16 = vec_mladd( coeff2v, src2v_16, dstv16 );
        dstv16 = vec_mladd( coeff3v, src3v_16, dstv16 );

        dstv16 = vec_sr( dstv16, shiftv );

        dstuv = (vec_u8_t)vec_perm( dstv16, dstv16, perm0v );
        dstvv = (vec_u8_t)vec_perm( dstv16, dstv16, perm1v );
        vec_ste( (vec_u32_t)dstuv, 0, (uint32_t*) dstu );
        vec_ste( (vec_u32_t)dstvv, 0, (uint32_t*) dstv );

        srcp += i_src_stride;
        dstu += i_dst_stride;
        dstv += i_dst_stride;
    }
}

static void mc_chroma_altivec_8xh( uint8_t *dstu, uint8_t *dstv, int i_dst_stride,
                                   uint8_t *src, int i_src_stride,
                                   int mvx, int mvy,
                                   int i_height )
{
    uint8_t *srcp;
    int d8x = mvx & 0x07;
    int d8y = mvy & 0x07;

    ALIGNED_16( uint16_t coeff[4] );
    coeff[0] = (8-d8x)*(8-d8y);
    coeff[1] = d8x    *(8-d8y);
    coeff[2] = (8-d8x)*d8y;
    coeff[3] = d8x    *d8y;

    src += (mvy >> 3) * i_src_stride + (mvx >> 3)*2;
    srcp = &src[i_src_stride];

    LOAD_ZERO;
    PREP_LOAD;
    PREP_LOAD_SRC( src );
    PREP_STORE8;
    vec_u16_t   coeff0v, coeff1v, coeff2v, coeff3v;
    vec_u8_t    src0v_8, src1v_8, src2v_8, src3v_8;
    vec_u8_t    dstuv, dstvv;
    vec_u16_t   src0v_16h, src1v_16h, src2v_16h, src3v_16h, dstv_16h;
    vec_u16_t   src0v_16l, src1v_16l, src2v_16l, src3v_16l, dstv_16l;
    vec_u16_t   shiftv, k32v;

    coeff0v = vec_ld( 0, coeff );
    coeff3v = vec_splat( coeff0v, 3 );
    coeff2v = vec_splat( coeff0v, 2 );
    coeff1v = vec_splat( coeff0v, 1 );
    coeff0v = vec_splat( coeff0v, 0 );
    k32v    = vec_sl( vec_splat_u16( 1 ), vec_splat_u16( 5 ) );
    shiftv  = vec_splat_u16( 6 );

    static const vec_u8_t perm0v = CV(1,5,9,13,17,21,25,29,0,0,0,0,0,0,0,0);
    static const vec_u8_t perm1v = CV(3,7,11,15,19,23,27,31,0,0,0,0,0,0,0,0);

    VEC_LOAD( src, src2v_8, 16, vec_u8_t, src );
    VEC_LOAD( src+16, src3v_8, 2, vec_u8_t, src );
    src3v_8 = vec_sld( src2v_8, src3v_8, 2 );

    for( int y = 0; y < i_height; y += 2 )
    {
        src0v_8 = src2v_8;
        src1v_8 = src3v_8;
        VEC_LOAD( srcp, src2v_8, 16, vec_u8_t, src );
        VEC_LOAD( srcp+16, src3v_8, 2, vec_u8_t, src );

        src3v_8 = vec_sld( src2v_8, src3v_8, 2 );

        src0v_16h = vec_u8_to_u16_h( src0v_8 );
        src0v_16l = vec_u8_to_u16_l( src0v_8 );
        src1v_16h = vec_u8_to_u16_h( src1v_8 );
        src1v_16l = vec_u8_to_u16_l( src1v_8 );
        src2v_16h = vec_u8_to_u16_h( src2v_8 );
        src2v_16l = vec_u8_to_u16_l( src2v_8 );
        src3v_16h = vec_u8_to_u16_h( src3v_8 );
        src3v_16l = vec_u8_to_u16_l( src3v_8 );

        dstv_16h = vec_mladd( coeff0v, src0v_16h, k32v );
        dstv_16l = vec_mladd( coeff0v, src0v_16l, k32v );
        dstv_16h = vec_mladd( coeff1v, src1v_16h, dstv_16h );
        dstv_16l = vec_mladd( coeff1v, src1v_16l, dstv_16l );
        dstv_16h = vec_mladd( coeff2v, src2v_16h, dstv_16h );
        dstv_16l = vec_mladd( coeff2v, src2v_16l, dstv_16l );
        dstv_16h = vec_mladd( coeff3v, src3v_16h, dstv_16h );
        dstv_16l = vec_mladd( coeff3v, src3v_16l, dstv_16l );

        dstv_16h = vec_sr( dstv_16h, shiftv );
        dstv_16l = vec_sr( dstv_16l, shiftv );

        dstuv = (vec_u8_t)vec_perm( dstv_16h, dstv_16l, perm0v );
        dstvv = (vec_u8_t)vec_perm( dstv_16h, dstv_16l, perm1v );

        VEC_STORE8( dstuv, dstu );
        VEC_STORE8( dstvv, dstv );

        srcp += i_src_stride;
        dstu += i_dst_stride;
        dstv += i_dst_stride;

        src0v_8 = src2v_8;
        src1v_8 = src3v_8;
        VEC_LOAD( srcp, src2v_8, 16, vec_u8_t, src );
        VEC_LOAD( srcp+16, src3v_8, 2, vec_u8_t, src );

        src3v_8 = vec_sld( src2v_8, src3v_8, 2 );

        src0v_16h = vec_u8_to_u16_h( src0v_8 );
        src0v_16l = vec_u8_to_u16_l( src0v_8 );
        src1v_16h = vec_u8_to_u16_h( src1v_8 );
        src1v_16l = vec_u8_to_u16_l( src1v_8 );
        src2v_16h = vec_u8_to_u16_h( src2v_8 );
        src2v_16l = vec_u8_to_u16_l( src2v_8 );
        src3v_16h = vec_u8_to_u16_h( src3v_8 );
        src3v_16l = vec_u8_to_u16_l( src3v_8 );

        dstv_16h = vec_mladd( coeff0v, src0v_16h, k32v );
        dstv_16l = vec_mladd( coeff0v, src0v_16l, k32v );
        dstv_16h = vec_mladd( coeff1v, src1v_16h, dstv_16h );
        dstv_16l = vec_mladd( coeff1v, src1v_16l, dstv_16l );
        dstv_16h = vec_mladd( coeff2v, src2v_16h, dstv_16h );
        dstv_16l = vec_mladd( coeff2v, src2v_16l, dstv_16l );
        dstv_16h = vec_mladd( coeff3v, src3v_16h, dstv_16h );
        dstv_16l = vec_mladd( coeff3v, src3v_16l, dstv_16l );

        dstv_16h = vec_sr( dstv_16h, shiftv );
        dstv_16l = vec_sr( dstv_16l, shiftv );

        dstuv = (vec_u8_t)vec_perm( dstv_16h, dstv_16l, perm0v );
        dstvv = (vec_u8_t)vec_perm( dstv_16h, dstv_16l, perm1v );

        VEC_STORE8( dstuv, dstu );
        VEC_STORE8( dstvv, dstv );

        srcp += i_src_stride;
        dstu += i_dst_stride;
        dstv += i_dst_stride;
    }
}

static void mc_chroma_altivec( uint8_t *dstu, uint8_t *dstv, int i_dst_stride,
                               uint8_t *src, int i_src_stride,
                               int mvx, int mvy,
                               int i_width, int i_height )
{
    if( i_width == 8 )
        mc_chroma_altivec_8xh( dstu, dstv, i_dst_stride, src, i_src_stride,
                               mvx, mvy, i_height );
    else if( i_width == 4 )
        mc_chroma_altivec_4xh( dstu, dstv, i_dst_stride, src, i_src_stride,
                               mvx, mvy, i_height );
    else
        mc_chroma_2xh( dstu, dstv, i_dst_stride, src, i_src_stride,
                       mvx, mvy, i_height );
}

#define HPEL_FILTER_1( t1v, t2v, t3v, t4v, t5v, t6v ) \
{                                                     \
    t1v = vec_add( t1v, t6v );                        \
    t2v = vec_add( t2v, t5v );                        \
    t3v = vec_add( t3v, t4v );                        \
                                                      \
    t1v = vec_sub( t1v, t2v );   /* (a-b) */          \
    t2v = vec_sub( t2v, t3v );   /* (b-c) */          \
    t2v = vec_sl(  t2v, twov );  /* (b-c)*4 */        \
    t1v = vec_sub( t1v, t2v );   /* a-5*b+4*c */      \
    t3v = vec_sl(  t3v, fourv ); /* 16*c */           \
    t1v = vec_add( t1v, t3v );   /* a-5*b+20*c */     \
}

#define HPEL_FILTER_2( t1v, t2v, t3v, t4v, t5v, t6v ) \
{                                                     \
    t1v = vec_add( t1v, t6v );                        \
    t2v = vec_add( t2v, t5v );                        \
    t3v = vec_add( t3v, t4v );                        \
                                                      \
    t1v = vec_sub( t1v, t2v );  /* (a-b) */           \
    t1v = vec_sra( t1v, twov ); /* (a-b)/4 */         \
    t1v = vec_sub( t1v, t2v );  /* (a-b)/4-b */       \
    t1v = vec_add( t1v, t3v );  /* (a-b)/4-b+c */     \
    t1v = vec_sra( t1v, twov ); /* ((a-b)/4-b+c)/4 */ \
    t1v = vec_add( t1v, t3v );  /* ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 */ \
}

#define HPEL_FILTER_HORIZONTAL()                             \
{                                                            \
    VEC_LOAD_G( &src[x- 2+i_stride*y], src1v, 16, vec_u8_t); \
    VEC_LOAD_G( &src[x+14+i_stride*y], src6v, 16, vec_u8_t); \
                                                             \
    src2v = vec_sld( src1v, src6v,  1 );                     \
    src3v = vec_sld( src1v, src6v,  2 );                     \
    src4v = vec_sld( src1v, src6v,  3 );                     \
    src5v = vec_sld( src1v, src6v,  4 );                     \
    src6v = vec_sld( src1v, src6v,  5 );                     \
                                                             \
    temp1v = vec_u8_to_s16_h( src1v );                       \
    temp2v = vec_u8_to_s16_h( src2v );                       \
    temp3v = vec_u8_to_s16_h( src3v );                       \
    temp4v = vec_u8_to_s16_h( src4v );                       \
    temp5v = vec_u8_to_s16_h( src5v );                       \
    temp6v = vec_u8_to_s16_h( src6v );                       \
                                                             \
    HPEL_FILTER_1( temp1v, temp2v, temp3v,                   \
                   temp4v, temp5v, temp6v );                 \
                                                             \
    dest1v = vec_add( temp1v, sixteenv );                    \
    dest1v = vec_sra( dest1v, fivev );                       \
                                                             \
    temp1v = vec_u8_to_s16_l( src1v );                       \
    temp2v = vec_u8_to_s16_l( src2v );                       \
    temp3v = vec_u8_to_s16_l( src3v );                       \
    temp4v = vec_u8_to_s16_l( src4v );                       \
    temp5v = vec_u8_to_s16_l( src5v );                       \
    temp6v = vec_u8_to_s16_l( src6v );                       \
                                                             \
    HPEL_FILTER_1( temp1v, temp2v, temp3v,                   \
                   temp4v, temp5v, temp6v );                 \
                                                             \
    dest2v = vec_add( temp1v, sixteenv );                    \
    dest2v = vec_sra( dest2v, fivev );                       \
                                                             \
    destv = vec_packsu( dest1v, dest2v );                    \
                                                             \
    VEC_STORE16( destv, &dsth[x+i_stride*y], dsth );         \
}

#define HPEL_FILTER_VERTICAL()                                    \
{                                                                 \
    VEC_LOAD( &src[x+i_stride*(y-2)], src1v, 16, vec_u8_t, src ); \
    VEC_LOAD( &src[x+i_stride*(y-1)], src2v, 16, vec_u8_t, src ); \
    VEC_LOAD( &src[x+i_stride*(y-0)], src3v, 16, vec_u8_t, src ); \
    VEC_LOAD( &src[x+i_stride*(y+1)], src4v, 16, vec_u8_t, src ); \
    VEC_LOAD( &src[x+i_stride*(y+2)], src5v, 16, vec_u8_t, src ); \
    VEC_LOAD( &src[x+i_stride*(y+3)], src6v, 16, vec_u8_t, src ); \
                                                                  \
    temp1v = vec_u8_to_s16_h( src1v );                            \
    temp2v = vec_u8_to_s16_h( src2v );                            \
    temp3v = vec_u8_to_s16_h( src3v );                            \
    temp4v = vec_u8_to_s16_h( src4v );                            \
    temp5v = vec_u8_to_s16_h( src5v );                            \
    temp6v = vec_u8_to_s16_h( src6v );                            \
                                                                  \
    HPEL_FILTER_1( temp1v, temp2v, temp3v,                        \
                   temp4v, temp5v, temp6v );                      \
                                                                  \
    dest1v = vec_add( temp1v, sixteenv );                         \
    dest1v = vec_sra( dest1v, fivev );                            \
                                                                  \
    temp4v = vec_u8_to_s16_l( src1v );                            \
    temp5v = vec_u8_to_s16_l( src2v );                            \
    temp6v = vec_u8_to_s16_l( src3v );                            \
    temp7v = vec_u8_to_s16_l( src4v );                            \
    temp8v = vec_u8_to_s16_l( src5v );                            \
    temp9v = vec_u8_to_s16_l( src6v );                            \
                                                                  \
    HPEL_FILTER_1( temp4v, temp5v, temp6v,                        \
                   temp7v, temp8v, temp9v );                      \
                                                                  \
    dest2v = vec_add( temp4v, sixteenv );                         \
    dest2v = vec_sra( dest2v, fivev );                            \
                                                                  \
    destv = vec_packsu( dest1v, dest2v );                         \
                                                                  \
    VEC_STORE16( destv, &dstv[x+i_stride*y], dsth );              \
}

#define HPEL_FILTER_CENTRAL()                           \
{                                                       \
    temp1v = vec_sld( tempav, tempbv, 12 );             \
    temp2v = vec_sld( tempav, tempbv, 14 );             \
    temp3v = tempbv;                                    \
    temp4v = vec_sld( tempbv, tempcv,  2 );             \
    temp5v = vec_sld( tempbv, tempcv,  4 );             \
    temp6v = vec_sld( tempbv, tempcv,  6 );             \
                                                        \
    HPEL_FILTER_2( temp1v, temp2v, temp3v,              \
                   temp4v, temp5v, temp6v );            \
                                                        \
    dest1v = vec_add( temp1v, thirtytwov );             \
    dest1v = vec_sra( dest1v, sixv );                   \
                                                        \
    temp1v = vec_sld( tempbv, tempcv, 12 );             \
    temp2v = vec_sld( tempbv, tempcv, 14 );             \
    temp3v = tempcv;                                    \
    temp4v = vec_sld( tempcv, tempdv,  2 );             \
    temp5v = vec_sld( tempcv, tempdv,  4 );             \
    temp6v = vec_sld( tempcv, tempdv,  6 );             \
                                                        \
    HPEL_FILTER_2( temp1v, temp2v, temp3v,              \
                   temp4v, temp5v, temp6v );            \
                                                        \
    dest2v = vec_add( temp1v, thirtytwov );             \
    dest2v = vec_sra( dest2v, sixv );                   \
                                                        \
    destv = vec_packsu( dest1v, dest2v );               \
                                                        \
    VEC_STORE16( destv, &dstc[x-16+i_stride*y], dsth ); \
}
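
/* A single pass over each row produces all three half-pel planes: dsth from
 * the 6-tap (1,-5,20,20,-5,1) filter applied horizontally, dstv from the
 * same filter applied vertically, and dstc from re-filtering the 16-bit
 * vertical intermediates horizontally.  HPEL_FILTER_2 splits its scaling
 * across two shift steps (see its comments) so the intermediates stay
 * within 16-bit lanes. */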

void x264_hpel_filter_altivec( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
                               int i_stride, int i_width, int i_height, int16_t *buf )
{
    vec_u8_t destv;
    vec_u8_t src1v, src2v, src3v, src4v, src5v, src6v;
    vec_s16_t dest1v, dest2v;
    vec_s16_t temp1v, temp2v, temp3v, temp4v, temp5v, temp6v, temp7v, temp8v, temp9v;
    vec_s16_t tempav, tempbv, tempcv, tempdv, tempev;

    PREP_LOAD;
    PREP_LOAD_SRC( src);
    PREP_STORE16;
    PREP_STORE16_DST( dsth );
    LOAD_ZERO;

    vec_u16_t twov, fourv, fivev, sixv;
    vec_s16_t sixteenv, thirtytwov;
    vec_u16_u temp_u;

    temp_u.s[0]=2;
    twov = vec_splat( temp_u.v, 0 );
    temp_u.s[0]=4;
    fourv = vec_splat( temp_u.v, 0 );
    temp_u.s[0]=5;
    fivev = vec_splat( temp_u.v, 0 );
    temp_u.s[0]=6;
    sixv = vec_splat( temp_u.v, 0 );
    temp_u.s[0]=16;
    sixteenv = (vec_s16_t)vec_splat( temp_u.v, 0 );
    temp_u.s[0]=32;
    thirtytwov = (vec_s16_t)vec_splat( temp_u.v, 0 );

    for( int y = 0; y < i_height; y++ )
    {
        int x = 0;

        /* horizontal_filter */
        HPEL_FILTER_HORIZONTAL();

        /* vertical_filter */
        HPEL_FILTER_VERTICAL();

        /* central_filter */
        tempav = tempcv;
        tempbv = tempdv;
        tempcv = vec_splat( temp1v, 0 ); /* first iteration only */
        tempdv = temp1v;
        tempev = temp4v;

        for( x = 16; x < i_width; x+=16 )
        {
            /* horizontal_filter */
            HPEL_FILTER_HORIZONTAL();

            /* vertical_filter */
            HPEL_FILTER_VERTICAL();

            /* central_filter */
            tempav = tempcv;
            tempbv = tempdv;
            tempcv = tempev;
            tempdv = temp1v;
            tempev = temp4v;

            HPEL_FILTER_CENTRAL();
        }

        /* Partial vertical filter */
        VEC_LOAD_PARTIAL( &src[x+i_stride*(y-2)], src1v, 16, vec_u8_t, src );
        VEC_LOAD_PARTIAL( &src[x+i_stride*(y-1)], src2v, 16, vec_u8_t, src );
        VEC_LOAD_PARTIAL( &src[x+i_stride*(y-0)], src3v, 16, vec_u8_t, src );
        VEC_LOAD_PARTIAL( &src[x+i_stride*(y+1)], src4v, 16, vec_u8_t, src );
        VEC_LOAD_PARTIAL( &src[x+i_stride*(y+2)], src5v, 16, vec_u8_t, src );
        VEC_LOAD_PARTIAL( &src[x+i_stride*(y+3)], src6v, 16, vec_u8_t, src );

        temp1v = vec_u8_to_s16_h( src1v );
        temp2v = vec_u8_to_s16_h( src2v );
        temp3v = vec_u8_to_s16_h( src3v );
        temp4v = vec_u8_to_s16_h( src4v );
        temp5v = vec_u8_to_s16_h( src5v );
        temp6v = vec_u8_to_s16_h( src6v );

        HPEL_FILTER_1( temp1v, temp2v, temp3v, temp4v, temp5v, temp6v );

        /* central_filter */
        tempav = tempcv;
        tempbv = tempdv;
        tempcv = tempev;
        tempdv = temp1v;
        /* tempev is not used */

        HPEL_FILTER_CENTRAL();
    }
}

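/* Lowres pyramid construction: from each 2x2 source neighbourhood this
 * produces one pixel in each of four half-resolution planes (dst0, dsth,
 * dstv, dstc), offset from dst0 by half a lowres pixel horizontally,
 * vertically, or both.  Each output is a 2x2 average built from two
 * rounding vec_avg passes, i.e. per pixel
 *     (((a+b+1)>>1) + ((c+d+1)>>1) + 1) >> 1
 * The inverse_bridge_shuffle permute keeps the even bytes of the averaged
 * rows (dst0/dstv); vec_pack over the same data keeps the odd bytes
 * (dsth/dstc). */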
static void frame_init_lowres_core_altivec( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
                                           int src_stride, int dst_stride, int width, int height )
{
    int w = width >> 4;
    int end = (width & 15);
    vec_u8_t src0v, src1v, src2v;
    vec_u8_t lv, hv, src1p1v;
    vec_u8_t avg0v, avg1v, avghv, avghp1v, avgleftv, avgrightv;
    static const vec_u8_t inverse_bridge_shuffle = CV(0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E, 0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, 0x1C, 0x1E );

    for( int y = 0; y < height; y++ )
    {
        int x;
        uint8_t *src1 = src0+src_stride;
        uint8_t *src2 = src1+src_stride;

        src0v = vec_ld(0, src0);
        src1v = vec_ld(0, src1);
        src2v = vec_ld(0, src2);

        avg0v = vec_avg(src0v, src1v);
        avg1v = vec_avg(src1v, src2v);

        for( x = 0; x < w; x++ )
        {
            lv = vec_ld(16*(x*2+1), src0);
            src1v = vec_ld(16*(x*2+1), src1);
            avghv = vec_avg(lv, src1v);

            lv = vec_ld(16*(x*2+2), src0);
            src1p1v = vec_ld(16*(x*2+2), src1);
            avghp1v = vec_avg(lv, src1p1v);

            avgleftv = vec_avg(vec_sld(avg0v, avghv, 1), avg0v);
            avgrightv = vec_avg(vec_sld(avghv, avghp1v, 1), avghv);

            vec_st(vec_perm(avgleftv, avgrightv, inverse_bridge_shuffle), 16*x, dst0);
            vec_st((vec_u8_t)vec_pack((vec_u16_t)avgleftv,(vec_u16_t)avgrightv), 16*x, dsth);

            avg0v = avghp1v;

            hv = vec_ld(16*(x*2+1), src2);
            avghv = vec_avg(src1v, hv);

            hv = vec_ld(16*(x*2+2), src2);
            avghp1v = vec_avg(src1p1v, hv);

            avgleftv = vec_avg(vec_sld(avg1v, avghv, 1), avg1v);
            avgrightv = vec_avg(vec_sld(avghv, avghp1v, 1), avghv);

            vec_st(vec_perm(avgleftv, avgrightv, inverse_bridge_shuffle), 16*x, dstv);
            vec_st((vec_u8_t)vec_pack((vec_u16_t)avgleftv,(vec_u16_t)avgrightv), 16*x, dstc);

            avg1v = avghp1v;

        }
        if( end )
        {
            lv = vec_ld(16*(x*2+1), src0);
            src1v = vec_ld(16*(x*2+1), src1);
            avghv = vec_avg(lv, src1v);

            lv = vec_ld(16*(x*2+1), src2);
            avghp1v = vec_avg(src1v, lv);

            avgleftv = vec_avg(vec_sld(avg0v, avghv, 1), avg0v);
            avgrightv = vec_avg(vec_sld(avg1v, avghp1v, 1), avg1v);

            lv = vec_perm(avgleftv, avgrightv, inverse_bridge_shuffle);
            hv = (vec_u8_t)vec_pack((vec_u16_t)avgleftv,(vec_u16_t)avgrightv);

            vec_ste((vec_u32_t)lv,16*x,(uint32_t*)dst0);
            vec_ste((vec_u32_t)lv,16*x+4,(uint32_t*)dst0);
            vec_ste((vec_u32_t)hv,16*x,(uint32_t*)dsth);
            vec_ste((vec_u32_t)hv,16*x+4,(uint32_t*)dsth);

            lv = vec_sld(lv, lv, 8);
            hv = vec_sld(hv, hv, 8);

            vec_ste((vec_u32_t)lv,16*x,(uint32_t*)dstv);
            vec_ste((vec_u32_t)lv,16*x+4,(uint32_t*)dstv);
            vec_ste((vec_u32_t)hv,16*x,(uint32_t*)dstc);
            vec_ste((vec_u32_t)hv,16*x+4,(uint32_t*)dstc);
        }

        src0 += src_stride*2;
        dst0 += dst_stride;
        dsth += dst_stride;
        dstv += dst_stride;
        dstc += dst_stride;
    }
}

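/* Explicit weighted prediction (8-bit).  Per pixel this computes
 *
 *     dst[x] = clip( ((src[x] * i_scale + (1 << (i_denom-1))) >> i_denom) + i_offset )  if i_denom >= 1
 *     dst[x] = clip(   src[x] * i_scale + i_offset )                                    otherwise
 *
 * vec_mladd performs the 16-bit multiply-accumulate and vec_packsu supplies
 * the saturation to [0,255]; the variants below differ only in block width
 * and in how the result is written back. */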
static void mc_weight_w2_altivec( uint8_t *dst, int i_dst, uint8_t *src, int i_src,
                                  const x264_weight_t *weight, int i_height )
{
    LOAD_ZERO;
    PREP_LOAD;
    PREP_LOAD_SRC( src );
    vec_u8_t srcv;
    vec_s16_t weightv;
    vec_s16_t scalev, offsetv, denomv, roundv;
    vec_s16_u loadv;

    int denom = weight->i_denom;

    loadv.s[0] = weight->i_scale;
    scalev = vec_splat( loadv.v, 0 );

    loadv.s[0] = weight->i_offset;
    offsetv = vec_splat( loadv.v, 0 );

    if( denom >= 1 )
    {
        loadv.s[0] = denom;
        denomv = vec_splat( loadv.v, 0 );

        loadv.s[0] = 1<<(denom - 1);
        roundv = vec_splat( loadv.v, 0 );

        for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
        {
            VEC_LOAD( src, srcv, 2, vec_u8_t, src );
            weightv = vec_u8_to_s16( srcv );

            weightv = vec_mladd( weightv, scalev, roundv );
            weightv = vec_sra( weightv, (vec_u16_t)denomv );
            weightv = vec_add( weightv, offsetv );

            srcv = vec_packsu( weightv, zero_s16v );
            vec_ste( vec_splat( (vec_u16_t)srcv, 0 ), 0, (uint16_t*)dst );
        }
    }
    else
    {
        for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
        {
            VEC_LOAD( src, srcv, 2, vec_u8_t, src );
            weightv = vec_u8_to_s16( srcv );

            weightv = vec_mladd( weightv, scalev, offsetv );

            srcv = vec_packsu( weightv, zero_s16v );
            vec_ste( vec_splat( (vec_u16_t)srcv, 0 ), 0, (uint16_t*)dst );
        }
    }
}
static void mc_weight_w4_altivec( uint8_t *dst, int i_dst, uint8_t *src, int i_src,
                                  const x264_weight_t *weight, int i_height )
{
    LOAD_ZERO;
    PREP_LOAD;
    PREP_LOAD_SRC( src );
    vec_u8_t srcv;
    vec_s16_t weightv;
    vec_s16_t scalev, offsetv, denomv, roundv;
    vec_s16_u loadv;

    int denom = weight->i_denom;

    loadv.s[0] = weight->i_scale;
    scalev = vec_splat( loadv.v, 0 );

    loadv.s[0] = weight->i_offset;
    offsetv = vec_splat( loadv.v, 0 );

    if( denom >= 1 )
    {
        loadv.s[0] = denom;
        denomv = vec_splat( loadv.v, 0 );

        loadv.s[0] = 1<<(denom - 1);
        roundv = vec_splat( loadv.v, 0 );

        for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
        {
            VEC_LOAD( src, srcv, 4, vec_u8_t, src );
            weightv = vec_u8_to_s16( srcv );

            weightv = vec_mladd( weightv, scalev, roundv );
            weightv = vec_sra( weightv, (vec_u16_t)denomv );
            weightv = vec_add( weightv, offsetv );

            srcv = vec_packsu( weightv, zero_s16v );
            vec_ste( vec_splat( (vec_u32_t)srcv, 0 ), 0, (uint32_t*)dst );
        }
    }
    else
    {
        for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
        {
            VEC_LOAD( src, srcv, 4, vec_u8_t, src );
            weightv = vec_u8_to_s16( srcv );

            weightv = vec_mladd( weightv, scalev, offsetv );

            srcv = vec_packsu( weightv, zero_s16v );
            vec_ste( vec_splat( (vec_u32_t)srcv, 0 ), 0, (uint32_t*)dst );
        }
    }
}
static void mc_weight_w8_altivec( uint8_t *dst, int i_dst, uint8_t *src, int i_src,
                                  const x264_weight_t *weight, int i_height )
{
    LOAD_ZERO;
    PREP_LOAD;
    PREP_LOAD_SRC( src );
    PREP_STORE8;
    vec_u8_t srcv;
    vec_s16_t weightv;
    vec_s16_t scalev, offsetv, denomv, roundv;
    vec_s16_u loadv;

    int denom = weight->i_denom;

    loadv.s[0] = weight->i_scale;
    scalev = vec_splat( loadv.v, 0 );

    loadv.s[0] = weight->i_offset;
    offsetv = vec_splat( loadv.v, 0 );

    if( denom >= 1 )
    {
        loadv.s[0] = denom;
        denomv = vec_splat( loadv.v, 0 );

        loadv.s[0] = 1<<(denom - 1);
        roundv = vec_splat( loadv.v, 0 );

        for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
        {
            VEC_LOAD( src, srcv, 8, vec_u8_t, src );
            weightv = vec_u8_to_s16( srcv );

            weightv = vec_mladd( weightv, scalev, roundv );
            weightv = vec_sra( weightv, (vec_u16_t)denomv );
            weightv = vec_add( weightv, offsetv );

            srcv = vec_packsu( weightv, zero_s16v );
            VEC_STORE8( srcv, dst );
        }
    }
    else
    {
        for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
        {
            VEC_LOAD( src, srcv, 8, vec_u8_t, src );
            weightv = vec_u8_to_s16( srcv );

            weightv = vec_mladd( weightv, scalev, offsetv );

            srcv = vec_packsu( weightv, zero_s16v );
            VEC_STORE8( srcv, dst );
        }
    }
}
static void mc_weight_w16_altivec( uint8_t *dst, int i_dst, uint8_t *src, int i_src,
                                   const x264_weight_t *weight, int i_height )
{
    LOAD_ZERO;
    PREP_LOAD;
    PREP_LOAD_SRC( src );
    vec_u8_t srcv;
    vec_s16_t weight_lv, weight_hv;
    vec_s16_t scalev, offsetv, denomv, roundv;
    vec_s16_u loadv;

    int denom = weight->i_denom;

    loadv.s[0] = weight->i_scale;
    scalev = vec_splat( loadv.v, 0 );

    loadv.s[0] = weight->i_offset;
    offsetv = vec_splat( loadv.v, 0 );

    if( denom >= 1 )
    {
        loadv.s[0] = denom;
        denomv = vec_splat( loadv.v, 0 );

        loadv.s[0] = 1<<(denom - 1);
        roundv = vec_splat( loadv.v, 0 );

        for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
        {
            VEC_LOAD( src, srcv, 16, vec_u8_t, src );
            weight_hv = vec_u8_to_s16_h( srcv );
            weight_lv = vec_u8_to_s16_l( srcv );

            weight_hv = vec_mladd( weight_hv, scalev, roundv );
            weight_lv = vec_mladd( weight_lv, scalev, roundv );
            weight_hv = vec_sra( weight_hv, (vec_u16_t)denomv );
            weight_lv = vec_sra( weight_lv, (vec_u16_t)denomv );
            weight_hv = vec_add( weight_hv, offsetv );
            weight_lv = vec_add( weight_lv, offsetv );

            srcv = vec_packsu( weight_hv, weight_lv );
            vec_st( srcv, 0, dst );
        }
    }
    else
    {
        for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
        {
            VEC_LOAD( src, srcv, 16, vec_u8_t, src );
            weight_hv = vec_u8_to_s16_h( srcv );
            weight_lv = vec_u8_to_s16_l( srcv );

            weight_hv = vec_mladd( weight_hv, scalev, offsetv );
            weight_lv = vec_mladd( weight_lv, scalev, offsetv );

            srcv = vec_packsu( weight_hv, weight_lv );
            vec_st( srcv, 0, dst );
        }
    }
}
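/* 20-byte rows don't fit in one vector: the first 16 bytes go out with an
 * aligned vec_st and the 4-byte tail with vec_ste.  The vec_perm against
 * _src_ (the permute vector set up by PREP_LOAD_SRC) realigns the unaligned
 * source data. */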
static void mc_weight_w20_altivec( uint8_t *dst, int i_dst, uint8_t *src, int i_src,
                                   const x264_weight_t *weight, int i_height )
{
    LOAD_ZERO;
    PREP_LOAD_SRC( src );
    vec_u8_t src_1v, src_2v, src_3v;
    vec_s16_t weight_lv, weight_hv, weight_3v;
    vec_s16_t scalev, offsetv, denomv, roundv;
    vec_s16_u loadv;

    int denom = weight->i_denom;

    loadv.s[0] = weight->i_scale;
    scalev = vec_splat( loadv.v, 0 );

    loadv.s[0] = weight->i_offset;
    offsetv = vec_splat( loadv.v, 0 );

    if( denom >= 1 )
    {
        loadv.s[0] = denom;
        denomv = vec_splat( loadv.v, 0 );

        loadv.s[0] = 1<<(denom - 1);
        roundv = vec_splat( loadv.v, 0 );

        for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
        {
            src_1v = vec_ld( 0,  src );
            src_2v = vec_ld( 16, src );
            src_3v = vec_ld( 19, src );
            src_1v = vec_perm( src_1v, src_2v, _src_ );
            src_3v = vec_perm( src_2v, src_3v, _src_ );
            weight_hv = vec_u8_to_s16_h( src_1v );
            weight_lv = vec_u8_to_s16_l( src_1v );
            weight_3v = vec_u8_to_s16_h( src_3v );

            weight_hv = vec_mladd( weight_hv, scalev, roundv );
            weight_lv = vec_mladd( weight_lv, scalev, roundv );
            weight_3v = vec_mladd( weight_3v, scalev, roundv );
            weight_hv = vec_sra( weight_hv, (vec_u16_t)denomv );
            weight_lv = vec_sra( weight_lv, (vec_u16_t)denomv );
            weight_3v = vec_sra( weight_3v, (vec_u16_t)denomv );
            weight_hv = vec_add( weight_hv, offsetv );
            weight_lv = vec_add( weight_lv, offsetv );
            weight_3v = vec_add( weight_3v, offsetv );

            src_1v = vec_packsu( weight_hv, weight_lv );
            src_3v = vec_packsu( weight_3v, zero_s16v );
            vec_st( src_1v, 0, dst );
            vec_ste( (vec_u32_t)src_3v, 16, (uint32_t*)dst );
        }
    }
    else
    {
        for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
        {
            src_1v = vec_ld( 0,  src );
            src_2v = vec_ld( 16, src );
            src_3v = vec_ld( 19, src );
            src_1v = vec_perm( src_1v, src_2v, _src_ );
            src_3v = vec_perm( src_2v, src_3v, _src_ );
            weight_hv = vec_u8_to_s16_h( src_1v );
            weight_lv = vec_u8_to_s16_l( src_1v );
            weight_3v = vec_u8_to_s16_h( src_3v );

            weight_hv = vec_mladd( weight_hv, scalev, offsetv );
            weight_lv = vec_mladd( weight_lv, scalev, offsetv );
            weight_3v = vec_mladd( weight_3v, scalev, offsetv );

            src_1v = vec_packsu( weight_hv, weight_lv );
            src_3v = vec_packsu( weight_3v, zero_s16v );
            vec_st( src_1v, 0, dst );
            vec_ste( (vec_u32_t)src_3v, 16, (uint32_t*)dst );
        }
    }
}

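/* Indexed by block width >> 2 (widths 2, 4, 8, 12, 16 and 20); 12-wide
 * blocks (index 3) reuse the 16-wide kernel. */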
static weight_fn_t x264_mc_weight_wtab_altivec[6] =
{
    mc_weight_w2_altivec,
    mc_weight_w4_altivec,
    mc_weight_w8_altivec,
    mc_weight_w16_altivec,
    mc_weight_w16_altivec,
    mc_weight_w20_altivec,
};

#endif // !HIGH_BIT_DEPTH

void x264_mc_altivec_init( x264_mc_functions_t *pf )
{
#if !HIGH_BIT_DEPTH
    pf->mc_luma   = mc_luma_altivec;
    pf->get_ref   = get_ref_altivec;
    pf->mc_chroma = mc_chroma_altivec;

    pf->copy_16x16_unaligned = x264_mc_copy_w16_altivec;
    pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_altivec;

    pf->hpel_filter = x264_hpel_filter_altivec;
    pf->frame_init_lowres_core = frame_init_lowres_core_altivec;

    pf->weight = x264_mc_weight_wtab_altivec;
#endif // !HIGH_BIT_DEPTH
}
x264-snapshot-20120103-2245-stable/common/ppc/deblock.c0000644000175000001440000004212511700673342021516 0ustar  videolanusers/*****************************************************************************
 * deblock.c: ppc deblocking
 *****************************************************************************
 * Copyright (C) 2007-2011 x264 project
 *
 * Authors: Guillaume Poirier <gpoirier@mplayerhq.hu>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "common/common.h"
#include "ppccommon.h"

#if !HIGH_BIT_DEPTH
#define transpose4x16(r0, r1, r2, r3)        \
{                                            \
    register vec_u8_t r4;                    \
    register vec_u8_t r5;                    \
    register vec_u8_t r6;                    \
    register vec_u8_t r7;                    \
                                             \
    r4 = vec_mergeh(r0, r2);  /*0, 2 set 0*/ \
    r5 = vec_mergel(r0, r2);  /*0, 2 set 1*/ \
    r6 = vec_mergeh(r1, r3);  /*1, 3 set 0*/ \
    r7 = vec_mergel(r1, r3);  /*1, 3 set 1*/ \
                                             \
    r0 = vec_mergeh(r4, r6);  /*all set 0*/  \
    r1 = vec_mergel(r4, r6);  /*all set 1*/  \
    r2 = vec_mergeh(r5, r7);  /*all set 2*/  \
    r3 = vec_mergel(r5, r7);  /*all set 3*/  \
}

static inline void write16x4( uint8_t *dst, int dst_stride,
                              register vec_u8_t r0, register vec_u8_t r1,
                              register vec_u8_t r2, register vec_u8_t r3 )
{
    ALIGNED_16(unsigned char result[64]);
    uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst;
    int int_dst_stride = dst_stride >> 2;

    vec_st(r0, 0, result);
    vec_st(r1, 16, result);
    vec_st(r2, 32, result);
    vec_st(r3, 48, result);
    /* FIXME: there has to be a better way!!!! */
    *dst_int = *src_int;
    *(dst_int+   int_dst_stride) = *(src_int + 1);
    *(dst_int+ 2*int_dst_stride) = *(src_int + 2);
    *(dst_int+ 3*int_dst_stride) = *(src_int + 3);
    *(dst_int+ 4*int_dst_stride) = *(src_int + 4);
    *(dst_int+ 5*int_dst_stride) = *(src_int + 5);
    *(dst_int+ 6*int_dst_stride) = *(src_int + 6);
    *(dst_int+ 7*int_dst_stride) = *(src_int + 7);
    *(dst_int+ 8*int_dst_stride) = *(src_int + 8);
    *(dst_int+ 9*int_dst_stride) = *(src_int + 9);
    *(dst_int+10*int_dst_stride) = *(src_int + 10);
    *(dst_int+11*int_dst_stride) = *(src_int + 11);
    *(dst_int+12*int_dst_stride) = *(src_int + 12);
    *(dst_int+13*int_dst_stride) = *(src_int + 13);
    *(dst_int+14*int_dst_stride) = *(src_int + 14);
    *(dst_int+15*int_dst_stride) = *(src_int + 15);
}

/** \brief reads 16 rows of pixels from src, transposes them, and returns six rows of the transposed block in r8-r13 */
#define read_and_transpose16x6(src, src_stride, r8, r9, r10, r11, r12, r13)\
{\
    register vec_u8_t r0, r1, r2, r3, r4, r5, r6, r7, r14, r15;\
    VEC_LOAD(src,                  r0, 16, vec_u8_t, pix );    \
    VEC_LOAD(src +    src_stride,  r1, 16, vec_u8_t, pix );    \
    VEC_LOAD(src +  2*src_stride,  r2, 16, vec_u8_t, pix );    \
    VEC_LOAD(src +  3*src_stride,  r3, 16, vec_u8_t, pix );    \
    VEC_LOAD(src +  4*src_stride,  r4, 16, vec_u8_t, pix );    \
    VEC_LOAD(src +  5*src_stride,  r5, 16, vec_u8_t, pix );    \
    VEC_LOAD(src +  6*src_stride,  r6, 16, vec_u8_t, pix );    \
    VEC_LOAD(src +  7*src_stride,  r7, 16, vec_u8_t, pix );    \
    VEC_LOAD(src + 14*src_stride, r14, 16, vec_u8_t, pix );    \
    VEC_LOAD(src + 15*src_stride, r15, 16, vec_u8_t, pix );    \
                                                               \
    VEC_LOAD(src + 8*src_stride,   r8, 16, vec_u8_t, pix );    \
    VEC_LOAD(src + 9*src_stride,   r9, 16, vec_u8_t, pix );    \
    VEC_LOAD(src + 10*src_stride, r10, 16, vec_u8_t, pix );    \
    VEC_LOAD(src + 11*src_stride, r11, 16, vec_u8_t, pix );    \
    VEC_LOAD(src + 12*src_stride, r12, 16, vec_u8_t, pix );    \
    VEC_LOAD(src + 13*src_stride, r13, 16, vec_u8_t, pix );    \
                                                               \
    /*Merge first pairs*/                                      \
    r0 = vec_mergeh(r0, r8);    /*0, 8*/                       \
    r1 = vec_mergeh(r1, r9);    /*1, 9*/                       \
    r2 = vec_mergeh(r2, r10);   /*2,10*/                       \
    r3 = vec_mergeh(r3, r11);   /*3,11*/                       \
    r4 = vec_mergeh(r4, r12);   /*4,12*/                       \
    r5 = vec_mergeh(r5, r13);   /*5,13*/                       \
    r6 = vec_mergeh(r6, r14);   /*6,14*/                       \
    r7 = vec_mergeh(r7, r15);   /*7,15*/                       \
                                                               \
    /*Merge second pairs*/                                     \
    r8  = vec_mergeh(r0, r4);   /*0,4, 8,12 set 0*/            \
    r9  = vec_mergel(r0, r4);   /*0,4, 8,12 set 1*/            \
    r10 = vec_mergeh(r1, r5);   /*1,5, 9,13 set 0*/            \
    r11 = vec_mergel(r1, r5);   /*1,5, 9,13 set 1*/            \
    r12 = vec_mergeh(r2, r6);   /*2,6,10,14 set 0*/            \
    r13 = vec_mergel(r2, r6);   /*2,6,10,14 set 1*/            \
    r14 = vec_mergeh(r3, r7);   /*3,7,11,15 set 0*/            \
    r15 = vec_mergel(r3, r7);   /*3,7,11,15 set 1*/            \
                                                               \
    /*Third merge*/                                            \
    r0 = vec_mergeh(r8, r12);   /*0,2,4,6,8,10,12,14 set 0*/   \
    r1 = vec_mergel(r8, r12);   /*0,2,4,6,8,10,12,14 set 1*/   \
    r2 = vec_mergeh(r9, r13);   /*0,2,4,6,8,10,12,14 set 2*/   \
    r4 = vec_mergeh(r10, r14);  /*1,3,5,7,9,11,13,15 set 0*/   \
    r5 = vec_mergel(r10, r14);  /*1,3,5,7,9,11,13,15 set 1*/   \
    r6 = vec_mergeh(r11, r15);  /*1,3,5,7,9,11,13,15 set 2*/   \
    /* Don't need to compute 3 and 7*/                         \
                                                               \
    /*Final merge*/                                            \
    r8  = vec_mergeh(r0, r4);   /*all set 0*/                  \
    r9  = vec_mergel(r0, r4);   /*all set 1*/                  \
    r10 = vec_mergeh(r1, r5);   /*all set 2*/                  \
    r11 = vec_mergel(r1, r5);   /*all set 3*/                  \
    r12 = vec_mergeh(r2, r6);   /*all set 4*/                  \
    r13 = vec_mergel(r2, r6);   /*all set 5*/                  \
    /* Don't need to compute 14 and 15*/                       \
                                                               \
}

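/* vec_subs on unsigned bytes saturates at zero, so one of the two
 * differences below is 0 and their OR is |x-y|, with no widening needed. */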
// out: o = |x-y| < a
static inline vec_u8_t diff_lt_altivec( register vec_u8_t x, register vec_u8_t y, register vec_u8_t a )
{
    register vec_u8_t diff = vec_subs(x, y);
    register vec_u8_t diffneg = vec_subs(y, x);
    register vec_u8_t o = vec_or(diff, diffneg); /* |x-y| */
    o = (vec_u8_t)vec_cmplt(o, a);
    return o;
}

static inline vec_u8_t h264_deblock_mask( register vec_u8_t p0, register vec_u8_t p1, register vec_u8_t q0,
                                          register vec_u8_t q1, register vec_u8_t alpha, register vec_u8_t beta )
{
    register vec_u8_t mask;
    register vec_u8_t tempmask;

    mask = diff_lt_altivec(p0, q0, alpha);
    tempmask = diff_lt_altivec(p1, p0, beta);
    mask = vec_and(mask, tempmask);
    tempmask = diff_lt_altivec(q1, q0, beta);
    mask = vec_and(mask, tempmask);

    return mask;
}

// out: newp1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0)
static inline vec_u8_t h264_deblock_q1( register vec_u8_t p0, register vec_u8_t p1, register vec_u8_t p2,
                                        register vec_u8_t q0, register vec_u8_t tc0 )
{
    register vec_u8_t average = vec_avg(p0, q0);
    register vec_u8_t temp;
    register vec_u8_t unclipped;
    register vec_u8_t ones;
    register vec_u8_t max;
    register vec_u8_t min;
    register vec_u8_t newp1;

    temp = vec_xor(average, p2);
    average = vec_avg(average, p2);     /*avg(p2, avg(p0, q0)) */
    ones = vec_splat_u8(1);
    temp = vec_and(temp, ones);         /*(p2^avg(p0, q0)) & 1 */
    unclipped = vec_subs(average, temp); /* (p2+((p0+q0+1)>>1))>>1 */
    max = vec_adds(p1, tc0);
    min = vec_subs(p1, tc0);
    newp1 = vec_max(min, unclipped);
    newp1 = vec_min(max, newp1);
    return newp1;
}

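/* p0/q0 update of the normal-strength luma deblock filter.  Scalar form:
 *
 *     delta = x264_clip3( -tc, tc, (((q0 - p0) * 4 + (p1 - q1) + 4) >> 3) );
 *     p0' = clip( p0 + delta );
 *     q0' = clip( q0 - delta );
 *
 * The macro evaluates this branchlessly in unsigned bytes: stage2 carries
 * the filter term biased by 160 (A0v = 10<<4), so saturating subtraction in
 * both directions around vec160, clamped against tc0masked, yields the
 * positive and negative deltas already clipped to [0,tc]. */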
#define h264_deblock_p0_q0(p0, p1, q0, q1, tc0masked)                                           \
{                                                                                               \
    const vec_u8_t A0v = vec_sl(vec_splat_u8(10), vec_splat_u8(4));                             \
                                                                                                \
    register vec_u8_t pq0bit = vec_xor(p0,q0);                                                  \
    register vec_u8_t q1minus;                                                                  \
    register vec_u8_t p0minus;                                                                  \
    register vec_u8_t stage1;                                                                   \
    register vec_u8_t stage2;                                                                   \
    register vec_u8_t vec160;                                                                   \
    register vec_u8_t delta;                                                                    \
    register vec_u8_t deltaneg;                                                                 \
                                                                                                \
    q1minus = vec_nor(q1, q1);                /* 255 - q1 */                                    \
    stage1 = vec_avg(p1, q1minus);            /* (p1 - q1 + 256)>>1 */                          \
    stage2 = vec_sr(stage1, vec_splat_u8(1)); /* (p1 - q1 + 256)>>2 = 64 + (p1 - q1) >> 2 */    \
    p0minus = vec_nor(p0, p0);                /* 255 - p0 */                                    \
    stage1 = vec_avg(q0, p0minus);            /* (q0 - p0 + 256)>>1 */                          \
    pq0bit = vec_and(pq0bit, vec_splat_u8(1));                                                  \
    stage2 = vec_avg(stage2, pq0bit);         /* 32 + ((q0 - p0)&1 + (p1 - q1) >> 2 + 1) >> 1 */\
    stage2 = vec_adds(stage2, stage1);        /* 160 + ((p0 - q0) + (p1 - q1) >> 2 + 1) >> 1 */ \
    vec160 = vec_ld(0, &A0v);                                                                   \
    deltaneg = vec_subs(vec160, stage2);      /* -d */                                          \
    delta = vec_subs(stage2, vec160);         /*  d */                                          \
    deltaneg = vec_min(tc0masked, deltaneg);                                                    \
    delta = vec_min(tc0masked, delta);                                                          \
    p0 = vec_subs(p0, deltaneg);                                                                \
    q0 = vec_subs(q0, delta);                                                                   \
    p0 = vec_adds(p0, delta);                                                                   \
    q0 = vec_adds(q0, deltaneg);                                                                \
}

#define h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0)              \
{                                                                                            \
    ALIGNED_16(unsigned char temp[16]);                                                      \
    register vec_u8_t alphavec;                                                              \
    register vec_u8_t betavec;                                                               \
    register vec_u8_t mask;                                                                  \
    register vec_u8_t p1mask;                                                                \
    register vec_u8_t q1mask;                                                                \
    register vec_s8_t tc0vec;                                                                \
    register vec_u8_t finaltc0;                                                              \
    register vec_u8_t tc0masked;                                                             \
    register vec_u8_t newp1;                                                                 \
    register vec_u8_t newq1;                                                                 \
                                                                                             \
    temp[0] = alpha;                                                                         \
    temp[1] = beta;                                                                          \
    alphavec = vec_ld(0, temp);                                                              \
    betavec = vec_splat(alphavec, 0x1);                                                      \
    alphavec = vec_splat(alphavec, 0x0);                                                     \
    mask = h264_deblock_mask(p0, p1, q0, q1, alphavec, betavec); /*if in block */            \
                                                                                             \
    M32( temp ) = M32( tc0 );                                                                \
    tc0vec = vec_ld(0, (signed char*)temp);                                                  \
    tc0vec = vec_mergeh(tc0vec, tc0vec);                                                     \
    tc0vec = vec_mergeh(tc0vec, tc0vec);                                                     \
    mask = vec_and(mask, vec_cmpgt(tc0vec, vec_splat_s8(-1)));  /* if tc0[i] >= 0 */         \
    finaltc0 = vec_and((vec_u8_t)tc0vec, mask);                 /* tc = tc0 */               \
                                                                                             \
    p1mask = diff_lt_altivec(p2, p0, betavec);                                               \
    p1mask = vec_and(p1mask, mask);                             /* if( |p2 - p0| < beta) */  \
    tc0masked = vec_and(p1mask, (vec_u8_t)tc0vec);                                           \
    finaltc0 = vec_sub(finaltc0, p1mask);                       /* tc++ */                   \
    newp1 = h264_deblock_q1(p0, p1, p2, q0, tc0masked);                                      \
    /*end if*/                                                                               \
                                                                                             \
    q1mask = diff_lt_altivec(q2, q0, betavec);                                               \
    q1mask = vec_and(q1mask, mask);                             /* if ( |q2 - q0| < beta ) */\
    tc0masked = vec_and(q1mask, (vec_u8_t)tc0vec);                                           \
    finaltc0 = vec_sub(finaltc0, q1mask);                       /* tc++ */                   \
    newq1 = h264_deblock_q1(p0, q1, q2, q0, tc0masked);                                      \
    /*end if*/                                                                               \
                                                                                             \
    h264_deblock_p0_q0(p0, p1, q0, q1, finaltc0);                                            \
    p1 = newp1;                                                                              \
    q1 = newq1;                                                                              \
}

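/* tc0[i] == -1 marks a disabled 4-pixel segment of the edge.  The bitwise
 * AND of the four int8 values is negative only when all four are negative,
 * so the filter body runs if any segment is enabled; segments with
 * tc0[i] < 0 are masked off inside the macro via the cmpgt against -1. */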
void x264_deblock_v_luma_altivec( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
    if( (tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0 )
    {
        register vec_u8_t p2 = vec_ld(-3*stride, pix);
        register vec_u8_t p1 = vec_ld(-2*stride, pix);
        register vec_u8_t p0 = vec_ld(-1*stride, pix);
        register vec_u8_t q0 = vec_ld(0, pix);
        register vec_u8_t q1 = vec_ld(stride, pix);
        register vec_u8_t q2 = vec_ld(2*stride, pix);
        h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0);
        vec_st(p1, -2*stride, pix);
        vec_st(p0, -1*stride, pix);
        vec_st(q0, 0, pix);
        vec_st(q1, stride, pix);
    }
}

void x264_deblock_h_luma_altivec( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
    register vec_u8_t line0, line1, line2, line3, line4, line5;
    if( (tc0[0] & tc0[1] & tc0[2] & tc0[3]) < 0 )
        return;
    PREP_LOAD;
    vec_u8_t _pix_ = vec_lvsl(0, pix-3);
    read_and_transpose16x6(pix-3, stride, line0, line1, line2, line3, line4, line5);
    h264_loop_filter_luma_altivec(line0, line1, line2, line3, line4, line5, alpha, beta, tc0);
    transpose4x16(line1, line2, line3, line4);
    write16x4(pix-2, stride, line1, line2, line3, line4);
}
#endif // !HIGH_BIT_DEPTH
x264-snapshot-20120103-2245-stable/common/ppc/dct.h0000644000175000001440000000430111700673342020664 0ustar  videolanusers/*****************************************************************************
 * dct.h: ppc transform and zigzag
 *****************************************************************************
 * Copyright (C) 2003-2011 x264 project
 *
 * Authors: Eric Petit <eric.petit@lapsus.org>
 *          Guillaume Poirier <gpoirier@mplayerhq.hu>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#ifndef X264_PPC_DCT_H
#define X264_PPC_DCT_H

void x264_sub4x4_dct_altivec( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_altivec( int16_t dct[4][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct_altivec( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );

void x264_add4x4_idct_altivec( uint8_t *p_dst, int16_t dct[16] );
void x264_add8x8_idct_altivec( uint8_t *p_dst, int16_t dct[4][16] );
void x264_add16x16_idct_altivec( uint8_t *p_dst, int16_t dct[16][16] );

void x264_sub8x8_dct8_altivec( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct8_altivec( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );

void x264_add8x8_idct8_altivec( uint8_t *dst, int16_t dct[64] );
void x264_add16x16_idct8_altivec( uint8_t *dst, int16_t dct[4][64] );

void x264_zigzag_scan_4x4_frame_altivec( int16_t level[16], int16_t dct[16] );
void x264_zigzag_scan_4x4_field_altivec( int16_t level[16], int16_t dct[16] );

#endif
x264-snapshot-20120103-2245-stable/common/ppc/dct.c0000644000175000001440000005174111700673342020671 0ustar  videolanusers/*****************************************************************************
 * dct.c: ppc transform and zigzag
 *****************************************************************************
 * Copyright (C) 2003-2011 x264 project
 *
 * Authors: Guillaume Poirier <gpoirier@mplayerhq.hu>
 *          Eric Petit <eric.petit@lapsus.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "common/common.h"
#include "ppccommon.h"

#if !HIGH_BIT_DEPTH
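/* One stage of the H.264 4x4 forward transform: the butterfly applies
 *
 *     [ 1  1  1  1 ]
 *     [ 2  1 -1 -2 ]
 *     [ 1 -1 -1  1 ]
 *     [ 1 -2  2 -1 ]
 *
 * to (a0,a1,a2,a3), leaving the result in (b0,b1,b2,b3). */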
#define VEC_DCT(a0,a1,a2,a3,b0,b1,b2,b3) \
    b1 = vec_add( a0, a3 );              \
    b3 = vec_add( a1, a2 );              \
    b0 = vec_add( b1, b3 );              \
    b2 = vec_sub( b1, b3 );              \
    a0 = vec_sub( a0, a3 );              \
    a1 = vec_sub( a1, a2 );              \
    b1 = vec_add( a0, a0 );              \
    b1 = vec_add( b1, a1 );              \
    b3 = vec_sub( a0, a1 );              \
    b3 = vec_sub( b3, a1 )

void x264_sub4x4_dct_altivec( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 )
{
    PREP_DIFF_8BYTEALIGNED;
    vec_s16_t dct0v, dct1v, dct2v, dct3v;
    vec_s16_t tmp0v, tmp1v, tmp2v, tmp3v;

    vec_u8_t permHighv;

    VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 4, dct0v );
    VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 4, dct1v );
    VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 4, dct2v );
    VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 4, dct3v );
    VEC_DCT( dct0v, dct1v, dct2v, dct3v, tmp0v, tmp1v, tmp2v, tmp3v );
    VEC_TRANSPOSE_4( tmp0v, tmp1v, tmp2v, tmp3v,
                     dct0v, dct1v, dct2v, dct3v );
    permHighv = (vec_u8_t) CV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17);
    VEC_DCT( dct0v, dct1v, dct2v, dct3v, tmp0v, tmp1v, tmp2v, tmp3v );

    vec_st(vec_perm(tmp0v, tmp1v, permHighv), 0,  dct);
    vec_st(vec_perm(tmp2v, tmp3v, permHighv), 16, dct);
}

void x264_sub8x8_dct_altivec( int16_t dct[4][16], uint8_t *pix1, uint8_t *pix2 )
{
    PREP_DIFF_8BYTEALIGNED;
    vec_s16_t dct0v, dct1v, dct2v, dct3v, dct4v, dct5v, dct6v, dct7v;
    vec_s16_t tmp0v, tmp1v, tmp2v, tmp3v, tmp4v, tmp5v, tmp6v, tmp7v;

    vec_u8_t permHighv, permLowv;

    VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct0v );
    VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct1v );
    VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct2v );
    VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct3v );
    VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct4v );
    VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct5v );
    VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct6v );
    VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct7v );
    VEC_DCT( dct0v, dct1v, dct2v, dct3v, tmp0v, tmp1v, tmp2v, tmp3v );
    VEC_DCT( dct4v, dct5v, dct6v, dct7v, tmp4v, tmp5v, tmp6v, tmp7v );
    VEC_TRANSPOSE_8( tmp0v, tmp1v, tmp2v, tmp3v,
                     tmp4v, tmp5v, tmp6v, tmp7v,
                     dct0v, dct1v, dct2v, dct3v,
                     dct4v, dct5v, dct6v, dct7v );

    permHighv = (vec_u8_t) CV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17);
    permLowv  = (vec_u8_t) CV(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F);

    VEC_DCT( dct0v, dct1v, dct2v, dct3v, tmp0v, tmp1v, tmp2v, tmp3v );
    VEC_DCT( dct4v, dct5v, dct6v, dct7v, tmp4v, tmp5v, tmp6v, tmp7v );

    vec_st(vec_perm(tmp0v, tmp1v, permHighv), 0,   *dct);
    vec_st(vec_perm(tmp2v, tmp3v, permHighv), 16,  *dct);
    vec_st(vec_perm(tmp4v, tmp5v, permHighv), 32,  *dct);
    vec_st(vec_perm(tmp6v, tmp7v, permHighv), 48,  *dct);
    vec_st(vec_perm(tmp0v, tmp1v, permLowv),  64,  *dct);
    vec_st(vec_perm(tmp2v, tmp3v, permLowv),  80,  *dct);
    vec_st(vec_perm(tmp4v, tmp5v, permLowv),  96,  *dct);
    vec_st(vec_perm(tmp6v, tmp7v, permLowv),  112, *dct);
}

void x264_sub16x16_dct_altivec( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 )
{
    x264_sub8x8_dct_altivec( &dct[ 0], &pix1[0], &pix2[0] );
    x264_sub8x8_dct_altivec( &dct[ 4], &pix1[8], &pix2[8] );
    x264_sub8x8_dct_altivec( &dct[ 8], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
    x264_sub8x8_dct_altivec( &dct[12], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
}

/***************************************************************************
 * 8x8 transform:
 ***************************************************************************/

/* DCT8_1D unrolled by 8 in Altivec */
#define DCT8_1D_ALTIVEC( dct0v, dct1v, dct2v, dct3v, dct4v, dct5v, dct6v, dct7v ) \
{ \
    /* int s07 = SRC(0) + SRC(7);         */ \
    vec_s16_t s07v = vec_add( dct0v, dct7v); \
    /* int s16 = SRC(1) + SRC(6);         */ \
    vec_s16_t s16v = vec_add( dct1v, dct6v); \
    /* int s25 = SRC(2) + SRC(5);         */ \
    vec_s16_t s25v = vec_add( dct2v, dct5v); \
    /* int s34 = SRC(3) + SRC(4);         */ \
    vec_s16_t s34v = vec_add( dct3v, dct4v); \
\
    /* int a0 = s07 + s34;                */ \
    vec_s16_t a0v = vec_add(s07v, s34v);     \
    /* int a1 = s16 + s25;                */ \
    vec_s16_t a1v = vec_add(s16v, s25v);     \
    /* int a2 = s07 - s34;                */ \
    vec_s16_t a2v = vec_sub(s07v, s34v);     \
    /* int a3 = s16 - s25;                */ \
    vec_s16_t a3v = vec_sub(s16v, s25v);     \
\
    /* int d07 = SRC(0) - SRC(7);         */ \
    vec_s16_t d07v = vec_sub( dct0v, dct7v); \
    /* int d16 = SRC(1) - SRC(6);         */ \
    vec_s16_t d16v = vec_sub( dct1v, dct6v); \
    /* int d25 = SRC(2) - SRC(5);         */ \
    vec_s16_t d25v = vec_sub( dct2v, dct5v); \
    /* int d34 = SRC(3) - SRC(4);         */ \
    vec_s16_t d34v = vec_sub( dct3v, dct4v); \
\
    /* int a4 = d16 + d25 + (d07 + (d07>>1)); */ \
    vec_s16_t a4v = vec_add( vec_add(d16v, d25v), vec_add(d07v, vec_sra(d07v, onev)) );\
    /* int a5 = d07 - d34 - (d25 + (d25>>1)); */ \
    vec_s16_t a5v = vec_sub( vec_sub(d07v, d34v), vec_add(d25v, vec_sra(d25v, onev)) );\
    /* int a6 = d07 + d34 - (d16 + (d16>>1)); */ \
    vec_s16_t a6v = vec_sub( vec_add(d07v, d34v), vec_add(d16v, vec_sra(d16v, onev)) );\
    /* int a7 = d16 - d25 + (d34 + (d34>>1)); */ \
    vec_s16_t a7v = vec_add( vec_sub(d16v, d25v), vec_add(d34v, vec_sra(d34v, onev)) );\
\
    /* DST(0) =  a0 + a1;                    */ \
    dct0v = vec_add( a0v, a1v );                \
    /* DST(1) =  a4 + (a7>>2);               */ \
    dct1v = vec_add( a4v, vec_sra(a7v, twov) ); \
    /* DST(2) =  a2 + (a3>>1);               */ \
    dct2v = vec_add( a2v, vec_sra(a3v, onev) ); \
    /* DST(3) =  a5 + (a6>>2);               */ \
    dct3v = vec_add( a5v, vec_sra(a6v, twov) ); \
    /* DST(4) =  a0 - a1;                    */ \
    dct4v = vec_sub( a0v, a1v );                \
    /* DST(5) =  a6 - (a5>>2);               */ \
    dct5v = vec_sub( a6v, vec_sra(a5v, twov) ); \
    /* DST(6) = (a2>>1) - a3 ;               */ \
    dct6v = vec_sub( vec_sra(a2v, onev), a3v ); \
    /* DST(7) = (a4>>2) - a7 ;               */ \
    dct7v = vec_sub( vec_sra(a4v, twov), a7v ); \
}


void x264_sub8x8_dct8_altivec( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 )
{
    vec_u16_t onev = vec_splat_u16(1);
    vec_u16_t twov = vec_add( onev, onev );

    PREP_DIFF_8BYTEALIGNED;

    vec_s16_t dct0v, dct1v, dct2v, dct3v,
              dct4v, dct5v, dct6v, dct7v;

    VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct0v );
    VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct1v );
    VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct2v );
    VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct3v );

    VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct4v );
    VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct5v );
    VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct6v );
    VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct7v );

    DCT8_1D_ALTIVEC( dct0v, dct1v, dct2v, dct3v,
                     dct4v, dct5v, dct6v, dct7v );

    vec_s16_t dct_tr0v, dct_tr1v, dct_tr2v, dct_tr3v,
        dct_tr4v, dct_tr5v, dct_tr6v, dct_tr7v;

    VEC_TRANSPOSE_8(dct0v, dct1v, dct2v, dct3v,
                    dct4v, dct5v, dct6v, dct7v,
                    dct_tr0v, dct_tr1v, dct_tr2v, dct_tr3v,
                    dct_tr4v, dct_tr5v, dct_tr6v, dct_tr7v );

    DCT8_1D_ALTIVEC( dct_tr0v, dct_tr1v, dct_tr2v, dct_tr3v,
                     dct_tr4v, dct_tr5v, dct_tr6v, dct_tr7v );

    vec_st( dct_tr0v,  0,  dct );
    vec_st( dct_tr1v, 16,  dct );
    vec_st( dct_tr2v, 32,  dct );
    vec_st( dct_tr3v, 48,  dct );

    vec_st( dct_tr4v, 64,  dct );
    vec_st( dct_tr5v, 80,  dct );
    vec_st( dct_tr6v, 96,  dct );
    vec_st( dct_tr7v, 112, dct );
}

void x264_sub16x16_dct8_altivec( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 )
{
    x264_sub8x8_dct8_altivec( dct[0], &pix1[0],               &pix2[0] );
    x264_sub8x8_dct8_altivec( dct[1], &pix1[8],               &pix2[8] );
    x264_sub8x8_dct8_altivec( dct[2], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
    x264_sub8x8_dct8_altivec( dct[3], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
}


/****************************************************************************
 * IDCT transform:
 ****************************************************************************/

#define IDCT_1D_ALTIVEC(s0, s1, s2, s3,  d0, d1, d2, d3) \
{                                                        \
    /*        a0  = SRC(0) + SRC(2); */                  \
    vec_s16_t a0v = vec_add(s0, s2);                     \
    /*        a1  = SRC(0) - SRC(2); */                  \
    vec_s16_t a1v = vec_sub(s0, s2);                     \
    /*        a2  =           (SRC(1)>>1) - SRC(3); */   \
    vec_s16_t a2v = vec_sub(vec_sra(s1, onev), s3);      \
    /*        a3  =           (SRC(3)>>1) + SRC(1); */   \
    vec_s16_t a3v = vec_add(vec_sra(s3, onev), s1);      \
    /* DST(0,    a0 + a3); */                            \
    d0 = vec_add(a0v, a3v);                              \
    /* DST(1,    a1 + a2); */                            \
    d1 = vec_add(a1v, a2v);                              \
    /* DST(2,    a1 - a2); */                            \
    d2 = vec_sub(a1v, a2v);                              \
    /* DST(3,    a0 - a3); */                            \
    d3 = vec_sub(a0v, a3v);                              \
}

#define VEC_LOAD_U8_ADD_S16_STORE_U8(va)             \
    vdst_orig = vec_ld(0, dst);                      \
    vdst = vec_perm(vdst_orig, zero_u8v, vdst_mask); \
    vdst_ss = (vec_s16_t)vec_mergeh(zero_u8v, vdst); \
    va = vec_add(va, vdst_ss);                       \
    va_u8 = vec_s16_to_u8(va);                       \
    va_u32 = vec_splat((vec_u32_t)va_u8, 0);         \
    vec_ste(va_u32, element, (uint32_t*)dst);

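/* Add the >>6-normalized residual to four reconstructed pixels: unaligned
 * 4-byte load via vec_perm, saturating add, then a single 32-bit vec_ste.
 * The sum is splatted across the whole vector first, so whichever word slot
 * the destination's offset within its 16-byte block selects receives the
 * same data; dest is assumed 4-byte aligned. */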
#define ALTIVEC_STORE4_SUM_CLIP(dest, idctv, perm_ldv)          \
{                                                               \
    /* unaligned load */                                        \
    vec_u8_t lv = vec_ld(0, dest);                              \
    vec_u8_t dstv = vec_perm(lv, zero_u8v, (vec_u8_t)perm_ldv); \
    vec_s16_t idct_sh6 = vec_sra(idctv, sixv);                  \
    vec_u16_t dst16 = (vec_u16_t)vec_mergeh(zero_u8v, dstv);    \
    vec_s16_t idstsum = vec_adds(idct_sh6, (vec_s16_t)dst16);   \
    vec_u8_t idstsum8 = vec_s16_to_u8(idstsum);                 \
    /* unaligned store */                                       \
    vec_u32_t bodyv = vec_splat((vec_u32_t)idstsum8, 0);        \
    int element = ((unsigned long)dest & 0xf) >> 2;             \
    vec_ste(bodyv, element, (uint32_t *)dest);                  \
}

void x264_add4x4_idct_altivec( uint8_t *dst, int16_t dct[16] )
{
    vec_u16_t onev = vec_splat_u16(1);

    dct[0] += 32; // rounding for the >>6 at the end

    vec_s16_t s0, s1, s2, s3;

    s0 = vec_ld( 0x00, dct );
    s1 = vec_sld( s0, s0, 8 );
    s2 = vec_ld( 0x10, dct );
    s3 = vec_sld( s2, s2, 8 );

    vec_s16_t d0, d1, d2, d3;
    IDCT_1D_ALTIVEC( s0, s1, s2, s3, d0, d1, d2, d3 );

    vec_s16_t tr0, tr1, tr2, tr3;

    VEC_TRANSPOSE_4( d0, d1, d2, d3, tr0, tr1, tr2, tr3 );

    vec_s16_t idct0, idct1, idct2, idct3;
    IDCT_1D_ALTIVEC( tr0, tr1, tr2, tr3, idct0, idct1, idct2, idct3 );

    vec_u8_t perm_ldv = vec_lvsl( 0, dst );
    vec_u16_t sixv = vec_splat_u16(6);
    LOAD_ZERO;

    ALTIVEC_STORE4_SUM_CLIP( &dst[0*FDEC_STRIDE], idct0, perm_ldv );
    ALTIVEC_STORE4_SUM_CLIP( &dst[1*FDEC_STRIDE], idct1, perm_ldv );
    ALTIVEC_STORE4_SUM_CLIP( &dst[2*FDEC_STRIDE], idct2, perm_ldv );
    ALTIVEC_STORE4_SUM_CLIP( &dst[3*FDEC_STRIDE], idct3, perm_ldv );
}

void x264_add8x8_idct_altivec( uint8_t *p_dst, int16_t dct[4][16] )
{
    x264_add4x4_idct_altivec( &p_dst[0],               dct[0] );
    x264_add4x4_idct_altivec( &p_dst[4],               dct[1] );
    x264_add4x4_idct_altivec( &p_dst[4*FDEC_STRIDE+0], dct[2] );
    x264_add4x4_idct_altivec( &p_dst[4*FDEC_STRIDE+4], dct[3] );
}

void x264_add16x16_idct_altivec( uint8_t *p_dst, int16_t dct[16][16] )
{
    x264_add8x8_idct_altivec( &p_dst[0],               &dct[0] );
    x264_add8x8_idct_altivec( &p_dst[8],               &dct[4] );
    x264_add8x8_idct_altivec( &p_dst[8*FDEC_STRIDE+0], &dct[8] );
    x264_add8x8_idct_altivec( &p_dst[8*FDEC_STRIDE+8], &dct[12] );
}

#define IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7,  d0, d1, d2, d3, d4, d5, d6, d7)\
{\
    /*        a0  = SRC(0) + SRC(4); */ \
    vec_s16_t a0v = vec_add(s0, s4);    \
    /*        a2  = SRC(0) - SRC(4); */ \
    vec_s16_t a2v = vec_sub(s0, s4);    \
    /*        a4  =           (SRC(2)>>1) - SRC(6); */ \
    vec_s16_t a4v = vec_sub(vec_sra(s2, onev), s6);    \
    /*        a6  =           (SRC(6)>>1) + SRC(2); */ \
    vec_s16_t a6v = vec_add(vec_sra(s6, onev), s2);    \
    /*        b0  =         a0 + a6; */ \
    vec_s16_t b0v = vec_add(a0v, a6v);  \
    /*        b2  =         a2 + a4; */ \
    vec_s16_t b2v = vec_add(a2v, a4v);  \
    /*        b4  =         a2 - a4; */ \
    vec_s16_t b4v = vec_sub(a2v, a4v);  \
    /*        b6  =         a0 - a6; */ \
    vec_s16_t b6v = vec_sub(a0v, a6v);  \
    /* a1 =  SRC(5) - SRC(3) - SRC(7) - (SRC(7)>>1); */ \
    /*        a1 =             (SRC(5)-SRC(3)) -  (SRC(7)  +  (SRC(7)>>1)); */ \
    vec_s16_t a1v = vec_sub( vec_sub(s5, s3), vec_add(s7, vec_sra(s7, onev)) );\
    /* a3 =  SRC(7) + SRC(1) - SRC(3) - (SRC(3)>>1); */ \
    /*        a3 =             (SRC(7)+SRC(1)) -  (SRC(3)  +  (SRC(3)>>1)); */ \
    vec_s16_t a3v = vec_sub( vec_add(s7, s1), vec_add(s3, vec_sra(s3, onev)) );\
    /* a5 =  SRC(7) - SRC(1) + SRC(5) + (SRC(5)>>1); */ \
    /*        a5 =             (SRC(7)-SRC(1)) +   SRC(5) +   (SRC(5)>>1); */  \
    vec_s16_t a5v = vec_add( vec_sub(s7, s1), vec_add(s5, vec_sra(s5, onev)) );\
    /*        a7 =                SRC(5)+SRC(3) +  SRC(1) +   (SRC(1)>>1); */  \
    vec_s16_t a7v = vec_add( vec_add(s5, s3), vec_add(s1, vec_sra(s1, onev)) );\
    /*        b1 =                  (a7>>2)  +  a1; */  \
    vec_s16_t b1v = vec_add( vec_sra(a7v, twov), a1v);  \
    /*        b3 =          a3 +        (a5>>2); */     \
    vec_s16_t b3v = vec_add(a3v, vec_sra(a5v, twov));   \
    /*        b5 =                  (a3>>2)  -   a5; */ \
    vec_s16_t b5v = vec_sub( vec_sra(a3v, twov), a5v);  \
    /*        b7 =           a7 -        (a1>>2); */    \
    vec_s16_t b7v = vec_sub( a7v, vec_sra(a1v, twov));  \
    /* DST(0,    b0 + b7); */ \
    d0 = vec_add(b0v, b7v); \
    /* DST(1,    b2 + b5); */ \
    d1 = vec_add(b2v, b5v); \
    /* DST(2,    b4 + b3); */ \
    d2 = vec_add(b4v, b3v); \
    /* DST(3,    b6 + b1); */ \
    d3 = vec_add(b6v, b1v); \
    /* DST(4,    b6 - b1); */ \
    d4 = vec_sub(b6v, b1v); \
    /* DST(5,    b4 - b3); */ \
    d5 = vec_sub(b4v, b3v); \
    /* DST(6,    b2 - b5); */ \
    d6 = vec_sub(b2v, b5v); \
    /* DST(7,    b0 - b7); */ \
    d7 = vec_sub(b0v, b7v); \
}

#define ALTIVEC_STORE_SUM_CLIP(dest, idctv, perm_ldv, perm_stv, sel)\
{\
    /* unaligned load */                                       \
    vec_u8_t hv = vec_ld( 0, dest );                           \
    vec_u8_t lv = vec_ld( 7, dest );                           \
    vec_u8_t dstv   = vec_perm( hv, lv, (vec_u8_t)perm_ldv );  \
    vec_s16_t idct_sh6 = vec_sra(idctv, sixv);                 \
    vec_u16_t dst16 = (vec_u16_t)vec_mergeh(zero_u8v, dstv);   \
    vec_s16_t idstsum = vec_adds(idct_sh6, (vec_s16_t)dst16);  \
    vec_u8_t idstsum8 = vec_packsu(zero_s16v, idstsum);        \
    /* unaligned store */                                      \
    vec_u8_t bodyv  = vec_perm( idstsum8, idstsum8, perm_stv );\
    vec_u8_t edgelv = vec_perm( sel, zero_u8v, perm_stv );     \
    lv    = vec_sel( lv, bodyv, edgelv );                      \
    vec_st( lv, 7, dest );                                     \
    hv    = vec_ld( 0, dest );                                 \
    vec_u8_t edgehv = vec_perm( zero_u8v, sel, perm_stv );     \
    hv    = vec_sel( hv, bodyv, edgehv );                      \
    vec_st( hv, 0, dest );                                     \
}

void x264_add8x8_idct8_altivec( uint8_t *dst, int16_t dct[64] )
{
    vec_u16_t onev = vec_splat_u16(1);
    vec_u16_t twov = vec_splat_u16(2);

    dct[0] += 32; // rounding for the >>6 at the end

    vec_s16_t s0, s1, s2, s3, s4, s5, s6, s7;

    s0 = vec_ld(0x00, dct);
    s1 = vec_ld(0x10, dct);
    s2 = vec_ld(0x20, dct);
    s3 = vec_ld(0x30, dct);
    s4 = vec_ld(0x40, dct);
    s5 = vec_ld(0x50, dct);
    s6 = vec_ld(0x60, dct);
    s7 = vec_ld(0x70, dct);

    vec_s16_t d0, d1, d2, d3, d4, d5, d6, d7;
    IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7,  d0, d1, d2, d3, d4, d5, d6, d7);

    vec_s16_t tr0, tr1, tr2, tr3, tr4, tr5, tr6, tr7;

    VEC_TRANSPOSE_8( d0,  d1,  d2,  d3,  d4,  d5,  d6, d7,
                    tr0, tr1, tr2, tr3, tr4, tr5, tr6, tr7);

    vec_s16_t idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7;
    IDCT8_1D_ALTIVEC(tr0,     tr1,   tr2,   tr3,   tr4,   tr5,   tr6,   tr7,
                     idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7);

    vec_u8_t perm_ldv = vec_lvsl(0, dst);
    vec_u8_t perm_stv = vec_lvsr(8, dst);
    vec_u16_t sixv = vec_splat_u16(6);
    const vec_u8_t sel = (vec_u8_t) CV(0,0,0,0,0,0,0,0,-1,-1,-1,-1,-1,-1,-1,-1);
    LOAD_ZERO;

    ALTIVEC_STORE_SUM_CLIP(&dst[0*FDEC_STRIDE], idct0, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[1*FDEC_STRIDE], idct1, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[2*FDEC_STRIDE], idct2, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[3*FDEC_STRIDE], idct3, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[4*FDEC_STRIDE], idct4, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[5*FDEC_STRIDE], idct5, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[6*FDEC_STRIDE], idct6, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[7*FDEC_STRIDE], idct7, perm_ldv, perm_stv, sel);
}

void x264_add16x16_idct8_altivec( uint8_t *dst, int16_t dct[4][64] )
{
    x264_add8x8_idct8_altivec( &dst[0],               dct[0] );
    x264_add8x8_idct8_altivec( &dst[8],               dct[1] );
    x264_add8x8_idct8_altivec( &dst[8*FDEC_STRIDE+0], dct[2] );
    x264_add8x8_idct8_altivec( &dst[8*FDEC_STRIDE+8], dct[3] );
}

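/* Each byte pair of the sel0/sel1 constants picks one int16 coefficient, so
 * the two vec_perms below realize the frame scan order
 *     level[i] = dct[ 0, 4, 1, 2, 5, 8, 12, 9, 6, 3, 7, 10, 13, 14, 11, 15 ]
 * in a single pass. */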
void x264_zigzag_scan_4x4_frame_altivec( int16_t level[16], int16_t dct[16] )
{
    vec_s16_t dct0v, dct1v;
    vec_s16_t tmp0v, tmp1v;

    dct0v = vec_ld(0x00, dct);
    dct1v = vec_ld(0x10, dct);

    const vec_u8_t sel0 = (vec_u8_t) CV(0,1,8,9,2,3,4,5,10,11,16,17,24,25,18,19);
    const vec_u8_t sel1 = (vec_u8_t) CV(12,13,6,7,14,15,20,21,26,27,28,29,22,23,30,31);

    tmp0v = vec_perm( dct0v, dct1v, sel0 );
    tmp1v = vec_perm( dct0v, dct1v, sel1 );

    vec_st( tmp0v, 0x00, level );
    vec_st( tmp1v, 0x10, level );
}

void x264_zigzag_scan_4x4_field_altivec( int16_t level[16], int16_t dct[16] )
{
    vec_s16_t dct0v, dct1v;
    vec_s16_t tmp0v, tmp1v;

    dct0v = vec_ld(0x00, dct);
    dct1v = vec_ld(0x10, dct);

    const vec_u8_t sel0 = (vec_u8_t) CV(0,1,2,3,8,9,4,5,6,7,10,11,12,13,14,15);

    tmp0v = vec_perm( dct0v, dct1v, sel0 );
    tmp1v = dct1v;

    vec_st( tmp0v, 0x00, level );
    vec_st( tmp1v, 0x10, level );
}
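/* Editorial note: the field scan only shuffles the first half -- sel0 picks
 * elements 0,1,4,2,3,5,6,7, and the second input vector (elements 8..15)
 * passes through unchanged. */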
#endif // !HIGH_BIT_DEPTH

x264-snapshot-20120103-2245-stable/common/pixel.h
/*****************************************************************************
 * pixel.h: pixel metrics
 *****************************************************************************
 * Copyright (C) 2004-2011 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *          Jason Garrett-Glaser <darkshikari@gmail.com>
 *          Henrik Gramner <hengar-6@student.ltu.se>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#ifndef X264_PIXEL_H
#define X264_PIXEL_H

// SSD assumes all args aligned
// other cmp functions assume first arg aligned
typedef int  (*x264_pixel_cmp_t) ( pixel *, int, pixel *, int );
typedef void (*x264_pixel_cmp_x3_t) ( pixel *, pixel *, pixel *, pixel *, int, int[3] );
typedef void (*x264_pixel_cmp_x4_t) ( pixel *, pixel *, pixel *, pixel *, pixel *, int, int[4] );

enum
{
    PIXEL_16x16 = 0,
    PIXEL_16x8  = 1,
    PIXEL_8x16  = 2,
    PIXEL_8x8   = 3,
    PIXEL_8x4   = 4,
    PIXEL_4x8   = 5,
    PIXEL_4x4   = 6,

    /* Subsampled chroma only */
    PIXEL_4x16  = 7,  /* 4:2:2 */
    PIXEL_4x2   = 8,
    PIXEL_2x8   = 9,  /* 4:2:2 */
    PIXEL_2x4   = 10,
    PIXEL_2x2   = 11,
};

static const struct { uint8_t w, h; } x264_pixel_size[12] =
{
    { 16, 16 }, { 16, 8 }, { 8, 16 }, { 8, 8 }, { 8, 4 }, { 4, 8 }, { 4, 4 },
    {  4, 16 }, {  4, 2 }, { 2,  8 }, { 2, 4 }, { 2, 2 },
};

static const uint8_t x264_size2pixel[5][5] =
{
    { 0, },
    { 0, PIXEL_4x4, PIXEL_8x4, 0, 0 },
    { 0, PIXEL_4x8, PIXEL_8x8, 0, PIXEL_16x8 },
    { 0, },
    { 0, 0,        PIXEL_8x16, 0, PIXEL_16x16 }
};

static const uint8_t x264_luma2chroma_pixel[4][7] =
{
    { 0 },
    { PIXEL_8x8,   PIXEL_8x4,  PIXEL_4x8,  PIXEL_4x4, PIXEL_4x2, PIXEL_2x4, PIXEL_2x2 }, /* 4:2:0 */
    { PIXEL_8x16,  PIXEL_8x8,  PIXEL_4x16, PIXEL_4x8, PIXEL_4x4, PIXEL_2x8, PIXEL_2x4 }, /* 4:2:2 */
    { PIXEL_16x16, PIXEL_16x8, PIXEL_8x16, PIXEL_8x8, PIXEL_8x4, PIXEL_4x8, PIXEL_4x4 }, /* 4:4:4 */
};

typedef struct
{
    x264_pixel_cmp_t  sad[8];
    x264_pixel_cmp_t  ssd[8];
    x264_pixel_cmp_t satd[8];
    x264_pixel_cmp_t ssim[7];
    x264_pixel_cmp_t sa8d[4];
    x264_pixel_cmp_t mbcmp[8]; /* either satd or sad for subpel refine and mode decision */
    x264_pixel_cmp_t mbcmp_unaligned[8]; /* unaligned mbcmp for subpel */
    x264_pixel_cmp_t fpelcmp[8]; /* either satd or sad for fullpel motion search */
    x264_pixel_cmp_x3_t fpelcmp_x3[7];
    x264_pixel_cmp_x4_t fpelcmp_x4[7];
    x264_pixel_cmp_t sad_aligned[8]; /* Aligned SAD for mbcmp */
    int (*vsad)( pixel *, int, int );

    uint64_t (*var[4])( pixel *pix, int stride );
    int (*var2[4])( pixel *pix1, int stride1,
                    pixel *pix2, int stride2, int *ssd );
    uint64_t (*hadamard_ac[4])( pixel *pix, int stride );

    void (*ssd_nv12_core)( pixel *pixuv1, int stride1,
                           pixel *pixuv2, int stride2, int width, int height,
                           uint64_t *ssd_u, uint64_t *ssd_v );
    void (*ssim_4x4x2_core)( const pixel *pix1, int stride1,
                             const pixel *pix2, int stride2, int sums[2][4] );
    float (*ssim_end4)( int sum0[5][4], int sum1[5][4], int width );

    /* multiple parallel calls to cmp. */
    x264_pixel_cmp_x3_t sad_x3[7];
    x264_pixel_cmp_x4_t sad_x4[7];
    x264_pixel_cmp_x3_t satd_x3[7];
    x264_pixel_cmp_x4_t satd_x4[7];

    /* abs-diff-sum for successive elimination.
     * may round width up to a multiple of 16. */
    int (*ads[7])( int enc_dc[4], uint16_t *sums, int delta,
                   uint16_t *cost_mvx, int16_t *mvs, int width, int thresh );

    /* calculate satd or sad of V, H, and DC modes. */
    void (*intra_mbcmp_x3_16x16)( pixel *fenc, pixel *fdec, int res[3] );
    void (*intra_satd_x3_16x16) ( pixel *fenc, pixel *fdec, int res[3] );
    void (*intra_sad_x3_16x16)  ( pixel *fenc, pixel *fdec, int res[3] );
    void (*intra_mbcmp_x3_4x4)  ( pixel *fenc, pixel *fdec, int res[3] );
    void (*intra_satd_x3_4x4)   ( pixel *fenc, pixel *fdec, int res[3] );
    void (*intra_sad_x3_4x4)    ( pixel *fenc, pixel *fdec, int res[3] );
    void (*intra_mbcmp_x3_chroma)( pixel *fenc, pixel *fdec, int res[3] );
    void (*intra_satd_x3_chroma) ( pixel *fenc, pixel *fdec, int res[3] );
    void (*intra_sad_x3_chroma)  ( pixel *fenc, pixel *fdec, int res[3] );
    void (*intra_mbcmp_x3_8x16c) ( pixel *fenc, pixel *fdec, int res[3] );
    void (*intra_satd_x3_8x16c)  ( pixel *fenc, pixel *fdec, int res[3] );
    void (*intra_sad_x3_8x16c)   ( pixel *fenc, pixel *fdec, int res[3] );
    void (*intra_mbcmp_x3_8x8c)  ( pixel *fenc, pixel *fdec, int res[3] );
    void (*intra_satd_x3_8x8c)   ( pixel *fenc, pixel *fdec, int res[3] );
    void (*intra_sad_x3_8x8c)    ( pixel *fenc, pixel *fdec, int res[3] );
    void (*intra_mbcmp_x3_8x8)  ( pixel *fenc, pixel edge[36], int res[3] );
    void (*intra_sa8d_x3_8x8)   ( pixel *fenc, pixel edge[36], int res[3] );
    void (*intra_sad_x3_8x8)    ( pixel *fenc, pixel edge[36], int res[3] );
    /* find minimum satd or sad of all modes, and set fdec.
     * may be NULL, in which case just use pred+satd instead. */
    int (*intra_mbcmp_x9_4x4)( pixel *fenc, pixel *fdec, uint16_t *bitcosts );
    int (*intra_satd_x9_4x4) ( pixel *fenc, pixel *fdec, uint16_t *bitcosts );
    int (*intra_sad_x9_4x4)  ( pixel *fenc, pixel *fdec, uint16_t *bitcosts );
    int (*intra_mbcmp_x9_8x8)( pixel *fenc, pixel *fdec, pixel edge[36], uint16_t *bitcosts, uint16_t *satds );
    int (*intra_sa8d_x9_8x8) ( pixel *fenc, pixel *fdec, pixel edge[36], uint16_t *bitcosts, uint16_t *satds );
    int (*intra_sad_x9_8x8)  ( pixel *fenc, pixel *fdec, pixel edge[36], uint16_t *bitcosts, uint16_t *satds );
} x264_pixel_function_t;

void x264_pixel_init( int cpu, x264_pixel_function_t *pixf );
void x264_pixel_ssd_nv12( x264_pixel_function_t *pf, pixel *pix1, int i_pix1, pixel *pix2, int i_pix2, int i_width, int i_height, uint64_t *ssd_u, uint64_t *ssd_v );
uint64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, pixel *pix1, int i_pix1, pixel *pix2, int i_pix2, int i_width, int i_height );
float x264_pixel_ssim_wxh( x264_pixel_function_t *pf, pixel *pix1, int i_pix1, pixel *pix2, int i_pix2, int i_width, int i_height, void *buf, int *cnt );
int x264_field_vsad( x264_t *h, int mb_x, int mb_y );

#endif
x264-snapshot-20120103-2245-stable/common/pixel.c
/*****************************************************************************
 * pixel.c: pixel metrics
 *****************************************************************************
 * Copyright (C) 2003-2011 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *          Laurent Aimar <fenrir@via.ecp.fr>
 *          Jason Garrett-Glaser <darkshikari@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "common.h"

#if HAVE_MMX
#   include "x86/pixel.h"
#   include "x86/predict.h"
#endif
#if ARCH_PPC
#   include "ppc/pixel.h"
#endif
#if ARCH_ARM
#   include "arm/pixel.h"
#endif
#if ARCH_UltraSPARC
#   include "sparc/pixel.h"
#endif


/****************************************************************************
 * pixel_sad_WxH
 ****************************************************************************/
#define PIXEL_SAD_C( name, lx, ly ) \
static int name( pixel *pix1, int i_stride_pix1,  \
                 pixel *pix2, int i_stride_pix2 ) \
{                                                   \
    int i_sum = 0;                                  \
    for( int y = 0; y < ly; y++ )                   \
    {                                               \
        for( int x = 0; x < lx; x++ )               \
        {                                           \
            i_sum += abs( pix1[x] - pix2[x] );      \
        }                                           \
        pix1 += i_stride_pix1;                      \
        pix2 += i_stride_pix2;                      \
    }                                               \
    return i_sum;                                   \
}


PIXEL_SAD_C( x264_pixel_sad_16x16, 16, 16 )
PIXEL_SAD_C( x264_pixel_sad_16x8,  16,  8 )
PIXEL_SAD_C( x264_pixel_sad_8x16,   8, 16 )
PIXEL_SAD_C( x264_pixel_sad_8x8,    8,  8 )
PIXEL_SAD_C( x264_pixel_sad_8x4,    8,  4 )
PIXEL_SAD_C( x264_pixel_sad_4x16,   4, 16 )
PIXEL_SAD_C( x264_pixel_sad_4x8,    4,  8 )
PIXEL_SAD_C( x264_pixel_sad_4x4,    4,  4 )

/****************************************************************************
 * pixel_ssd_WxH
 ****************************************************************************/
#define PIXEL_SSD_C( name, lx, ly ) \
static int name( pixel *pix1, int i_stride_pix1,  \
                 pixel *pix2, int i_stride_pix2 ) \
{                                                   \
    int i_sum = 0;                                  \
    for( int y = 0; y < ly; y++ )                   \
    {                                               \
        for( int x = 0; x < lx; x++ )               \
        {                                           \
            int d = pix1[x] - pix2[x];              \
            i_sum += d*d;                           \
        }                                           \
        pix1 += i_stride_pix1;                      \
        pix2 += i_stride_pix2;                      \
    }                                               \
    return i_sum;                                   \
}

PIXEL_SSD_C( x264_pixel_ssd_16x16, 16, 16 )
PIXEL_SSD_C( x264_pixel_ssd_16x8,  16,  8 )
PIXEL_SSD_C( x264_pixel_ssd_8x16,   8, 16 )
PIXEL_SSD_C( x264_pixel_ssd_8x8,    8,  8 )
PIXEL_SSD_C( x264_pixel_ssd_8x4,    8,  4 )
PIXEL_SSD_C( x264_pixel_ssd_4x16,   4, 16 )
PIXEL_SSD_C( x264_pixel_ssd_4x8,    4,  8 )
PIXEL_SSD_C( x264_pixel_ssd_4x4,    4,  4 )

uint64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, pixel *pix1, int i_pix1, pixel *pix2, int i_pix2, int i_width, int i_height )
{
    uint64_t i_ssd = 0;
    int y;
    int align = !(((intptr_t)pix1 | (intptr_t)pix2 | i_pix1 | i_pix2) & 15);

#define SSD(size) i_ssd += pf->ssd[size]( pix1 + y*i_pix1 + x, i_pix1, \
                                          pix2 + y*i_pix2 + x, i_pix2 );
    for( y = 0; y < i_height-15; y += 16 )
    {
        int x = 0;
        if( align )
            for( ; x < i_width-15; x += 16 )
                SSD(PIXEL_16x16);
        for( ; x < i_width-7; x += 8 )
            SSD(PIXEL_8x16);
    }
    if( y < i_height-7 )
        for( int x = 0; x < i_width-7; x += 8 )
            SSD(PIXEL_8x8);
#undef SSD

#define SSD1 { int d = pix1[y*i_pix1+x] - pix2[y*i_pix2+x]; i_ssd += d*d; }
    if( i_width & 7 )
    {
        for( y = 0; y < (i_height & ~7); y++ )
            for( int x = i_width & ~7; x < i_width; x++ )
                SSD1;
    }
    if( i_height & 7 )
    {
        for( y = i_height & ~7; y < i_height; y++ )
            for( int x = 0; x < i_width; x++ )
                SSD1;
    }
#undef SSD1

    return i_ssd;
}

static void pixel_ssd_nv12_core( pixel *pixuv1, int stride1, pixel *pixuv2, int stride2, int width, int height, uint64_t *ssd_u, uint64_t *ssd_v )
{
    *ssd_u = 0, *ssd_v = 0;
    for( int y = 0; y < height; y++, pixuv1+=stride1, pixuv2+=stride2 )
        for( int x = 0; x < width; x++ )
        {
            int du = pixuv1[2*x]   - pixuv2[2*x];
            int dv = pixuv1[2*x+1] - pixuv2[2*x+1];
            *ssd_u += du*du;
            *ssd_v += dv*dv;
        }
}

void x264_pixel_ssd_nv12( x264_pixel_function_t *pf, pixel *pix1, int i_pix1, pixel *pix2, int i_pix2, int i_width, int i_height, uint64_t *ssd_u, uint64_t *ssd_v )
{
    pf->ssd_nv12_core( pix1, i_pix1, pix2, i_pix2, i_width&~7, i_height, ssd_u, ssd_v );
    if( i_width&7 )
    {
        uint64_t tmp[2];
        pixel_ssd_nv12_core( pix1+(i_width&~7), i_pix1, pix2+(i_width&~7), i_pix2, i_width&7, i_height, &tmp[0], &tmp[1] );
        *ssd_u += tmp[0];
        *ssd_v += tmp[1];
    }
}

/****************************************************************************
 * pixel_var_wxh
 ****************************************************************************/
#define PIXEL_VAR_C( name, w, h ) \
static uint64_t name( pixel *pix, int i_stride ) \
{                                             \
    uint32_t sum = 0, sqr = 0;                \
    for( int y = 0; y < h; y++ )              \
    {                                         \
        for( int x = 0; x < w; x++ )          \
        {                                     \
            sum += pix[x];                    \
            sqr += pix[x] * pix[x];           \
        }                                     \
        pix += i_stride;                      \
    }                                         \
    return sum + ((uint64_t)sqr << 32);       \
}

PIXEL_VAR_C( x264_pixel_var_16x16, 16, 16 )
PIXEL_VAR_C( x264_pixel_var_8x16,   8, 16 )
PIXEL_VAR_C( x264_pixel_var_8x8,    8,  8 )
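/* Editorial usage sketch (caller-side, names illustrative): var[] packs two
 * 32-bit accumulators into one uint64_t, so a caller unpacks it as
 *     uint64_t v   = pixf->var[PIXEL_16x16]( pix, stride );
 *     uint32_t sum = (uint32_t)v;
 *     uint32_t sqr = v >> 32;
 * and can form N*variance as sqr - ((uint64_t)sum * sum >> 8) for the
 * 256-pixel 16x16 case (>>7 for 8x16, >>6 for 8x8). */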

/****************************************************************************
 * pixel_var2_wxh
 ****************************************************************************/
#define PIXEL_VAR2_C( name, w, h, shift ) \
static int name( pixel *pix1, int i_stride1, pixel *pix2, int i_stride2, int *ssd ) \
{ \
    uint32_t var = 0, sum = 0, sqr = 0; \
    for( int y = 0; y < h; y++ ) \
    { \
        for( int x = 0; x < w; x++ ) \
        { \
            int diff = pix1[x] - pix2[x]; \
            sum += diff; \
            sqr += diff * diff; \
        } \
        pix1 += i_stride1; \
        pix2 += i_stride2; \
    } \
    sum = abs(sum); \
    var = sqr - ((uint64_t)sum * sum >> shift); \
    *ssd = sqr; \
    return var; \
}

PIXEL_VAR2_C( x264_pixel_var2_8x16, 8, 16, 7 )
PIXEL_VAR2_C( x264_pixel_var2_8x8,  8,  8, 6 )
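/* Editorial note: shift is log2 of the pixel count (6 for 8x8, 7 for 8x16),
 * so the return value is sqr - sum*sum/N -- N times the variance of the
 * difference signal -- while *ssd receives the plain sum of squared
 * differences. */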

#if BIT_DEPTH > 8
    typedef uint32_t sum_t;
    typedef uint64_t sum2_t;
#else
    typedef uint16_t sum_t;
    typedef uint32_t sum2_t;
#endif
#define BITS_PER_SUM (8 * sizeof(sum_t))

#define HADAMARD4(d0, d1, d2, d3, s0, s1, s2, s3) {\
    sum2_t t0 = s0 + s1;\
    sum2_t t1 = s0 - s1;\
    sum2_t t2 = s2 + s3;\
    sum2_t t3 = s2 - s3;\
    d0 = t0 + t2;\
    d2 = t0 - t2;\
    d1 = t1 + t3;\
    d3 = t1 - t3;\
}
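/* Editorial note: HADAMARD4 is one unnormalized 4-point Hadamard butterfly;
 * applying it once across rows and once across columns yields the 2-D 4x4
 * Hadamard transform that the SATD functions below sum over. */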

// in: a pseudo-simd number of the form x+(y<<16)
// return: abs(x)+(abs(y)<<16)
static ALWAYS_INLINE sum2_t abs2( sum2_t a )
{
    sum2_t s = ((a>>(BITS_PER_SUM-1))&(((sum2_t)1<<BITS_PER_SUM)+1))*((sum_t)-1);
    return (a+s)^s;
}
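/* Editorial worked example (8-bit build, BITS_PER_SUM == 16): for
 * a = x + (y<<16) with x = -3, y = 5 (so a = 0x0004FFFD):
 *     (a>>15) & 0x10001   -> 0x00001: only the low lane is negative
 *     s = 0x00001 * 0xFFFF = 0x0000FFFF: an all-ones mask in that lane
 *     (a+s)^s             -> 0x00050003, i.e. abs(-3) + (abs(5)<<16).
 * This is a two's-complement conditional negate per lane, valid as long
 * as neither lane's intermediate value overflows its 16 bits. */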

/****************************************************************************
 * pixel_satd_WxH: sum of 4x4 Hadamard transformed differences
 ****************************************************************************/

static NOINLINE int x264_pixel_satd_4x4( pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 )
{
    sum2_t tmp[4][2];
    sum2_t a0, a1, a2, a3, b0, b1;
    sum2_t sum = 0;
    for( int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2 )
    {
        a0 = pix1[0] - pix2[0];
        a1 = pix1[1] - pix2[1];
        b0 = (a0+a1) + ((a0-a1)<<BITS_PER_SUM);
        a2 = pix1[2] - pix2[2];
        a3 = pix1[3] - pix2[3];
        b1 = (a2+a3) + ((a2-a3)<<BITS_PER_SUM);
        tmp[i][0] = b0 + b1;
        tmp[i][1] = b0 - b1;
    }
    for( int i = 0; i < 2; i++ )
    {
        HADAMARD4( a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i] );
        a0 = abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
        sum += ((sum_t)a0) + (a0>>BITS_PER_SUM);
    }
    return sum >> 1;
}
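/* Editorial note: each b0/b1 packs two horizontal butterfly outputs into the
 * two halves of one sum2_t, so the loops above run the 2-D transform on two
 * pseudo-SIMD lanes at once; the final >>1 is x264's SATD normalization
 * (sum of absolute transform coefficients, halved). */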

static NOINLINE int x264_pixel_satd_8x4( pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 )
{
    sum2_t tmp[4][4];
    sum2_t a0, a1, a2, a3;
    sum2_t sum = 0;
    for( int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2 )
    {
        a0 = (pix1[0] - pix2[0]) + ((sum2_t)(pix1[4] - pix2[4]) << BITS_PER_SUM);
        a1 = (pix1[1] - pix2[1]) + ((sum2_t)(pix1[5] - pix2[5]) << BITS_PER_SUM);
        a2 = (pix1[2] - pix2[2]) + ((sum2_t)(pix1[6] - pix2[6]) << BITS_PER_SUM);
        a3 = (pix1[3] - pix2[3]) + ((sum2_t)(pix1[7] - pix2[7]) << BITS_PER_SUM);
        HADAMARD4( tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0,a1,a2,a3 );
    }
    for( int i = 0; i < 4; i++ )
    {
        HADAMARD4( a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i] );
        sum += abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
    }
    return (((sum_t)sum) + (sum>>BITS_PER_SUM)) >> 1;
}

#define PIXEL_SATD_C( w, h, sub )\
static int x264_pixel_satd_##w##x##h( pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 )\
{\
    int sum = sub( pix1, i_pix1, pix2, i_pix2 )\
            + sub( pix1+4*i_pix1, i_pix1, pix2+4*i_pix2, i_pix2 );\
    if( w==16 )\
        sum+= sub( pix1+8, i_pix1, pix2+8, i_pix2 )\
            + sub( pix1+8+4*i_pix1, i_pix1, pix2+8+4*i_pix2, i_pix2 );\
    if( h==16 )\
        sum+= sub( pix1+8*i_pix1, i_pix1, pix2+8*i_pix2, i_pix2 )\
            + sub( pix1+12*i_pix1, i_pix1, pix2+12*i_pix2, i_pix2 );\
    if( w==16 && h==16 )\
        sum+= sub( pix1+8+8*i_pix1, i_pix1, pix2+8+8*i_pix2, i_pix2 )\
            + sub( pix1+8+12*i_pix1, i_pix1, pix2+8+12*i_pix2, i_pix2 );\
    return sum;\
}
PIXEL_SATD_C( 16, 16, x264_pixel_satd_8x4 )
PIXEL_SATD_C( 16, 8,  x264_pixel_satd_8x4 )
PIXEL_SATD_C( 8,  16, x264_pixel_satd_8x4 )
PIXEL_SATD_C( 8,  8,  x264_pixel_satd_8x4 )
PIXEL_SATD_C( 4,  16, x264_pixel_satd_4x4 )
PIXEL_SATD_C( 4,  8,  x264_pixel_satd_4x4 )

static NOINLINE int sa8d_8x8( pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 )
{
    sum2_t tmp[8][4];
    sum2_t a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3;
    sum2_t sum = 0;
    for( int i = 0; i < 8; i++, pix1 += i_pix1, pix2 += i_pix2 )
    {
        a0 = pix1[0] - pix2[0];
        a1 = pix1[1] - pix2[1];
        b0 = (a0+a1) + ((a0-a1)<<BITS_PER_SUM);
        a2 = pix1[2] - pix2[2];
        a3 = pix1[3] - pix2[3];
        b1 = (a2+a3) + ((a2-a3)<<BITS_PER_SUM);
        a4 = pix1[4] - pix2[4];
        a5 = pix1[5] - pix2[5];
        b2 = (a4+a5) + ((a4-a5)<<BITS_PER_SUM);
        a6 = pix1[6] - pix2[6];
        a7 = pix1[7] - pix2[7];
        b3 = (a6+a7) + ((a6-a7)<<BITS_PER_SUM);
        HADAMARD4( tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], b0,b1,b2,b3 );
    }
    for( int i = 0; i < 4; i++ )
    {
        HADAMARD4( a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i] );
        HADAMARD4( a4, a5, a6, a7, tmp[4][i], tmp[5][i], tmp[6][i], tmp[7][i] );
        b0  = abs2(a0+a4) + abs2(a0-a4);
        b0 += abs2(a1+a5) + abs2(a1-a5);
        b0 += abs2(a2+a6) + abs2(a2-a6);
        b0 += abs2(a3+a7) + abs2(a3-a7);
        sum += (sum_t)b0 + (b0>>BITS_PER_SUM);
    }
    return sum;
}

static int x264_pixel_sa8d_8x8( pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 )
{
    int sum = sa8d_8x8( pix1, i_pix1, pix2, i_pix2 );
    return (sum+2)>>2;
}

static int x264_pixel_sa8d_16x16( pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 )
{
    int sum = sa8d_8x8( pix1, i_pix1, pix2, i_pix2 )
            + sa8d_8x8( pix1+8, i_pix1, pix2+8, i_pix2 )
            + sa8d_8x8( pix1+8*i_pix1, i_pix1, pix2+8*i_pix2, i_pix2 )
            + sa8d_8x8( pix1+8+8*i_pix1, i_pix1, pix2+8+8*i_pix2, i_pix2 );
    return (sum+2)>>2;
}


static NOINLINE uint64_t pixel_hadamard_ac( pixel *pix, int stride )
{
    sum2_t tmp[32];
    sum2_t a0, a1, a2, a3, dc;
    sum2_t sum4 = 0, sum8 = 0;
    for( int i = 0; i < 8; i++, pix+=stride )
    {
        sum2_t *t = tmp + (i&3) + (i&4)*4;
        a0 = (pix[0]+pix[1]) + ((sum2_t)(pix[0]-pix[1])<<BITS_PER_SUM);
        a1 = (pix[2]+pix[3]) + ((sum2_t)(pix[2]-pix[3])<<BITS_PER_SUM);
        t[0] = a0 + a1;
        t[4] = a0 - a1;
        a2 = (pix[4]+pix[5]) + ((sum2_t)(pix[4]-pix[5])<<BITS_PER_SUM);
        a3 = (pix[6]+pix[7]) + ((sum2_t)(pix[6]-pix[7])<<BITS_PER_SUM);
        t[8] = a2 + a3;
        t[12] = a2 - a3;
    }
    for( int i = 0; i < 8; i++ )
    {
        HADAMARD4( a0, a1, a2, a3, tmp[i*4+0], tmp[i*4+1], tmp[i*4+2], tmp[i*4+3] );
        tmp[i*4+0] = a0;
        tmp[i*4+1] = a1;
        tmp[i*4+2] = a2;
        tmp[i*4+3] = a3;
        sum4 += abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
    }
    for( int i = 0; i < 8; i++ )
    {
        HADAMARD4( a0,a1,a2,a3, tmp[i], tmp[8+i], tmp[16+i], tmp[24+i] );
        sum8 += abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
    }
    dc = (sum_t)(tmp[0] + tmp[8] + tmp[16] + tmp[24]);
    sum4 = (sum_t)sum4 + (sum4>>BITS_PER_SUM) - dc;
    sum8 = (sum_t)sum8 + (sum8>>BITS_PER_SUM) - dc;
    return ((uint64_t)sum8<<32) + sum4;
}

#define HADAMARD_AC(w,h) \
static uint64_t x264_pixel_hadamard_ac_##w##x##h( pixel *pix, int stride )\
{\
    uint64_t sum = pixel_hadamard_ac( pix, stride );\
    if( w==16 )\
        sum += pixel_hadamard_ac( pix+8, stride );\
    if( h==16 )\
        sum += pixel_hadamard_ac( pix+8*stride, stride );\
    if( w==16 && h==16 )\
        sum += pixel_hadamard_ac( pix+8*stride+8, stride );\
    return ((sum>>34)<<32) + ((uint32_t)sum>>1);\
}
HADAMARD_AC( 16, 16 )
HADAMARD_AC( 16, 8 )
HADAMARD_AC( 8, 16 )
HADAMARD_AC( 8, 8 )
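/* Editorial note: pixel_hadamard_ac() returns (sum8<<32) + sum4 -- the 8x8
 * Hadamard AC energy in the high half, the 4x4 one in the low half. After
 * up to four 8x8 blocks are accumulated, ((sum>>34)<<32) + ((uint32_t)sum>>1)
 * re-packs the pair while applying the normalizations sum8>>2 and sum4>>1. */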


/****************************************************************************
 * pixel_sad_x4
 ****************************************************************************/
#define SAD_X( size ) \
static void x264_pixel_sad_x3_##size( pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2, int i_stride, int scores[3] )\
{\
    scores[0] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix0, i_stride );\
    scores[1] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix1, i_stride );\
    scores[2] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix2, i_stride );\
}\
static void x264_pixel_sad_x4_##size( pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2, pixel *pix3, int i_stride, int scores[4] )\
{\
    scores[0] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix0, i_stride );\
    scores[1] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix1, i_stride );\
    scores[2] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix2, i_stride );\
    scores[3] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix3, i_stride );\
}

SAD_X( 16x16 )
SAD_X( 16x8 )
SAD_X( 8x16 )
SAD_X( 8x8 )
SAD_X( 8x4 )
SAD_X( 4x8 )
SAD_X( 4x4 )

#if !HIGH_BIT_DEPTH
#if ARCH_UltraSPARC
SAD_X( 16x16_vis )
SAD_X( 16x8_vis )
SAD_X( 8x16_vis )
SAD_X( 8x8_vis )
#endif
#endif // !HIGH_BIT_DEPTH

/****************************************************************************
 * pixel_satd_x4
 * no faster than single satd, but needed for satd to be a drop-in replacement for sad
 ****************************************************************************/

#define SATD_X( size, cpu ) \
static void x264_pixel_satd_x3_##size##cpu( pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2, int i_stride, int scores[3] )\
{\
    scores[0] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix0, i_stride );\
    scores[1] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix1, i_stride );\
    scores[2] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix2, i_stride );\
}\
static void x264_pixel_satd_x4_##size##cpu( pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2, pixel *pix3, int i_stride, int scores[4] )\
{\
    scores[0] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix0, i_stride );\
    scores[1] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix1, i_stride );\
    scores[2] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix2, i_stride );\
    scores[3] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix3, i_stride );\
}
#define SATD_X_DECL6( cpu )\
SATD_X( 16x16, cpu )\
SATD_X( 16x8, cpu )\
SATD_X( 8x16, cpu )\
SATD_X( 8x8, cpu )\
SATD_X( 8x4, cpu )\
SATD_X( 4x8, cpu )
#define SATD_X_DECL7( cpu )\
SATD_X_DECL6( cpu )\
SATD_X( 4x4, cpu )

SATD_X_DECL7()
#if HAVE_MMX
SATD_X_DECL7( _mmx2 )
#if !HIGH_BIT_DEPTH
SATD_X_DECL6( _sse2 )
SATD_X_DECL7( _ssse3 )
SATD_X_DECL7( _sse4 )
SATD_X_DECL7( _avx )
SATD_X_DECL7( _xop )
#endif // !HIGH_BIT_DEPTH
#endif

#if !HIGH_BIT_DEPTH
#if HAVE_ARMV6
SATD_X_DECL7( _neon )
#endif
#endif // !HIGH_BIT_DEPTH

#define INTRA_MBCMP_8x8( mbcmp, cpu, cpu2 )\
void x264_intra_##mbcmp##_x3_8x8##cpu( pixel *fenc, pixel edge[36], int res[3] )\
{\
    ALIGNED_ARRAY_16( pixel, pix, [8*FDEC_STRIDE] );\
    x264_predict_8x8_v##cpu2( pix, edge );\
    res[0] = x264_pixel_##mbcmp##_8x8##cpu( pix, FDEC_STRIDE, fenc, FENC_STRIDE );\
    x264_predict_8x8_h##cpu2( pix, edge );\
    res[1] = x264_pixel_##mbcmp##_8x8##cpu( pix, FDEC_STRIDE, fenc, FENC_STRIDE );\
    x264_predict_8x8_dc##cpu2( pix, edge );\
    res[2] = x264_pixel_##mbcmp##_8x8##cpu( pix, FDEC_STRIDE, fenc, FENC_STRIDE );\
}

INTRA_MBCMP_8x8( sad,, _c )
INTRA_MBCMP_8x8(sa8d,, _c )
#if HIGH_BIT_DEPTH && HAVE_MMX
INTRA_MBCMP_8x8( sad, _mmx2,  _c )
INTRA_MBCMP_8x8( sad, _sse2,  _sse2 )
INTRA_MBCMP_8x8( sad, _ssse3, _sse2 )
INTRA_MBCMP_8x8(sa8d, _sse2,  _sse2 )
#endif

#define INTRA_MBCMP( mbcmp, size, pred1, pred2, pred3, chroma, cpu, cpu2 )\
void x264_intra_##mbcmp##_x3_##size##chroma##cpu( pixel *fenc, pixel *fdec, int res[3] )\
{\
    x264_predict_##size##chroma##_##pred1##cpu2( fdec );\
    res[0] = x264_pixel_##mbcmp##_##size##cpu( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );\
    x264_predict_##size##chroma##_##pred2##cpu2( fdec );\
    res[1] = x264_pixel_##mbcmp##_##size##cpu( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );\
    x264_predict_##size##chroma##_##pred3##cpu2( fdec );\
    res[2] = x264_pixel_##mbcmp##_##size##cpu( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );\
}

INTRA_MBCMP( sad,  4x4,   v, h, dc,  ,, _c )
INTRA_MBCMP(satd,  4x4,   v, h, dc,  ,, _c )
INTRA_MBCMP( sad,  8x8,  dc, h,  v, c,, _c )
INTRA_MBCMP(satd,  8x8,  dc, h,  v, c,, _c )
INTRA_MBCMP( sad,  8x16, dc, h,  v, c,, _c )
INTRA_MBCMP(satd,  8x16, dc, h,  v, c,, _c )
INTRA_MBCMP( sad, 16x16,  v, h, dc,  ,, _c )
INTRA_MBCMP(satd, 16x16,  v, h, dc,  ,, _c )

#if HAVE_MMX
#if HIGH_BIT_DEPTH
INTRA_MBCMP( sad,  4x4,   v, h, dc,  , _mmx2, _c )
INTRA_MBCMP(satd,  4x4,   v, h, dc,  , _mmx2, _c )
INTRA_MBCMP( sad,  8x8,  dc, h,  v, c, _mmx2, _c )
INTRA_MBCMP(satd,  8x8,  dc, h,  v, c, _mmx2, _c )
INTRA_MBCMP( sad, 16x16,  v, h, dc,  , _mmx2, _mmx2 )
INTRA_MBCMP(satd, 16x16,  v, h, dc,  , _mmx2, _mmx2 )
INTRA_MBCMP( sad,  8x8,  dc, h,  v, c, _sse2, _sse2 )
INTRA_MBCMP( sad, 16x16,  v, h, dc,  , _sse2, _sse2 )
INTRA_MBCMP( sad,  4x4,   v, h, dc,  , _ssse3, _c )
INTRA_MBCMP( sad,  8x8,  dc, h,  v, c, _ssse3, _sse2 )
INTRA_MBCMP( sad, 16x16,  v, h, dc,  , _ssse3, _sse2 )
#else
#define x264_predict_8x16c_v_mmx2 x264_predict_8x16c_v_mmx
INTRA_MBCMP( sad,  8x16, dc, h,  v, c, _mmx2, _mmx2 )
INTRA_MBCMP(satd,  8x16, dc, h,  v, c, _mmx2, _mmx2 )
INTRA_MBCMP( sad,  8x16, dc, h,  v, c, _sse2, _mmx2 )
INTRA_MBCMP(satd,  8x16, dc, h,  v, c, _sse2, _mmx2 )
INTRA_MBCMP(satd,  8x16, dc, h,  v, c, _ssse3, _mmx2 )
INTRA_MBCMP(satd,  8x16, dc, h,  v, c, _sse4, _mmx2 )
INTRA_MBCMP(satd,  8x16, dc, h,  v, c, _avx, _mmx2 )
INTRA_MBCMP(satd,  8x16, dc, h,  v, c, _xop, _mmx2 )
#endif
#endif

// No C implementation of intra_satd_x9. See checkasm for its behavior,
// or see x264_mb_analyse_intra for the entirely different algorithm we
// use when lacking an asm implementation of it.



/****************************************************************************
 * structural similarity metric
 ****************************************************************************/
static void ssim_4x4x2_core( const pixel *pix1, int stride1,
                             const pixel *pix2, int stride2,
                             int sums[2][4])
{
    for( int z = 0; z < 2; z++ )
    {
        uint32_t s1 = 0, s2 = 0, ss = 0, s12 = 0;
        for( int y = 0; y < 4; y++ )
            for( int x = 0; x < 4; x++ )
            {
                int a = pix1[x+y*stride1];
                int b = pix2[x+y*stride2];
                s1  += a;
                s2  += b;
                ss  += a*a;
                ss  += b*b;
                s12 += a*b;
            }
        sums[z][0] = s1;
        sums[z][1] = s2;
        sums[z][2] = ss;
        sums[z][3] = s12;
        pix1 += 4;
        pix2 += 4;
    }
}

static float ssim_end1( int s1, int s2, int ss, int s12 )
{
/* Maximum value for 10-bit is: ss*64 = (2^10-1)^2*16*4*64 = 4286582784, which will overflow in some cases.
 * s1*s1, s2*s2, and s1*s2 also obtain this value for edge cases: ((2^10-1)*16*4)^2 = 4286582784.
 * Maximum value for 9-bit is: ss*64 = (2^9-1)^2*16*4*64 = 1069551616, which will not overflow. */
#if BIT_DEPTH > 9
#define type float
    static const float ssim_c1 = .01*.01*PIXEL_MAX*PIXEL_MAX*64;
    static const float ssim_c2 = .03*.03*PIXEL_MAX*PIXEL_MAX*64*63;
#else
#define type int
    static const int ssim_c1 = (int)(.01*.01*PIXEL_MAX*PIXEL_MAX*64 + .5);
    static const int ssim_c2 = (int)(.03*.03*PIXEL_MAX*PIXEL_MAX*64*63 + .5);
#endif
    type fs1 = s1;
    type fs2 = s2;
    type fss = ss;
    type fs12 = s12;
    type vars = fss*64 - fs1*fs1 - fs2*fs2;
    type covar = fs12*64 - fs1*fs2;
    return (float)(2*fs1*fs2 + ssim_c1) * (float)(2*covar + ssim_c2)
         / ((float)(fs1*fs1 + fs2*fs2 + ssim_c1) * (float)(vars + ssim_c2));
#undef type
}
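/* Editorial note: structurally this is the usual SSIM kernel,
 *     (2*mu1*mu2 + C1)*(2*cov + C2) / ((mu1^2 + mu2^2 + C1)*(var1 + var2 + C2)),
 * evaluated directly on raw sums over 64 pixels (four overlapping 4x4
 * blocks) rather than on means, with ssim_c1/ssim_c2 pre-scaled to match. */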

static float ssim_end4( int sum0[5][4], int sum1[5][4], int width )
{
    float ssim = 0.0;
    for( int i = 0; i < width; i++ )
        ssim += ssim_end1( sum0[i][0] + sum0[i+1][0] + sum1[i][0] + sum1[i+1][0],
                           sum0[i][1] + sum0[i+1][1] + sum1[i][1] + sum1[i+1][1],
                           sum0[i][2] + sum0[i+1][2] + sum1[i][2] + sum1[i+1][2],
                           sum0[i][3] + sum0[i+1][3] + sum1[i][3] + sum1[i+1][3] );
    return ssim;
}

float x264_pixel_ssim_wxh( x264_pixel_function_t *pf,
                           pixel *pix1, int stride1,
                           pixel *pix2, int stride2,
                           int width, int height, void *buf, int *cnt )
{
    int z = 0;
    float ssim = 0.0;
    int (*sum0)[4] = buf;
    int (*sum1)[4] = sum0 + (width >> 2) + 3;
    width >>= 2;
    height >>= 2;
    for( int y = 1; y < height; y++ )
    {
        for( ; z <= y; z++ )
        {
            XCHG( void*, sum0, sum1 );
            for( int x = 0; x < width; x+=2 )
                pf->ssim_4x4x2_core( &pix1[4*(x+z*stride1)], stride1, &pix2[4*(x+z*stride2)], stride2, &sum0[x] );
        }
        for( int x = 0; x < width-1; x += 4 )
            ssim += pf->ssim_end4( sum0+x, sum1+x, X264_MIN(4,width-x-1) );
    }
    *cnt = (height-1) * (width-1);
    return ssim;
}
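/* Editorial note: sum0/sum1 are two sliding rows of 4x4 sums, so each
 * ssim_end4 call scores overlapping 8x8 windows built from adjacent rows;
 * *cnt returns the number of windows evaluated, presumably so the caller
 * can average the accumulated score as ssim/cnt. */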

static int pixel_vsad( pixel *src, int stride, int height )
{
    int score = 0;
    for( int i = 1; i < height; i++, src += stride )
        for( int j = 0; j < 16; j++ )
            score += abs(src[j] - src[j+stride]);
    return score;
}

int x264_field_vsad( x264_t *h, int mb_x, int mb_y )
{
    int score_field, score_frame;
    int stride = h->fenc->i_stride[0];
    int mb_stride = h->mb.i_mb_stride;
    pixel *fenc = h->fenc->plane[0] + 16 * (mb_x + mb_y * stride);
    int mb_xy = mb_x + mb_y*mb_stride;

    /* We don't want to analyze pixels outside the frame, as it gives inaccurate results. */
    int mbpair_height = X264_MIN( h->param.i_height - mb_y * 16, 32 );
    score_frame  = h->pixf.vsad( fenc,          stride, mbpair_height );
    score_field  = h->pixf.vsad( fenc,        stride*2, mbpair_height >> 1 );
    score_field += h->pixf.vsad( fenc+stride, stride*2, mbpair_height >> 1 );

    if( mb_x > 0 )
        score_field += 512 - h->mb.field[mb_xy        -1]*1024;
    if( mb_y > 0 )
        score_field += 512 - h->mb.field[mb_xy-mb_stride]*1024;

    return (score_field < score_frame);
}
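/* Editorial note: the function returns 1 when field coding looks cheaper for
 * this macroblock pair. The 512 - field*1024 terms bias the field score by
 * -512 when an already-decided neighbour chose field and +512 when it chose
 * frame, nudging the decision toward spatial consistency. */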

/****************************************************************************
 * successive elimination
 ****************************************************************************/
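/* Editorial note: enc_dc[] holds DC (sum) values of the source block's
 * sub-blocks and sums[] the corresponding running sums over the search area,
 * so each abs(enc_dc - sums) term is a cheap lower bound on that sub-block's
 * SAD. Candidates whose bound plus MV cost already reaches thresh are pruned
 * without computing a full SAD; surviving x offsets are appended to mvs[]. */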
static int x264_pixel_ads4( int enc_dc[4], uint16_t *sums, int delta,
                            uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
{
    int nmv = 0;
    for( int i = 0; i < width; i++, sums++ )
    {
        int ads = abs( enc_dc[0] - sums[0] )
                + abs( enc_dc[1] - sums[8] )
                + abs( enc_dc[2] - sums[delta] )
                + abs( enc_dc[3] - sums[delta+8] )
                + cost_mvx[i];
        if( ads < thresh )
            mvs[nmv++] = i;
    }
    return nmv;
}

static int x264_pixel_ads2( int enc_dc[2], uint16_t *sums, int delta,
                            uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
{
    int nmv = 0;
    for( int i = 0; i < width; i++, sums++ )
    {
        int ads = abs( enc_dc[0] - sums[0] )
                + abs( enc_dc[1] - sums[delta] )
                + cost_mvx[i];
        if( ads < thresh )
            mvs[nmv++] = i;
    }
    return nmv;
}

static int x264_pixel_ads1( int enc_dc[1], uint16_t *sums, int delta,
                            uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
{
    int nmv = 0;
    for( int i = 0; i<width; i++, sums++ )
    {
        int ads = abs( enc_dc[0] - sums[0] )
                + cost_mvx[i];
        if( ads < thresh )
            mvs[nmv++] = i;
    }
    return nmv;
}


/****************************************************************************
 * x264_pixel_init:
 ****************************************************************************/
void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
{
    memset( pixf, 0, sizeof(*pixf) );

#define INIT2_NAME( name1, name2, cpu ) \
    pixf->name1[PIXEL_16x16] = x264_pixel_##name2##_16x16##cpu;\
    pixf->name1[PIXEL_16x8]  = x264_pixel_##name2##_16x8##cpu;
#define INIT4_NAME( name1, name2, cpu ) \
    INIT2_NAME( name1, name2, cpu ) \
    pixf->name1[PIXEL_8x16]  = x264_pixel_##name2##_8x16##cpu;\
    pixf->name1[PIXEL_8x8]   = x264_pixel_##name2##_8x8##cpu;
#define INIT5_NAME( name1, name2, cpu ) \
    INIT4_NAME( name1, name2, cpu ) \
    pixf->name1[PIXEL_8x4]   = x264_pixel_##name2##_8x4##cpu;
#define INIT6_NAME( name1, name2, cpu ) \
    INIT5_NAME( name1, name2, cpu ) \
    pixf->name1[PIXEL_4x8]   = x264_pixel_##name2##_4x8##cpu;
#define INIT7_NAME( name1, name2, cpu ) \
    INIT6_NAME( name1, name2, cpu ) \
    pixf->name1[PIXEL_4x4]   = x264_pixel_##name2##_4x4##cpu;
#define INIT8_NAME( name1, name2, cpu ) \
    INIT7_NAME( name1, name2, cpu ) \
    pixf->name1[PIXEL_4x16]  = x264_pixel_##name2##_4x16##cpu;
#define INIT2( name, cpu ) INIT2_NAME( name, name, cpu )
#define INIT4( name, cpu ) INIT4_NAME( name, name, cpu )
#define INIT5( name, cpu ) INIT5_NAME( name, name, cpu )
#define INIT6( name, cpu ) INIT6_NAME( name, name, cpu )
#define INIT7( name, cpu ) INIT7_NAME( name, name, cpu )
#define INIT8( name, cpu ) INIT8_NAME( name, name, cpu )

#define INIT_ADS( cpu ) \
    pixf->ads[PIXEL_16x16] = x264_pixel_ads4##cpu;\
    pixf->ads[PIXEL_16x8] = x264_pixel_ads2##cpu;\
    pixf->ads[PIXEL_8x8] = x264_pixel_ads1##cpu;

    INIT8( sad, );
    INIT8_NAME( sad_aligned, sad, );
    INIT7( sad_x3, );
    INIT7( sad_x4, );
    INIT8( ssd, );
    INIT8( satd, );
    INIT7( satd_x3, );
    INIT7( satd_x4, );
    INIT4( hadamard_ac, );
    INIT_ADS( );

    pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16;
    pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8;
    pixf->var[PIXEL_16x16] = x264_pixel_var_16x16;
    pixf->var[PIXEL_8x16]  = x264_pixel_var_8x16;
    pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8;
    pixf->var2[PIXEL_8x16]  = x264_pixel_var2_8x16;
    pixf->var2[PIXEL_8x8]   = x264_pixel_var2_8x8;

    pixf->ssd_nv12_core = pixel_ssd_nv12_core;
    pixf->ssim_4x4x2_core = ssim_4x4x2_core;
    pixf->ssim_end4 = ssim_end4;
    pixf->vsad = pixel_vsad;

    pixf->intra_sad_x3_4x4    = x264_intra_sad_x3_4x4;
    pixf->intra_satd_x3_4x4   = x264_intra_satd_x3_4x4;
    pixf->intra_sad_x3_8x8    = x264_intra_sad_x3_8x8;
    pixf->intra_sa8d_x3_8x8   = x264_intra_sa8d_x3_8x8;
    pixf->intra_sad_x3_8x8c   = x264_intra_sad_x3_8x8c;
    pixf->intra_satd_x3_8x8c  = x264_intra_satd_x3_8x8c;
    pixf->intra_sad_x3_8x16c  = x264_intra_sad_x3_8x16c;
    pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c;
    pixf->intra_sad_x3_16x16  = x264_intra_sad_x3_16x16;
    pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16;

#if HIGH_BIT_DEPTH
#if HAVE_MMX
    if( cpu&X264_CPU_MMX2 )
    {
        INIT7( sad, _mmx2 );
        INIT7( sad_x3, _mmx2 );
        INIT7( sad_x4, _mmx2 );
        INIT8( satd, _mmx2 );
        INIT7( satd_x3, _mmx2 );
        INIT7( satd_x4, _mmx2 );
        INIT4( hadamard_ac, _mmx2 );
        INIT8( ssd, _mmx2 );
        INIT_ADS( _mmx2 );

        pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_mmx2;
        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmx2;
        pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8_mmx2;
#if ARCH_X86
        pixf->var2[PIXEL_8x8]  = x264_pixel_var2_8x8_mmx2;
        pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_mmx2;
#endif

        pixf->intra_sad_x3_4x4    = x264_intra_sad_x3_4x4_mmx2;
        pixf->intra_satd_x3_4x4   = x264_intra_satd_x3_4x4_mmx2;
        pixf->intra_sad_x3_8x8    = x264_intra_sad_x3_8x8_mmx2;
        pixf->intra_sad_x3_8x8c   = x264_intra_sad_x3_8x8c_mmx2;
        pixf->intra_satd_x3_8x8c  = x264_intra_satd_x3_8x8c_mmx2;
        pixf->intra_sad_x3_16x16  = x264_intra_sad_x3_16x16_mmx2;
        pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_mmx2;
    }
    if( cpu&X264_CPU_SSE2 )
    {
        INIT4_NAME( sad_aligned, sad, _sse2_aligned );
        INIT5( ssd, _sse2 );

        pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse2;
        pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8_sse2;
#if ARCH_X86_64
        pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2;
#endif
        pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_sse2;
        pixf->ssim_4x4x2_core  = x264_pixel_ssim_4x4x2_core_sse2;
        pixf->ssim_end4        = x264_pixel_ssim_end4_sse2;
        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_sse2;
        pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8_sse2;
        pixf->var2[PIXEL_8x8]  = x264_pixel_var2_8x8_sse2;
        pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_sse2;
    }
    if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_SLOW) )
    {
        INIT5( sad, _sse2 );
        INIT2( sad_x3, _sse2 );
        INIT2( sad_x4, _sse2 );
        INIT_ADS( _sse2 );

        if( !(cpu&X264_CPU_STACK_MOD4) )
        {
            INIT4( hadamard_ac, _sse2 );
        }

        pixf->intra_sad_x3_8x8    = x264_intra_sad_x3_8x8_sse2;
        pixf->intra_sad_x3_8x8c   = x264_intra_sad_x3_8x8c_sse2;
        pixf->intra_sad_x3_16x16  = x264_intra_sad_x3_16x16_sse2;
    }
    if( cpu&X264_CPU_SSE2_IS_FAST )
    {
        pixf->sad[PIXEL_8x16] = x264_pixel_sad_8x16_sse2;
        pixf->sad_x3[PIXEL_8x16] = x264_pixel_sad_x3_8x16_sse2;
        pixf->sad_x3[PIXEL_8x8]  = x264_pixel_sad_x3_8x8_sse2;
        pixf->sad_x3[PIXEL_8x4]  = x264_pixel_sad_x3_8x4_sse2;
        pixf->sad_x4[PIXEL_8x16] = x264_pixel_sad_x4_8x16_sse2;
        pixf->sad_x4[PIXEL_8x8]  = x264_pixel_sad_x4_8x8_sse2;
        pixf->sad_x4[PIXEL_8x4]  = x264_pixel_sad_x4_8x4_sse2;
    }
    if( cpu&X264_CPU_SSSE3 )
    {
        INIT4_NAME( sad_aligned, sad, _ssse3_aligned );
        INIT7( sad, _ssse3 );
        INIT7( sad_x3, _ssse3 );
        INIT7( sad_x4, _ssse3 );
        INIT_ADS( _ssse3 );

        if( !(cpu&X264_CPU_STACK_MOD4) )
        {
            INIT4( hadamard_ac, _ssse3 );
        }

        pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3;
        pixf->sa8d[PIXEL_8x8]  = x264_pixel_sa8d_8x8_ssse3;
        pixf->intra_sad_x3_4x4    = x264_intra_sad_x3_4x4_ssse3;
        pixf->intra_sad_x3_8x8    = x264_intra_sad_x3_8x8_ssse3;
        pixf->intra_sad_x3_8x8c   = x264_intra_sad_x3_8x8c_ssse3;
        pixf->intra_sad_x3_16x16  = x264_intra_sad_x3_16x16_ssse3;
    }
    if( cpu&X264_CPU_SSE4 )
    {
        if( !(cpu&X264_CPU_STACK_MOD4) )
        {
            INIT4( hadamard_ac, _sse4 );
        }
        pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_sse4;
        pixf->sa8d[PIXEL_8x8]  = x264_pixel_sa8d_8x8_sse4;
    }
    if( cpu&X264_CPU_AVX )
    {
        INIT_ADS( _avx );
        if( !(cpu&X264_CPU_STACK_MOD4) )
        {
            INIT4( hadamard_ac, _avx );
        }
        pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_avx;
        pixf->sa8d[PIXEL_8x8]  = x264_pixel_sa8d_8x8_avx;
        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx;
        pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8_avx;
        pixf->ssd_nv12_core    = x264_pixel_ssd_nv12_core_avx;
        pixf->ssim_4x4x2_core  = x264_pixel_ssim_4x4x2_core_avx;
        pixf->ssim_end4        = x264_pixel_ssim_end4_avx;
    }
#endif // HAVE_MMX
#else // !HIGH_BIT_DEPTH
#if HAVE_MMX
    if( cpu&X264_CPU_MMX )
    {
        INIT8( ssd, _mmx );
    }

    if( cpu&X264_CPU_MMX2 )
    {
        INIT8( sad, _mmx2 );
        INIT8_NAME( sad_aligned, sad, _mmx2 );
        INIT7( sad_x3, _mmx2 );
        INIT7( sad_x4, _mmx2 );
        INIT8( satd, _mmx2 );
        INIT7( satd_x3, _mmx2 );
        INIT7( satd_x4, _mmx2 );
        INIT4( hadamard_ac, _mmx2 );
        INIT_ADS( _mmx2 );
        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmx2;
        pixf->var[PIXEL_8x16]  = x264_pixel_var_8x16_mmx2;
        pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8_mmx2;
        pixf->ssd_nv12_core    = x264_pixel_ssd_nv12_core_mmx2;
#if ARCH_X86
        pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_mmx2;
        pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8_mmx2;
        pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_mmx2;
        pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_mmx2;
        pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_mmx2;
        pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_mmx2;
        pixf->vsad = x264_pixel_vsad_mmx2;

        if( cpu&X264_CPU_CACHELINE_32 )
        {
            INIT5( sad, _cache32_mmx2 );
            INIT4( sad_x3, _cache32_mmx2 );
            INIT4( sad_x4, _cache32_mmx2 );
        }
        else if( cpu&X264_CPU_CACHELINE_64 )
        {
            INIT5( sad, _cache64_mmx2 );
            INIT4( sad_x3, _cache64_mmx2 );
            INIT4( sad_x4, _cache64_mmx2 );
        }
#else
        if( cpu&X264_CPU_CACHELINE_64 )
        {
            pixf->sad[PIXEL_8x16] = x264_pixel_sad_8x16_cache64_mmx2;
            pixf->sad[PIXEL_8x8]  = x264_pixel_sad_8x8_cache64_mmx2;
            pixf->sad[PIXEL_8x4]  = x264_pixel_sad_8x4_cache64_mmx2;
            pixf->sad_x3[PIXEL_8x16] = x264_pixel_sad_x3_8x16_cache64_mmx2;
            pixf->sad_x3[PIXEL_8x8]  = x264_pixel_sad_x3_8x8_cache64_mmx2;
            pixf->sad_x4[PIXEL_8x16] = x264_pixel_sad_x4_8x16_cache64_mmx2;
            pixf->sad_x4[PIXEL_8x8]  = x264_pixel_sad_x4_8x8_cache64_mmx2;
        }
#endif
        pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_mmx2;
        pixf->intra_sad_x3_16x16  = x264_intra_sad_x3_16x16_mmx2;
        pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_mmx2;
        pixf->intra_sad_x3_8x16c  = x264_intra_sad_x3_8x16c_mmx2;
        pixf->intra_satd_x3_8x8c  = x264_intra_satd_x3_8x8c_mmx2;
        pixf->intra_sad_x3_8x8c   = x264_intra_sad_x3_8x8c_mmx2;
        pixf->intra_sad_x3_8x8    = x264_intra_sad_x3_8x8_mmx2;
        pixf->intra_satd_x3_4x4   = x264_intra_satd_x3_4x4_mmx2;
        pixf->intra_sad_x3_4x4    = x264_intra_sad_x3_4x4_mmx2;
    }

    if( cpu&X264_CPU_SSE2 )
    {
        INIT5( ssd, _sse2slow );
        INIT2_NAME( sad_aligned, sad, _sse2_aligned );
        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_sse2;
        pixf->ssd_nv12_core    = x264_pixel_ssd_nv12_core_sse2;
        pixf->ssim_4x4x2_core  = x264_pixel_ssim_4x4x2_core_sse2;
        pixf->ssim_end4        = x264_pixel_ssim_end4_sse2;
        pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse2;
        pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8_sse2;
#if ARCH_X86_64
        pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2;
#endif
        pixf->var2[PIXEL_8x8]   = x264_pixel_var2_8x8_sse2;
        pixf->var2[PIXEL_8x16]  = x264_pixel_var2_8x16_sse2;
        pixf->vsad = x264_pixel_vsad_sse2;
    }

    if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_SLOW) )
    {
        INIT2( sad, _sse2 );
        INIT2( sad_x3, _sse2 );
        INIT2( sad_x4, _sse2 );
        INIT6( satd, _sse2 );
        pixf->satd[PIXEL_4x16]   = x264_pixel_satd_4x16_sse2;
        INIT6( satd_x3, _sse2 );
        INIT6( satd_x4, _sse2 );
        if( !(cpu&X264_CPU_STACK_MOD4) )
        {
            INIT4( hadamard_ac, _sse2 );
        }
        INIT_ADS( _sse2 );
        pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_sse2;
        pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_sse2;
        pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_sse2;
        pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_sse2;
        pixf->intra_sad_x3_8x16c  = x264_intra_sad_x3_8x16c_sse2;
        if( cpu&X264_CPU_CACHELINE_64 )
        {
            INIT2( ssd, _sse2); /* faster for width 16 on p4 */
#if ARCH_X86
            INIT2( sad, _cache64_sse2 );
            INIT2( sad_x3, _cache64_sse2 );
            INIT2( sad_x4, _cache64_sse2 );
#endif
            if( cpu&X264_CPU_SSE2_IS_FAST )
            {
                pixf->sad_x3[PIXEL_8x16] = x264_pixel_sad_x3_8x16_cache64_sse2;
                pixf->sad_x4[PIXEL_8x16] = x264_pixel_sad_x4_8x16_cache64_sse2;
            }
        }

        if( cpu&X264_CPU_SSE_MISALIGN )
        {
            INIT2( sad_x3, _sse2_misalign );
            INIT2( sad_x4, _sse2_misalign );
        }
    }

    if( cpu&X264_CPU_SSE2_IS_FAST && !(cpu&X264_CPU_CACHELINE_64) )
    {
        pixf->sad_aligned[PIXEL_8x16] = x264_pixel_sad_8x16_sse2;
        pixf->sad[PIXEL_8x16] = x264_pixel_sad_8x16_sse2;
        pixf->sad_x3[PIXEL_8x16] = x264_pixel_sad_x3_8x16_sse2;
        pixf->sad_x3[PIXEL_8x8] = x264_pixel_sad_x3_8x8_sse2;
        pixf->sad_x3[PIXEL_8x4] = x264_pixel_sad_x3_8x4_sse2;
        pixf->sad_x4[PIXEL_8x16] = x264_pixel_sad_x4_8x16_sse2;
        pixf->sad_x4[PIXEL_8x8] = x264_pixel_sad_x4_8x8_sse2;
        pixf->sad_x4[PIXEL_8x4] = x264_pixel_sad_x4_8x4_sse2;
    }

    if( (cpu&X264_CPU_SSE3) && (cpu&X264_CPU_CACHELINE_64) )
    {
        INIT2( sad, _sse3 );
        INIT2( sad_x3, _sse3 );
        INIT2( sad_x4, _sse3 );
    }

    if( cpu&X264_CPU_SSSE3 )
    {
        if( !(cpu&X264_CPU_STACK_MOD4) )
        {
            INIT4( hadamard_ac, _ssse3 );
            pixf->intra_sad_x9_4x4  = x264_intra_sad_x9_4x4_ssse3;
            pixf->intra_satd_x9_4x4 = x264_intra_satd_x9_4x4_ssse3;
            pixf->intra_sad_x9_8x8  = x264_intra_sad_x9_8x8_ssse3;
#if ARCH_X86_64
            pixf->intra_sa8d_x9_8x8 = x264_intra_sa8d_x9_8x8_ssse3;
#endif
        }
        INIT_ADS( _ssse3 );
        if( !(cpu&X264_CPU_SLOW_ATOM) )
        {
            INIT8( ssd, _ssse3 );
            pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3;
            pixf->sa8d[PIXEL_8x8]  = x264_pixel_sa8d_8x8_ssse3;
            INIT8( satd, _ssse3 );
            INIT7( satd_x3, _ssse3 );
            INIT7( satd_x4, _ssse3 );
        }
        pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_ssse3;
        pixf->intra_sad_x3_16x16  = x264_intra_sad_x3_16x16_ssse3;
        pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_ssse3;
        pixf->intra_satd_x3_8x8c  = x264_intra_satd_x3_8x8c_ssse3;
        pixf->intra_sad_x3_8x8c   = x264_intra_sad_x3_8x8c_ssse3;
        pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_ssse3;
        pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_ssse3;
        if( cpu&X264_CPU_CACHELINE_64 )
        {
            INIT2( sad, _cache64_ssse3 );
            INIT2( sad_x3, _cache64_ssse3 );
            INIT2( sad_x4, _cache64_ssse3 );
        }
        if( cpu&X264_CPU_SLOW_ATOM || !(cpu&X264_CPU_SHUFFLE_IS_FAST) )
        {
            INIT5( ssd, _sse2 ); /* on conroe, sse2 is faster for width8/16 */
        }
    }

    if( cpu&X264_CPU_SSE4 )
    {
        INIT8( satd, _sse4 );
        INIT7( satd_x3, _sse4 );
        INIT7( satd_x4, _sse4 );
        if( !(cpu&X264_CPU_STACK_MOD4) )
        {
            INIT4( hadamard_ac, _sse4 );
            pixf->intra_sad_x9_4x4  = x264_intra_sad_x9_4x4_sse4;
            pixf->intra_satd_x9_4x4 = x264_intra_satd_x9_4x4_sse4;
            pixf->intra_sad_x9_8x8  = x264_intra_sad_x9_8x8_sse4;
#if ARCH_X86_64
            pixf->intra_sa8d_x9_8x8 = x264_intra_sa8d_x9_8x8_sse4;
#endif
        }
        pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_sse4;
        pixf->sa8d[PIXEL_8x8]  = x264_pixel_sa8d_8x8_sse4;
        pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_sse4;
    }

    if( cpu&X264_CPU_AVX )
    {
        INIT8( satd, _avx );
        INIT7( satd_x3, _avx );
        INIT7( satd_x4, _avx );
        INIT_ADS( _avx );
        if( !(cpu&X264_CPU_STACK_MOD4) )
        {
            INIT4( hadamard_ac, _avx );
            pixf->intra_sad_x9_4x4  = x264_intra_sad_x9_4x4_avx;
            pixf->intra_satd_x9_4x4 = x264_intra_satd_x9_4x4_avx;
            pixf->intra_sad_x9_8x8  = x264_intra_sad_x9_8x8_avx;
#if ARCH_X86_64
            pixf->intra_sa8d_x9_8x8 = x264_intra_sa8d_x9_8x8_avx;
#endif
        }
        INIT5( ssd, _avx );
        pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_avx;
        pixf->sa8d[PIXEL_8x8]  = x264_pixel_sa8d_8x8_avx;
        pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_avx;
        pixf->ssd_nv12_core    = x264_pixel_ssd_nv12_core_avx;
        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx;
        pixf->var[PIXEL_8x16]  = x264_pixel_var_8x16_avx;
        pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8_avx;
        pixf->ssim_4x4x2_core  = x264_pixel_ssim_4x4x2_core_avx;
        pixf->ssim_end4        = x264_pixel_ssim_end4_avx;
    }

    if( cpu&X264_CPU_XOP )
    {
        INIT7( satd, _xop );
        INIT7( satd_x3, _xop );
        INIT7( satd_x4, _xop );
        if( !(cpu&X264_CPU_STACK_MOD4) )
        {
            INIT4( hadamard_ac, _xop );
            pixf->intra_satd_x9_4x4 = x264_intra_satd_x9_4x4_xop;
        }
        INIT5( ssd, _xop );
        pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_xop;
        pixf->sa8d[PIXEL_8x8]  = x264_pixel_sa8d_8x8_xop;
        pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_xop;
        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_xop;
        pixf->var[PIXEL_8x16]  = x264_pixel_var_8x16_xop;
        pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8_xop;
        pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_xop;
        pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_xop;
    }
#endif //HAVE_MMX

#if HAVE_ARMV6
    if( cpu&X264_CPU_ARMV6 )
    {
        pixf->sad[PIXEL_4x8] = x264_pixel_sad_4x8_armv6;
        pixf->sad[PIXEL_4x4] = x264_pixel_sad_4x4_armv6;
        pixf->sad_aligned[PIXEL_4x8] = x264_pixel_sad_4x8_armv6;
        pixf->sad_aligned[PIXEL_4x4] = x264_pixel_sad_4x4_armv6;
    }
    if( cpu&X264_CPU_NEON )
    {
        INIT5( sad, _neon );
        INIT5( sad_aligned, _neon );
        INIT7( sad_x3, _neon );
        INIT7( sad_x4, _neon );
        INIT7( ssd, _neon );
        INIT7( satd, _neon );
        INIT7( satd_x3, _neon );
        INIT7( satd_x4, _neon );
        INIT4( hadamard_ac, _neon );
        pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8_neon;
        pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_neon;
        pixf->var[PIXEL_8x8]    = x264_pixel_var_8x8_neon;
        pixf->var[PIXEL_16x16]  = x264_pixel_var_16x16_neon;
        pixf->var2[PIXEL_8x8]   = x264_pixel_var2_8x8_neon;

        pixf->ssim_4x4x2_core   = x264_pixel_ssim_4x4x2_core_neon;
        pixf->ssim_end4         = x264_pixel_ssim_end4_neon;

        if( cpu&X264_CPU_FAST_NEON_MRC )
        {
            pixf->sad[PIXEL_4x8] = x264_pixel_sad_4x8_neon;
            pixf->sad[PIXEL_4x4] = x264_pixel_sad_4x4_neon;
            pixf->sad_aligned[PIXEL_4x8] = x264_pixel_sad_aligned_4x8_neon;
            pixf->sad_aligned[PIXEL_4x4] = x264_pixel_sad_aligned_4x4_neon;
        }
        else    // really just scheduled for dual issue / A8
        {
            INIT5( sad_aligned, _neon_dual );
        }
    }
#endif
#endif // HIGH_BIT_DEPTH
#if HAVE_ALTIVEC
    if( cpu&X264_CPU_ALTIVEC )
    {
        x264_pixel_altivec_init( pixf );
    }
#endif
#if !HIGH_BIT_DEPTH
#if ARCH_UltraSPARC
    INIT4( sad, _vis );
    INIT4( sad_x3, _vis );
    INIT4( sad_x4, _vis );
#endif
#endif // !HIGH_BIT_DEPTH

    pixf->ads[PIXEL_8x16] =
    pixf->ads[PIXEL_8x4] =
    pixf->ads[PIXEL_4x8] = pixf->ads[PIXEL_16x8];
    pixf->ads[PIXEL_4x4] = pixf->ads[PIXEL_8x8];
}
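/* Editorial usage sketch (hypothetical caller, not part of this file):
 *
 *     x264_pixel_function_t pixf;
 *     x264_pixel_init( x264_cpu_detect(), &pixf );
 *     int cost = pixf.sad[PIXEL_16x16]( fenc, FENC_STRIDE, fref, i_stride );
 *
 * x264_cpu_detect() is the runtime CPU-flag probe from common/cpu.h; note
 * that some pointers (e.g. the intra_*_x9 variants) may legitimately remain
 * NULL after init, so callers are expected to check them before use. */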

x264-snapshot-20120103-2245-stable/common/osdep.h
/*****************************************************************************
 * osdep.h: platform-specific code
 *****************************************************************************
 * Copyright (C) 2007-2011 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *          Laurent Aimar <fenrir@via.ecp.fr>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#ifndef X264_OSDEP_H
#define X264_OSDEP_H

#define _LARGEFILE_SOURCE 1
#define _FILE_OFFSET_BITS 64
#include <stdio.h>
#include <sys/stat.h>

#include "config.h"

#if HAVE_STDINT_H
#include <stdint.h>
#else
#include <inttypes.h>
#endif

#if !HAVE_LOG2F
#define log2f(x) (logf(x)/0.693147180559945f)
#define log2(x) (log(x)/0.693147180559945)
#endif

#ifdef _WIN32
#include <io.h>    // _setmode()
#include <fcntl.h> // _O_BINARY
#endif

#ifdef __ICL
#define inline __inline
#define strcasecmp _stricmp
#define strncasecmp _strnicmp
#define snprintf _snprintf
#define strtok_r strtok_s
#define S_ISREG(x) (((x) & S_IFMT) == S_IFREG)
#endif

#ifdef __INTEL_COMPILER
#include <mathimf.h>
#else
#include <math.h>
#endif

#if (defined(__GNUC__) || defined(__INTEL_COMPILER)) && (ARCH_X86 || ARCH_X86_64)
#define HAVE_X86_INLINE_ASM 1
#endif

#if !defined(isfinite) && (SYS_OPENBSD || SYS_SunOS)
#define isfinite finite
#endif
#ifdef _WIN32
#define rename(src,dst) (unlink(dst), rename(src,dst)) // POSIX says that rename() removes the destination, but win32 doesn't.
#ifndef strtok_r
#define strtok_r(str,delim,save) strtok(str,delim)
#endif
#endif

#ifdef __ICL
#define DECLARE_ALIGNED( var, n ) __declspec(align(n)) var
#else
#define DECLARE_ALIGNED( var, n ) var __attribute__((aligned(n)))
#endif
#define ALIGNED_16( var ) DECLARE_ALIGNED( var, 16 )
#define ALIGNED_8( var )  DECLARE_ALIGNED( var, 8 )
#define ALIGNED_4( var )  DECLARE_ALIGNED( var, 4 )

// ARM compilers don't reliably align stack variables
// - EABI requires only 8 byte stack alignment to be maintained
// - gcc can't align stack variables to more than that, even if the stack were correctly aligned outside the function
// - armcc can't either, but is nice enough to actually tell you so
// - Apple gcc only maintains 4 byte alignment
// - llvm can align the stack, but only in svn and (unrelated) it exposes bugs in all released GNU binutils...

#define ALIGNED_ARRAY_EMU( mask, type, name, sub1, ... )\
    uint8_t name##_u [sizeof(type sub1 __VA_ARGS__) + mask]; \
    type (*name) __VA_ARGS__ = (void*)((intptr_t)(name##_u+mask) & ~mask)
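
/* The emulation over-allocates a raw byte buffer by `mask` bytes, then rounds
 * the pointer up to the next (mask+1)-byte boundary: for mask = 15,
 * (p+15) & ~15 is the first 16-byte-aligned address inside name##_u, so the
 * aligned array always fits entirely within the buffer. */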

#if ARCH_ARM && SYS_MACOSX
#define ALIGNED_ARRAY_8( ... ) ALIGNED_ARRAY_EMU( 7, __VA_ARGS__ )
#else
#define ALIGNED_ARRAY_8( type, name, sub1, ... )\
    ALIGNED_8( type name sub1 __VA_ARGS__ )
#endif

#if ARCH_ARM
#define ALIGNED_ARRAY_16( ... ) ALIGNED_ARRAY_EMU( 15, __VA_ARGS__ )
#else
#define ALIGNED_ARRAY_16( type, name, sub1, ... )\
    ALIGNED_16( type name sub1 __VA_ARGS__ )
#endif

#define ALIGNED_ARRAY_32( ... ) ALIGNED_ARRAY_EMU( 31, __VA_ARGS__ )
#define ALIGNED_ARRAY_64( ... ) ALIGNED_ARRAY_EMU( 63, __VA_ARGS__ )

#define UNINIT(x) x=x

#if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 0)
#define UNUSED __attribute__((unused))
#define ALWAYS_INLINE __attribute__((always_inline)) inline
#define NOINLINE __attribute__((noinline))
#define MAY_ALIAS __attribute__((may_alias))
#define x264_constant_p(x) __builtin_constant_p(x)
#define x264_nonconstant_p(x) (!__builtin_constant_p(x))
#else
#ifdef __ICL
#define ALWAYS_INLINE __forceinline
#define NOINLINE __declspec(noinline)
#else
#define ALWAYS_INLINE inline
#define NOINLINE
#endif
#define UNUSED
#define MAY_ALIAS
#define x264_constant_p(x) 0
#define x264_nonconstant_p(x) 0
#endif

/* threads */
#if HAVE_BEOSTHREAD
#include <kernel/OS.h>
#define x264_pthread_t               thread_id
static inline int x264_pthread_create( x264_pthread_t *t, void *a, void *(*f)(void *), void *d )
{
     *t = spawn_thread( f, "", 10, d );
     if( *t < B_NO_ERROR )
         return -1;
     resume_thread( *t );
     return 0;
}
#define x264_pthread_join(t,s)       { long tmp; \
                                       wait_for_thread(t,(s)?(long*)(*(s)):&tmp); }

#elif HAVE_POSIXTHREAD
#include <pthread.h>
#define x264_pthread_t               pthread_t
#define x264_pthread_create          pthread_create
#define x264_pthread_join            pthread_join
#define x264_pthread_mutex_t         pthread_mutex_t
#define x264_pthread_mutex_init      pthread_mutex_init
#define x264_pthread_mutex_destroy   pthread_mutex_destroy
#define x264_pthread_mutex_lock      pthread_mutex_lock
#define x264_pthread_mutex_unlock    pthread_mutex_unlock
#define x264_pthread_cond_t          pthread_cond_t
#define x264_pthread_cond_init       pthread_cond_init
#define x264_pthread_cond_destroy    pthread_cond_destroy
#define x264_pthread_cond_broadcast  pthread_cond_broadcast
#define x264_pthread_cond_wait       pthread_cond_wait
#define x264_pthread_attr_t          pthread_attr_t
#define x264_pthread_attr_init       pthread_attr_init
#define x264_pthread_attr_destroy    pthread_attr_destroy
#define x264_pthread_num_processors_np pthread_num_processors_np
#define X264_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER

#elif HAVE_WIN32THREAD
#include "win32thread.h"

#else
#define x264_pthread_t               int
#define x264_pthread_create(t,u,f,d) 0
#define x264_pthread_join(t,s)
#endif //HAVE_*THREAD

#if !HAVE_POSIXTHREAD && !HAVE_WIN32THREAD
#define x264_pthread_mutex_t         int
#define x264_pthread_mutex_init(m,f) 0
#define x264_pthread_mutex_destroy(m)
#define x264_pthread_mutex_lock(m)
#define x264_pthread_mutex_unlock(m)
#define x264_pthread_cond_t          int
#define x264_pthread_cond_init(c,f)  0
#define x264_pthread_cond_destroy(c)
#define x264_pthread_cond_broadcast(c)
#define x264_pthread_cond_wait(c,m)
#define x264_pthread_attr_t          int
#define x264_pthread_attr_init(a)    0
#define x264_pthread_attr_destroy(a)
#define X264_PTHREAD_MUTEX_INITIALIZER 0
#endif

#if HAVE_WIN32THREAD || PTW32_STATIC_LIB
int x264_threading_init( void );
#else
#define x264_threading_init() 0
#endif

#define WORD_SIZE sizeof(void*)

#define asm __asm__

#if WORDS_BIGENDIAN
#define endian_fix(x) (x)
#define endian_fix64(x) (x)
#define endian_fix32(x) (x)
#define endian_fix16(x) (x)
#else
#if HAVE_X86_INLINE_ASM && HAVE_MMX
static ALWAYS_INLINE uint32_t endian_fix32( uint32_t x )
{
    asm("bswap %0":"+r"(x));
    return x;
}
#elif defined(__GNUC__) && HAVE_ARMV6
static ALWAYS_INLINE uint32_t endian_fix32( uint32_t x )
{
    asm("rev %0, %0":"+r"(x));
    return x;
}
#else
static ALWAYS_INLINE uint32_t endian_fix32( uint32_t x )
{
    return (x<<24) + ((x<<8)&0xff0000) + ((x>>8)&0xff00) + (x>>24);
}
#endif
#if HAVE_X86_INLINE_ASM && ARCH_X86_64
static ALWAYS_INLINE uint64_t endian_fix64( uint64_t x )
{
    asm("bswap %0":"+r"(x));
    return x;
}
#else
static ALWAYS_INLINE uint64_t endian_fix64( uint64_t x )
{
    return endian_fix32(x>>32) + ((uint64_t)endian_fix32(x)<<32);
}
#endif
static ALWAYS_INLINE intptr_t endian_fix( intptr_t x )
{
    return WORD_SIZE == 8 ? endian_fix64(x) : endian_fix32(x);
}
static ALWAYS_INLINE uint16_t endian_fix16( uint16_t x )
{
    return (x<<8)|(x>>8);
}
#endif

#if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 3)
#define x264_clz(x) __builtin_clz(x)
#define x264_ctz(x) __builtin_ctz(x)
#else
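/* Branchless binary-search fallbacks: at each step, the subtraction borrows
 * into the sign bit iff the inspected half/byte/nibble is zero, which turns
 * into a shift amount of 16/8/4 accumulated in z; a 16-entry LUT resolves
 * the final 4 bits. */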
static int ALWAYS_INLINE x264_clz( uint32_t x )
{
    static uint8_t lut[16] = {4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0};
    int y, z = (((x >> 16) - 1) >> 27) & 16;
    x >>= z^16;
    z += y = ((x - 0x100) >> 28) & 8;
    x >>= y^8;
    z += y = ((x - 0x10) >> 29) & 4;
    x >>= y^4;
    return z + lut[x];
}

static int ALWAYS_INLINE x264_ctz( uint32_t x )
{
    static uint8_t lut[16] = {4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0};
    int y, z = (((x & 0xffff) - 1) >> 27) & 16;
    x >>= z;
    z += y = (((x & 0xff) - 1) >> 28) & 8;
    x >>= y;
    z += y = (((x & 0xf) - 1) >> 29) & 4;
    x >>= y;
    return z + lut[x&0xf];
}
#endif

#if HAVE_X86_INLINE_ASM && HAVE_MMX
/* Don't use __builtin_prefetch; even as recent as 4.3.4, GCC seems incapable of
 * using complex address modes properly unless we use inline asm. */
static ALWAYS_INLINE void x264_prefetch( void *p )
{
    asm volatile( "prefetcht0 %0"::"m"(*(uint8_t*)p) );
}
/* We require that prefetch not fault on invalid reads, so we only enable it on
 * known architectures. */
#elif defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 1) &&\
      (ARCH_X86 || ARCH_X86_64 || ARCH_ARM || ARCH_PPC)
#define x264_prefetch(x) __builtin_prefetch(x)
#else
#define x264_prefetch(x)
#endif

#if HAVE_POSIXTHREAD
#if SYS_WINDOWS
#define x264_lower_thread_priority(p)\
{\
    x264_pthread_t handle = pthread_self();\
    struct sched_param sp;\
    int policy = SCHED_OTHER;\
    pthread_getschedparam( handle, &policy, &sp );\
    sp.sched_priority -= p;\
    pthread_setschedparam( handle, policy, &sp );\
}
#else
#include <unistd.h>
#define x264_lower_thread_priority(p) { UNUSED int nice_ret = nice(p); }
#endif /* SYS_WINDOWS */
#elif HAVE_WIN32THREAD
#define x264_lower_thread_priority(p) SetThreadPriority( GetCurrentThread(), X264_MAX( -2, -p ) )
#else
#define x264_lower_thread_priority(p)
#endif

/* Returns 1 for a regular file, 0 otherwise, -1 on fstat() failure. */
static inline int x264_is_regular_file( FILE *filehandle )
{
    struct stat file_stat;
    if( fstat( fileno( filehandle ), &file_stat ) )
        return -1;
    return S_ISREG( file_stat.st_mode );
}

/* Returns 1 for a regular file, 0 otherwise, -1 on stat() failure. */
static inline int x264_is_regular_file_path( const char *filename )
{
    struct stat file_stat;
    if( stat( filename, &file_stat ) )
        return -1;
    return S_ISREG( file_stat.st_mode );
}

#endif /* X264_OSDEP_H */
x264-snapshot-20120103-2245-stable/common/osdep.c
/*****************************************************************************
 * osdep.c: platform-specific code
 *****************************************************************************
 * Copyright (C) 2003-2011 x264 project
 *
 * Authors: Steven Walters <kemuri9@gmail.com>
 *          Laurent Aimar <fenrir@via.ecp.fr>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "common.h"

#if SYS_WINDOWS
#include <sys/types.h>
#include <sys/timeb.h>
#else
#include <sys/time.h>
#endif
#include <time.h>

#if PTW32_STATIC_LIB
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
/* this is a global in pthread-win32 to indicate if it has been initialized or not */
extern int ptw32_processInitialized;
#endif

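/* Wall-clock time in microseconds (both branches scale their native tick
 * resolution up to usec). */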
int64_t x264_mdate( void )
{
#if SYS_WINDOWS
    struct timeb tb;
    ftime( &tb );
    return ((int64_t)tb.time * 1000 + (int64_t)tb.millitm) * 1000;
#else
    struct timeval tv_date;
    gettimeofday( &tv_date, NULL );
    return (int64_t)tv_date.tv_sec * 1000000 + (int64_t)tv_date.tv_usec;
#endif
}

#if HAVE_WIN32THREAD || PTW32_STATIC_LIB
/* whether the threading library has been initialized */
static volatile LONG x264_threading_is_init = 0;

static void x264_threading_destroy( void )
{
#if PTW32_STATIC_LIB
    pthread_win32_thread_detach_np();
    pthread_win32_process_detach_np();
#else
    x264_win32_threading_destroy();
#endif
}

int x264_threading_init( void )
{
    /* if already init, then do nothing */
    if( InterlockedCompareExchange( &x264_threading_is_init, 1, 0 ) )
        return 0;
#if PTW32_STATIC_LIB
    /* if static pthread-win32 is already initialized, then do nothing */
    if( ptw32_processInitialized )
        return 0;
    if( !pthread_win32_process_attach_np() )
        return -1;
#else
    if( x264_win32_threading_init() )
        return -1;
#endif
    /* register cleanup to run at process termination */
    atexit( x264_threading_destroy );

    return 0;
}
#endif

#ifdef __INTEL_COMPILER
/* Agner's patch to Intel's CPU dispatcher from pages 131-132 of
 * http://agner.org/optimize/optimizing_cpp.pdf (2011-01-30)
 * adapted to x264's cpu schema. */

// Global variable indicating cpu
int __intel_cpu_indicator = 0;
// CPU dispatcher function
void __intel_cpu_indicator_init( void )
{
    unsigned int cpu = x264_cpu_detect();
    if( cpu&X264_CPU_AVX )
        __intel_cpu_indicator = 0x20000;
    else if( cpu&X264_CPU_SSE42 )
        __intel_cpu_indicator = 0x8000;
    else if( cpu&X264_CPU_SSE4 )
        __intel_cpu_indicator = 0x2000;
    else if( cpu&X264_CPU_SSSE3 )
        __intel_cpu_indicator = 0x1000;
    else if( cpu&X264_CPU_SSE3 )
        __intel_cpu_indicator = 0x800;
    else if( cpu&X264_CPU_SSE2 && !(cpu&X264_CPU_SSE2_IS_SLOW) )
        __intel_cpu_indicator = 0x200;
    else if( cpu&X264_CPU_SSE )
        __intel_cpu_indicator = 0x80;
    else if( cpu&X264_CPU_MMX2 )
        __intel_cpu_indicator = 8;
    else
        __intel_cpu_indicator = 1;
}
#endif
x264-snapshot-20120103-2245-stable/common/mvpred.c
/*****************************************************************************
 * mvpred.c: motion vector prediction
 *****************************************************************************
 * Copyright (C) 2003-2011 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *          Jason Garrett-Glaser <darkshikari@gmail.com>
 *          Laurent Aimar <fenrir@via.ecp.fr>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "common.h"

void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int16_t mvp[2] )
{
    const int i8 = x264_scan8[idx];
    const int i_ref= h->mb.cache.ref[i_list][i8];
    int     i_refa = h->mb.cache.ref[i_list][i8 - 1];
    int16_t *mv_a  = h->mb.cache.mv[i_list][i8 - 1];
    int     i_refb = h->mb.cache.ref[i_list][i8 - 8];
    int16_t *mv_b  = h->mb.cache.mv[i_list][i8 - 8];
    int     i_refc = h->mb.cache.ref[i_list][i8 - 8 + i_width];
    int16_t *mv_c  = h->mb.cache.mv[i_list][i8 - 8 + i_width];

    // Partitions not yet reached in scan order are unavailable.
    if( (idx&3) >= 2 + (i_width&1) || i_refc == -2 )
    {
        i_refc = h->mb.cache.ref[i_list][i8 - 8 - 1];
        mv_c   = h->mb.cache.mv[i_list][i8 - 8 - 1];

        if( SLICE_MBAFF
            && h->mb.cache.ref[i_list][x264_scan8[0]-1] != -2
            && MB_INTERLACED != h->mb.field[h->mb.i_mb_left_xy[0]] )
        {
            if( idx == 2 )
            {
                mv_c = h->mb.cache.topright_mv[i_list][0];
                i_refc = h->mb.cache.topright_ref[i_list][0];
            }
            else if( idx == 8 )
            {
                mv_c = h->mb.cache.topright_mv[i_list][1];
                i_refc = h->mb.cache.topright_ref[i_list][1];
            }
            else if( idx == 10 )
            {
                mv_c = h->mb.cache.topright_mv[i_list][2];
                i_refc = h->mb.cache.topright_ref[i_list][2];
            }
        }
    }
    if( h->mb.i_partition == D_16x8 )
    {
        if( idx == 0 )
        {
            if( i_refb == i_ref )
            {
                CP32( mvp, mv_b );
                return;
            }
        }
        else
        {
            if( i_refa == i_ref )
            {
                CP32( mvp, mv_a );
                return;
            }
        }
    }
    else if( h->mb.i_partition == D_8x16 )
    {
        if( idx == 0 )
        {
            if( i_refa == i_ref )
            {
                CP32( mvp, mv_a );
                return;
            }
        }
        else
        {
            if( i_refc == i_ref )
            {
                CP32( mvp, mv_c );
                return;
            }
        }
    }

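    /* Standard H.264 median MV prediction (cf. spec 8.4.1.3): if exactly one
     * neighbour shares the target reference, copy its MV; if only A is
     * available (B and C both outside the slice), use A; otherwise take the
     * component-wise median of A, B and C. */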
    int i_count = (i_refa == i_ref) + (i_refb == i_ref) + (i_refc == i_ref);

    if( i_count > 1 )
    {
median:
        x264_median_mv( mvp, mv_a, mv_b, mv_c );
    }
    else if( i_count == 1 )
    {
        if( i_refa == i_ref )
            CP32( mvp, mv_a );
        else if( i_refb == i_ref )
            CP32( mvp, mv_b );
        else
            CP32( mvp, mv_c );
    }
    else if( i_refb == -2 && i_refc == -2 && i_refa != -2 )
        CP32( mvp, mv_a );
    else
        goto median;
}

void x264_mb_predict_mv_16x16( x264_t *h, int i_list, int i_ref, int16_t mvp[2] )
{
    int     i_refa = h->mb.cache.ref[i_list][X264_SCAN8_0 - 1];
    int16_t *mv_a  = h->mb.cache.mv[i_list][X264_SCAN8_0 - 1];
    int     i_refb = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8];
    int16_t *mv_b  = h->mb.cache.mv[i_list][X264_SCAN8_0 - 8];
    int     i_refc = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8 + 4];
    int16_t *mv_c  = h->mb.cache.mv[i_list][X264_SCAN8_0 - 8 + 4];
    if( i_refc == -2 )
    {
        i_refc = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8 - 1];
        mv_c   = h->mb.cache.mv[i_list][X264_SCAN8_0 - 8 - 1];
    }

    int i_count = (i_refa == i_ref) + (i_refb == i_ref) + (i_refc == i_ref);

    if( i_count > 1 )
    {
median:
        x264_median_mv( mvp, mv_a, mv_b, mv_c );
    }
    else if( i_count == 1 )
    {
        if( i_refa == i_ref )
            CP32( mvp, mv_a );
        else if( i_refb == i_ref )
            CP32( mvp, mv_b );
        else
            CP32( mvp, mv_c );
    }
    else if( i_refb == -2 && i_refc == -2 && i_refa != -2 )
        CP32( mvp, mv_a );
    else
        goto median;
}


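/* P_SKIP motion vector (cf. spec 8.4.1.1): forced to zero if the left or top
 * neighbour is unavailable, or if either one uses ref 0 with a zero MV;
 * otherwise it's the ordinary 16x16 median prediction for ref 0. */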
void x264_mb_predict_mv_pskip( x264_t *h, int16_t mv[2] )
{
    int     i_refa = h->mb.cache.ref[0][X264_SCAN8_0 - 1];
    int     i_refb = h->mb.cache.ref[0][X264_SCAN8_0 - 8];
    int16_t *mv_a  = h->mb.cache.mv[0][X264_SCAN8_0 - 1];
    int16_t *mv_b  = h->mb.cache.mv[0][X264_SCAN8_0 - 8];

    if( i_refa == -2 || i_refb == -2 ||
        !( i_refa | M32( mv_a ) ) ||
        !( i_refb | M32( mv_b ) ) )
    {
        M32( mv ) = 0;
    }
    else
        x264_mb_predict_mv_16x16( h, 0, 0, mv );
}

static int x264_mb_predict_mv_direct16x16_temporal( x264_t *h )
{
    int mb_x = h->mb.i_mb_x;
    int mb_y = h->mb.i_mb_y;
    int mb_xy = h->mb.i_mb_xy;
    int type_col[2] = { h->fref[1][0]->mb_type[mb_xy], h->fref[1][0]->mb_type[mb_xy] };
    int partition_col[2] = { h->fref[1][0]->mb_partition[mb_xy], h->fref[1][0]->mb_partition[mb_xy] };
    int preshift = MB_INTERLACED;
    int postshift = MB_INTERLACED;
    int offset = 1;
    int yshift = 1;
    h->mb.i_partition = partition_col[0];
    if( PARAM_INTERLACED && h->fref[1][0]->field[mb_xy] != MB_INTERLACED )
    {
        if( MB_INTERLACED )
        {
            mb_y = h->mb.i_mb_y&~1;
            mb_xy = mb_x + h->mb.i_mb_stride * mb_y;
            type_col[0] = h->fref[1][0]->mb_type[mb_xy];
            type_col[1] = h->fref[1][0]->mb_type[mb_xy + h->mb.i_mb_stride];
            partition_col[0] = h->fref[1][0]->mb_partition[mb_xy];
            partition_col[1] = h->fref[1][0]->mb_partition[mb_xy + h->mb.i_mb_stride];
            preshift = 0;
            yshift = 0;

            if( (IS_INTRA(type_col[0]) || partition_col[0] == D_16x16) &&
                (IS_INTRA(type_col[1]) || partition_col[1] == D_16x16) &&
                partition_col[0] != D_8x8 )
                h->mb.i_partition = D_16x8;
            else
                h->mb.i_partition = D_8x8;
        }
        else
        {
            int cur_poc = h->fdec->i_poc + h->fdec->i_delta_poc[MB_INTERLACED&h->mb.i_mb_y&1];
            int col_parity = abs(h->fref[1][0]->i_poc + h->fref[1][0]->i_delta_poc[0] - cur_poc)
                          >= abs(h->fref[1][0]->i_poc + h->fref[1][0]->i_delta_poc[1] - cur_poc);
            mb_y = (h->mb.i_mb_y&~1) + col_parity;
            mb_xy = mb_x + h->mb.i_mb_stride * mb_y;
            type_col[0] = type_col[1] = h->fref[1][0]->mb_type[mb_xy];
            partition_col[0] = partition_col[1] = h->fref[1][0]->mb_partition[mb_xy];
            preshift = 1;
            yshift = 2;
            h->mb.i_partition = partition_col[0];
        }
        offset = 0;
    }
    int i_mb_4x4 = 16 * h->mb.i_mb_stride * mb_y + 4 * mb_x;
    int i_mb_8x8 =  4 * h->mb.i_mb_stride * mb_y + 2 * mb_x;

    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, 0 );

    /* Don't do any checks other than the ones we have to, based
     * on the size of the colocated partitions.
     * Depends on the enum order: D_8x8, D_16x8, D_8x16, D_16x16 */
    int max_i8 = (D_16x16 - h->mb.i_partition) + 1;
    int step = (h->mb.i_partition == D_16x8) + 1;
    int width = 4 >> ((D_16x16 - h->mb.i_partition)&1);
    int height = 4 >> ((D_16x16 - h->mb.i_partition)>>1);
    for( int i8 = 0; i8 < max_i8; i8 += step )
    {
        int x8 = i8&1;
        int y8 = i8>>1;
        int ypart = (SLICE_MBAFF && h->fref[1][0]->field[mb_xy] != MB_INTERLACED) ?
                    MB_INTERLACED ? y8*6 : 2*(h->mb.i_mb_y&1) + y8 :
                    3*y8;

        if( IS_INTRA( type_col[y8] ) )
        {
            x264_macroblock_cache_ref( h, 2*x8, 2*y8, width, height, 0, 0 );
            x264_macroblock_cache_mv(  h, 2*x8, 2*y8, width, height, 0, 0 );
            x264_macroblock_cache_mv(  h, 2*x8, 2*y8, width, height, 1, 0 );
            continue;
        }

        int i_part_8x8 = i_mb_8x8 + x8 + (ypart>>1) * h->mb.i_b8_stride;
        int i_ref1_ref = h->fref[1][0]->ref[0][i_part_8x8];
        int i_ref = (map_col_to_list0(i_ref1_ref>>preshift) << postshift) + (offset&i_ref1_ref&MB_INTERLACED);

        if( i_ref >= 0 )
        {
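            /* Temporal direct (cf. spec 8.4.1.2.3): scale the colocated MV by
             * the POC-distance ratio to get mvL0, and derive mvL1 as
             * mvL0 - mvCol so both point at the same spatial position. */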
            int dist_scale_factor = h->mb.dist_scale_factor[i_ref][0];
            int16_t *mv_col = h->fref[1][0]->mv[0][i_mb_4x4 + 3*x8 + ypart * h->mb.i_b4_stride];
            int16_t mv_y = (mv_col[1]<<yshift)/2;
            int l0x = ( dist_scale_factor * mv_col[0] + 128 ) >> 8;
            int l0y = ( dist_scale_factor * mv_y + 128 ) >> 8;
            if( h->param.i_threads > 1 && (l0y > h->mb.mv_max_spel[1] || l0y-mv_y > h->mb.mv_max_spel[1]) )
                return 0;
            x264_macroblock_cache_ref( h, 2*x8, 2*y8, width, height, 0, i_ref );
            x264_macroblock_cache_mv( h, 2*x8, 2*y8, width, height, 0, pack16to32_mask(l0x, l0y) );
            x264_macroblock_cache_mv( h, 2*x8, 2*y8, width, height, 1, pack16to32_mask(l0x-mv_col[0], l0y-mv_y) );
        }
        else
        {
            /* the collocated ref isn't in the current list0 */
            /* FIXME: we might still be able to use direct_8x8 on some partitions */
            /* FIXME: with B-pyramid + extensive ref list reordering
             *   (not currently used), we would also have to check
             *   l1mv1 like in spatial mode */
            return 0;
        }
    }

    return 1;
}

static ALWAYS_INLINE int x264_mb_predict_mv_direct16x16_spatial( x264_t *h, int b_interlaced )
{
    int8_t ref[2];
    ALIGNED_ARRAY_8( int16_t, mv,[2],[2] );
    for( int i_list = 0; i_list < 2; i_list++ )
    {
        int     i_refa = h->mb.cache.ref[i_list][X264_SCAN8_0 - 1];
        int16_t *mv_a  = h->mb.cache.mv[i_list][X264_SCAN8_0 - 1];
        int     i_refb = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8];
        int16_t *mv_b  = h->mb.cache.mv[i_list][X264_SCAN8_0 - 8];
        int     i_refc = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8 + 4];
        int16_t *mv_c  = h->mb.cache.mv[i_list][X264_SCAN8_0 - 8 + 4];
        if( i_refc == -2 )
        {
            i_refc = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8 - 1];
            mv_c   = h->mb.cache.mv[i_list][X264_SCAN8_0 - 8 - 1];
        }

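        /* The unsigned casts make the "unavailable" refs (-1/-2) compare as
         * huge values, so MIN3 returns the smallest valid reference index,
         * or a negative value only if no neighbour is available. */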
        int i_ref = X264_MIN3( (unsigned)i_refa, (unsigned)i_refb, (unsigned)i_refc );
        if( i_ref < 0 )
        {
            i_ref = -1;
            M32( mv[i_list] ) = 0;
        }
        else
        {
            /* Same as x264_mb_predict_mv_16x16, but simplified to eliminate cases
             * not relevant to spatial direct. */
            int i_count = (i_refa == i_ref) + (i_refb == i_ref) + (i_refc == i_ref);

            if( i_count > 1 )
                x264_median_mv( mv[i_list], mv_a, mv_b, mv_c );
            else
            {
                if( i_refa == i_ref )
                    CP32( mv[i_list], mv_a );
                else if( i_refb == i_ref )
                    CP32( mv[i_list], mv_b );
                else
                    CP32( mv[i_list], mv_c );
            }
        }

        x264_macroblock_cache_ref( h, 0, 0, 4, 4, i_list, i_ref );
        x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, i_list, mv[i_list] );
        ref[i_list] = i_ref;
    }

    int mb_x = h->mb.i_mb_x;
    int mb_y = h->mb.i_mb_y;
    int mb_xy = h->mb.i_mb_xy;
    int type_col[2] = { h->fref[1][0]->mb_type[mb_xy], h->fref[1][0]->mb_type[mb_xy] };
    int partition_col[2] = { h->fref[1][0]->mb_partition[mb_xy], h->fref[1][0]->mb_partition[mb_xy] };
    h->mb.i_partition = partition_col[0];
    if( b_interlaced && h->fref[1][0]->field[mb_xy] != MB_INTERLACED )
    {
        if( MB_INTERLACED )
        {
            mb_y = h->mb.i_mb_y&~1;
            mb_xy = mb_x + h->mb.i_mb_stride * mb_y;
            type_col[0] = h->fref[1][0]->mb_type[mb_xy];
            type_col[1] = h->fref[1][0]->mb_type[mb_xy + h->mb.i_mb_stride];
            partition_col[0] = h->fref[1][0]->mb_partition[mb_xy];
            partition_col[1] = h->fref[1][0]->mb_partition[mb_xy + h->mb.i_mb_stride];

            if( (IS_INTRA(type_col[0]) || partition_col[0] == D_16x16) &&
                (IS_INTRA(type_col[1]) || partition_col[1] == D_16x16) &&
                partition_col[0] != D_8x8 )
                h->mb.i_partition = D_16x8;
            else
                h->mb.i_partition = D_8x8;
        }
        else
        {
            int cur_poc = h->fdec->i_poc + h->fdec->i_delta_poc[MB_INTERLACED&h->mb.i_mb_y&1];
            int col_parity = abs(h->fref[1][0]->i_poc + h->fref[1][0]->i_delta_poc[0] - cur_poc)
                          >= abs(h->fref[1][0]->i_poc + h->fref[1][0]->i_delta_poc[1] - cur_poc);
            mb_y = (h->mb.i_mb_y&~1) + col_parity;
            mb_xy = mb_x + h->mb.i_mb_stride * mb_y;
            type_col[0] = type_col[1] = h->fref[1][0]->mb_type[mb_xy];
            partition_col[0] = partition_col[1] = h->fref[1][0]->mb_partition[mb_xy];
            h->mb.i_partition = partition_col[0];
        }
    }
    int i_mb_4x4 = b_interlaced ? 4 * (h->mb.i_b4_stride*mb_y + mb_x) : h->mb.i_b4_xy;
    int i_mb_8x8 = b_interlaced ? 2 * (h->mb.i_b8_stride*mb_y + mb_x) : h->mb.i_b8_xy;

    int8_t *l1ref0 = &h->fref[1][0]->ref[0][i_mb_8x8];
    int8_t *l1ref1 = &h->fref[1][0]->ref[1][i_mb_8x8];
    int16_t (*l1mv[2])[2] = { (int16_t (*)[2]) &h->fref[1][0]->mv[0][i_mb_4x4],
                              (int16_t (*)[2]) &h->fref[1][0]->mv[1][i_mb_4x4] };

    if( (M16( ref ) & 0x8080) == 0x8080 ) /* if( ref[0] < 0 && ref[1] < 0 ) */
    {
        x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
        x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, 0 );
        return 1;
    }

    if( h->param.i_threads > 1
        && ( mv[0][1] > h->mb.mv_max_spel[1]
          || mv[1][1] > h->mb.mv_max_spel[1] ) )
    {
#if 0
        fprintf(stderr, "direct_spatial: (%d,%d) (%d,%d) > %d \n",
                mv[0][0], mv[0][1], mv[1][0], mv[1][1],
                h->mb.mv_max_spel[1]);
#endif
        return 0;
    }

    if( !M64( mv ) || (!b_interlaced && IS_INTRA( type_col[0] )) || (ref[0]&&ref[1]) )
        return 1;

    /* Don't do any checks other than the ones we have to, based
     * on the size of the colocated partitions.
     * Depends on the enum order: D_8x8, D_16x8, D_8x16, D_16x16 */
    int max_i8 = (D_16x16 - h->mb.i_partition) + 1;
    int step = (h->mb.i_partition == D_16x8) + 1;
    int width = 4 >> ((D_16x16 - h->mb.i_partition)&1);
    int height = 4 >> ((D_16x16 - h->mb.i_partition)>>1);

    /* col_zero_flag */
    for( int i8 = 0; i8 < max_i8; i8 += step )
    {
        const int x8 = i8&1;
        const int y8 = i8>>1;
        int ypart = (b_interlaced && h->fref[1][0]->field[mb_xy] != MB_INTERLACED) ?
                    MB_INTERLACED ? y8*6 : 2*(h->mb.i_mb_y&1) + y8 :
                    3*y8;
        int o8 = x8 + (ypart>>1) * h->mb.i_b8_stride;
        int o4 = 3*x8 + ypart * h->mb.i_b4_stride;

        if( b_interlaced && IS_INTRA( type_col[y8] ) )
            continue;

        int idx;
        if( l1ref0[o8] == 0 )
            idx = 0;
        else if( l1ref0[o8] < 0 && l1ref1[o8] == 0 )
            idx = 1;
        else
            continue;

        if( abs( l1mv[idx][o4][0] ) <= 1 && abs( l1mv[idx][o4][1] ) <= 1 )
        {
            if( ref[0] == 0 ) x264_macroblock_cache_mv( h, 2*x8, 2*y8, width, height, 0, 0 );
            if( ref[1] == 0 ) x264_macroblock_cache_mv( h, 2*x8, 2*y8, width, height, 1, 0 );
        }
    }

    return 1;
}


static int x264_mb_predict_mv_direct16x16_spatial_interlaced( x264_t *h )
{
    return x264_mb_predict_mv_direct16x16_spatial( h, 1 );
}

static int x264_mb_predict_mv_direct16x16_spatial_progressive( x264_t *h )
{
    return x264_mb_predict_mv_direct16x16_spatial( h, 0 );
}

int x264_mb_predict_mv_direct16x16( x264_t *h, int *b_changed )
{
    int b_available;
    if( h->param.analyse.i_direct_mv_pred == X264_DIRECT_PRED_NONE )
        return 0;
    else if( h->sh.b_direct_spatial_mv_pred )
    {
        if( SLICE_MBAFF )
            b_available = x264_mb_predict_mv_direct16x16_spatial_interlaced( h );
        else
            b_available = x264_mb_predict_mv_direct16x16_spatial_progressive( h );
    }
    else
        b_available = x264_mb_predict_mv_direct16x16_temporal( h );

    if( b_changed != NULL && b_available )
    {
        int changed;

        changed  = M32( h->mb.cache.direct_mv[0][0] ) ^ M32( h->mb.cache.mv[0][x264_scan8[0]] );
        changed |= M32( h->mb.cache.direct_mv[1][0] ) ^ M32( h->mb.cache.mv[1][x264_scan8[0]] );
        changed |= h->mb.cache.direct_ref[0][0] ^ h->mb.cache.ref[0][x264_scan8[0]];
        changed |= h->mb.cache.direct_ref[1][0] ^ h->mb.cache.ref[1][x264_scan8[0]];
        if( !changed && h->mb.i_partition != D_16x16 )
        {
            changed |= M32( h->mb.cache.direct_mv[0][3] ) ^ M32( h->mb.cache.mv[0][x264_scan8[12]] );
            changed |= M32( h->mb.cache.direct_mv[1][3] ) ^ M32( h->mb.cache.mv[1][x264_scan8[12]] );
            changed |= h->mb.cache.direct_ref[0][3] ^ h->mb.cache.ref[0][x264_scan8[12]];
            changed |= h->mb.cache.direct_ref[1][3] ^ h->mb.cache.ref[1][x264_scan8[12]];
        }
        if( !changed && h->mb.i_partition == D_8x8 )
        {
            changed |= M32( h->mb.cache.direct_mv[0][1] ) ^ M32( h->mb.cache.mv[0][x264_scan8[4]] );
            changed |= M32( h->mb.cache.direct_mv[1][1] ) ^ M32( h->mb.cache.mv[1][x264_scan8[4]] );
            changed |= M32( h->mb.cache.direct_mv[0][2] ) ^ M32( h->mb.cache.mv[0][x264_scan8[8]] );
            changed |= M32( h->mb.cache.direct_mv[1][2] ) ^ M32( h->mb.cache.mv[1][x264_scan8[8]] );
            changed |= h->mb.cache.direct_ref[0][1] ^ h->mb.cache.ref[0][x264_scan8[4]];
            changed |= h->mb.cache.direct_ref[1][1] ^ h->mb.cache.ref[1][x264_scan8[4]];
            changed |= h->mb.cache.direct_ref[0][2] ^ h->mb.cache.ref[0][x264_scan8[8]];
            changed |= h->mb.cache.direct_ref[1][2] ^ h->mb.cache.ref[1][x264_scan8[8]];
        }
        *b_changed = changed;
        if( !changed )
            return b_available;
    }

    /* cache ref & mv */
    if( b_available )
        for( int l = 0; l < 2; l++ )
        {
            CP32( h->mb.cache.direct_mv[l][0], h->mb.cache.mv[l][x264_scan8[ 0]] );
            CP32( h->mb.cache.direct_mv[l][1], h->mb.cache.mv[l][x264_scan8[ 4]] );
            CP32( h->mb.cache.direct_mv[l][2], h->mb.cache.mv[l][x264_scan8[ 8]] );
            CP32( h->mb.cache.direct_mv[l][3], h->mb.cache.mv[l][x264_scan8[12]] );
            h->mb.cache.direct_ref[l][0] = h->mb.cache.ref[l][x264_scan8[ 0]];
            h->mb.cache.direct_ref[l][1] = h->mb.cache.ref[l][x264_scan8[ 4]];
            h->mb.cache.direct_ref[l][2] = h->mb.cache.ref[l][x264_scan8[ 8]];
            h->mb.cache.direct_ref[l][3] = h->mb.cache.ref[l][x264_scan8[12]];
            h->mb.cache.direct_partition = h->mb.i_partition;
        }

    return b_available;
}

/* This just improves encoder performance; it's not part of the spec. */
void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int16_t mvc[9][2], int *i_mvc )
{
    int16_t (*mvr)[2] = h->mb.mvr[i_list][i_ref];
    int i = 0;

#define SET_MVP(mvp) \
    { \
        CP32( mvc[i], mvp ); \
        i++; \
    }

#define SET_IMVP(xy) \
    if( xy >= 0 ) \
    { \
        int shift = 1 + MB_INTERLACED - h->mb.field[xy]; \
        int16_t *mvp = h->mb.mvr[i_list][i_ref<<1>>shift][xy]; \
        mvc[i][0] = mvp[0]; \
        mvc[i][1] = mvp[1]<<1>>shift; \
        i++; \
    }

    /* b_direct */
    if( h->sh.i_type == SLICE_TYPE_B
        && h->mb.cache.ref[i_list][x264_scan8[12]] == i_ref )
    {
        SET_MVP( h->mb.cache.mv[i_list][x264_scan8[12]] );
    }

    if( i_ref == 0 && h->frames.b_have_lowres )
    {
        int idx = i_list ? h->fref[1][0]->i_frame-h->fenc->i_frame-1
                         : h->fenc->i_frame-h->fref[0][0]->i_frame-1;
        if( idx <= h->param.i_bframe )
        {
            int16_t (*lowres_mv)[2] = h->fenc->lowres_mvs[i_list][idx];
            if( lowres_mv[0][0] != 0x7fff )
            {
                M32( mvc[i] ) = (M32( lowres_mv[h->mb.i_mb_xy] )*2)&0xfffeffff;
                i++;
            }
        }
    }

    /* spatial predictors */
    if( SLICE_MBAFF )
    {
        SET_IMVP( h->mb.i_mb_left_xy[0] );
        SET_IMVP( h->mb.i_mb_top_xy );
        SET_IMVP( h->mb.i_mb_topleft_xy );
        SET_IMVP( h->mb.i_mb_topright_xy );
    }
    else
    {
        SET_MVP( mvr[h->mb.i_mb_left_xy[0]] );
        SET_MVP( mvr[h->mb.i_mb_top_xy] );
        SET_MVP( mvr[h->mb.i_mb_topleft_xy] );
        SET_MVP( mvr[h->mb.i_mb_topright_xy] );
    }
#undef SET_IMVP
#undef SET_MVP

    /* temporal predictors */
    if( h->fref[0][0]->i_ref[0] > 0 )
    {
        x264_frame_t *l0 = h->fref[0][0];
        int field = h->mb.i_mb_y&1;
        int curpoc = h->fdec->i_poc + h->fdec->i_delta_poc[field];
        int refpoc = h->fref[i_list][i_ref>>SLICE_MBAFF]->i_poc;
        refpoc += l0->i_delta_poc[field^(i_ref&1)];

#define SET_TMVP( dx, dy ) \
        { \
            int mb_index = h->mb.i_mb_xy + dx + dy*h->mb.i_mb_stride; \
            int scale = (curpoc - refpoc) * l0->inv_ref_poc[MB_INTERLACED&field]; \
            mvc[i][0] = (l0->mv16x16[mb_index][0]*scale + 128) >> 8; \
            mvc[i][1] = (l0->mv16x16[mb_index][1]*scale + 128) >> 8; \
            i++; \
        }

        SET_TMVP(0,0);
        if( h->mb.i_mb_x < h->mb.i_mb_width-1 )
            SET_TMVP(1,0);
        if( h->mb.i_mb_y < h->mb.i_mb_height-1 )
            SET_TMVP(0,1);
#undef SET_TMVP
    }

    *i_mvc = i;
}
x264-snapshot-20120103-2245-stable/common/mc.h
/*****************************************************************************
 * mc.h: motion compensation
 *****************************************************************************
 * Copyright (C) 2004-2011 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#ifndef X264_MC_H
#define X264_MC_H

struct x264_weight_t;
typedef void (* weight_fn_t)( pixel *, int, pixel *, int, const struct x264_weight_t *, int );
typedef struct x264_weight_t
{
    /* aligning the first member is a gcc hack to force the struct to be
     * 16 byte aligned, as well as force sizeof(struct) to be a multiple of 16 */
    ALIGNED_16( int16_t cachea[8] );
    int16_t cacheb[8];
    int32_t i_denom;
    int32_t i_scale;
    int32_t i_offset;
    weight_fn_t *weightfn;
} ALIGNED_16( x264_weight_t );

extern const x264_weight_t x264_weight_none[3];

#define SET_WEIGHT( w, b, s, d, o )\
{\
    (w).i_scale = (s);\
    (w).i_denom = (d);\
    (w).i_offset = (o);\
    if( b )\
        h->mc.weight_cache( h, &w );\
    else\
        w.weightfn = NULL;\
}

/* Do the MC
 * XXX: Only width = 4, 8 or 16 are valid
 * width == 4  -> height == 4 or 8
 * width == 8  -> height == 4, 8 or 16
 * width == 16 -> height == 8 or 16
 */

typedef struct
{
    void (*mc_luma)( pixel *dst, int i_dst, pixel **src, int i_src,
                     int mvx, int mvy, int i_width, int i_height, const x264_weight_t *weight );

    /* may round up the dimensions if they're not a power of 2 */
    pixel* (*get_ref)( pixel *dst, int *i_dst, pixel **src, int i_src,
                       int mvx, int mvy, int i_width, int i_height, const x264_weight_t *weight );

    /* mc_chroma may write up to 2 bytes of garbage to the right of dst,
     * so it must be run from left to right. */
    void (*mc_chroma)( pixel *dstu, pixel *dstv, int i_dst, pixel *src, int i_src,
                       int mvx, int mvy, int i_width, int i_height );

    void (*avg[12])( pixel *dst, int, pixel *src1, int, pixel *src2, int, int i_weight );

    /* only 16x16, 8x8, and 4x4 defined */
    void (*copy[7])( pixel *dst, int, pixel *src, int, int i_height );
    void (*copy_16x16_unaligned)( pixel *dst, int, pixel *src, int, int i_height );

    void (*store_interleave_chroma)( pixel *dst, int i_dst, pixel *srcu, pixel *srcv, int height );
    void (*load_deinterleave_chroma_fenc)( pixel *dst, pixel *src, int i_src, int height );
    void (*load_deinterleave_chroma_fdec)( pixel *dst, pixel *src, int i_src, int height );

    void (*plane_copy)( pixel *dst, int i_dst,
                        pixel *src, int i_src, int w, int h );
    void (*plane_copy_interleave)( pixel *dst, int i_dst,
                                   pixel *srcu, int i_srcu,
                                   pixel *srcv, int i_srcv, int w, int h );
    /* may write up to 15 pixels off the end of each plane */
    void (*plane_copy_deinterleave)( pixel *dstu, int i_dstu,
                                     pixel *dstv, int i_dstv,
                                     pixel *src, int i_src, int w, int h );
    void (*plane_copy_deinterleave_rgb)( pixel *dsta, int i_dsta,
                                         pixel *dstb, int i_dstb,
                                         pixel *dstc, int i_dstc,
                                         pixel *src, int i_src, int pw, int w, int h );
    void (*hpel_filter)( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src,
                         int i_stride, int i_width, int i_height, int16_t *buf );

    /* prefetch the next few macroblocks of fenc or fdec */
    void (*prefetch_fenc)( pixel *pix_y, int stride_y,
                           pixel *pix_uv, int stride_uv, int mb_x );
    void (*prefetch_fenc_420)( pixel *pix_y, int stride_y,
                               pixel *pix_uv, int stride_uv, int mb_x );
    void (*prefetch_fenc_422)( pixel *pix_y, int stride_y,
                               pixel *pix_uv, int stride_uv, int mb_x );
    /* prefetch the next few macroblocks of a hpel reference frame */
    void (*prefetch_ref)( pixel *pix, int stride, int parity );

    void *(*memcpy_aligned)( void *dst, const void *src, size_t n );
    void (*memzero_aligned)( void *dst, int n );

    /* successive elimination prefilter */
    void (*integral_init4h)( uint16_t *sum, pixel *pix, int stride );
    void (*integral_init8h)( uint16_t *sum, pixel *pix, int stride );
    void (*integral_init4v)( uint16_t *sum8, uint16_t *sum4, int stride );
    void (*integral_init8v)( uint16_t *sum8, int stride );

    void (*frame_init_lowres_core)( pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv, pixel *dstc,
                                    int src_stride, int dst_stride, int width, int height );
    weight_fn_t *weight;
    weight_fn_t *offsetadd;
    weight_fn_t *offsetsub;
    void (*weight_cache)( x264_t *, x264_weight_t * );

    void (*mbtree_propagate_cost)( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
                                   uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
} x264_mc_functions_t;

void x264_mc_init( int cpu, x264_mc_functions_t *pf );

#endif
x264-snapshot-20120103-2245-stable/common/mc.c
/*****************************************************************************
 * mc.c: motion compensation
 *****************************************************************************
 * Copyright (C) 2003-2011 x264 project
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "common.h"

#if HAVE_MMX
#include "x86/mc.h"
#endif
#if ARCH_PPC
#include "ppc/mc.h"
#endif
#if ARCH_ARM
#include "arm/mc.h"
#endif


static inline void pixel_avg( pixel *dst,  int i_dst_stride,
                              pixel *src1, int i_src1_stride,
                              pixel *src2, int i_src2_stride,
                              int i_width, int i_height )
{
    for( int y = 0; y < i_height; y++ )
    {
        for( int x = 0; x < i_width; x++ )
            dst[x] = ( src1[x] + src2[x] + 1 ) >> 1;
        dst  += i_dst_stride;
        src1 += i_src1_stride;
        src2 += i_src2_stride;
    }
}

static inline void pixel_avg_wxh( pixel *dst, int i_dst, pixel *src1, int i_src1, pixel *src2, int i_src2, int width, int height )
{
    for( int y = 0; y < height; y++ )
    {
        for( int x = 0; x < width; x++ )
            dst[x] = ( src1[x] + src2[x] + 1 ) >> 1;
        src1 += i_src1;
        src2 += i_src2;
        dst += i_dst;
    }
}

/* Implicit weighted bipred only:
 * assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64 */
static inline void pixel_avg_weight_wxh( pixel *dst, int i_dst, pixel *src1, int i_src1, pixel *src2, int i_src2, int width, int height, int i_weight1 )
{
    const int i_weight2 = 64 - i_weight1;
    for( int y = 0; y<height; y++, dst += i_dst, src1 += i_src1, src2 += i_src2 )
        for( int x = 0; x<width; x++ )
            dst[x] = x264_clip_pixel( (src1[x]*i_weight1 + src2[x]*i_weight2 + (1<<5)) >> 6 );
}

#define PIXEL_AVG_C( name, width, height ) \
static void name( pixel *pix1, int i_stride_pix1, \
                  pixel *pix2, int i_stride_pix2, \
                  pixel *pix3, int i_stride_pix3, int weight ) \
{ \
    if( weight == 32 ) \
        pixel_avg_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, pix3, i_stride_pix3, width, height ); \
    else \
        pixel_avg_weight_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, pix3, i_stride_pix3, width, height, weight ); \
}
PIXEL_AVG_C( pixel_avg_16x16, 16, 16 )
PIXEL_AVG_C( pixel_avg_16x8,  16, 8 )
PIXEL_AVG_C( pixel_avg_8x16,  8, 16 )
PIXEL_AVG_C( pixel_avg_8x8,   8, 8 )
PIXEL_AVG_C( pixel_avg_8x4,   8, 4 )
PIXEL_AVG_C( pixel_avg_4x16,  4, 16 )
PIXEL_AVG_C( pixel_avg_4x8,   4, 8 )
PIXEL_AVG_C( pixel_avg_4x4,   4, 4 )
PIXEL_AVG_C( pixel_avg_4x2,   4, 2 )
PIXEL_AVG_C( pixel_avg_2x8,   2, 8 )
PIXEL_AVG_C( pixel_avg_2x4,   2, 4 )
PIXEL_AVG_C( pixel_avg_2x2,   2, 2 )

static void x264_weight_cache( x264_t *h, x264_weight_t *w )
{
    w->weightfn = h->mc.weight;
}
#define opscale(x) dst[x] = x264_clip_pixel( ((src[x] * scale + (1<<(denom - 1))) >> denom) + offset )
#define opscale_noden(x) dst[x] = x264_clip_pixel( src[x] * scale + offset )
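/* Explicit weighted prediction: dst = clip( round( src*scale / 2^denom ) + offset ).
 * When denom == 0 the rounding term (1<<(denom-1)) would be ill-defined, hence
 * the separate no-denominator path below. */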
static void mc_weight( pixel *dst, int i_dst_stride, pixel *src, int i_src_stride, const x264_weight_t *weight, int i_width, int i_height )
{
    int offset = weight->i_offset << (BIT_DEPTH-8);
    int scale = weight->i_scale;
    int denom = weight->i_denom;
    if( denom >= 1 )
    {
        for( int y = 0; y < i_height; y++, dst += i_dst_stride, src += i_src_stride )
            for( int x = 0; x < i_width; x++ )
                opscale( x );
    }
    else
    {
        for( int y = 0; y < i_height; y++, dst += i_dst_stride, src += i_src_stride )
            for( int x = 0; x < i_width; x++ )
                opscale_noden( x );
    }
}

#define MC_WEIGHT_C( name, width ) \
    static void name( pixel *dst, int i_dst_stride, pixel *src, int i_src_stride, const x264_weight_t *weight, int height ) \
{ \
    mc_weight( dst, i_dst_stride, src, i_src_stride, weight, width, height );\
}

MC_WEIGHT_C( mc_weight_w20, 20 )
MC_WEIGHT_C( mc_weight_w16, 16 )
MC_WEIGHT_C( mc_weight_w12, 12 )
MC_WEIGHT_C( mc_weight_w8,   8 )
MC_WEIGHT_C( mc_weight_w4,   4 )
MC_WEIGHT_C( mc_weight_w2,   2 )

static weight_fn_t x264_mc_weight_wtab[6] =
{
    mc_weight_w2,
    mc_weight_w4,
    mc_weight_w8,
    mc_weight_w12,
    mc_weight_w16,
    mc_weight_w20,
};
const x264_weight_t x264_weight_none[3] = { {{0}} };
static void mc_copy( pixel *src, int i_src_stride, pixel *dst, int i_dst_stride, int i_width, int i_height )
{
    for( int y = 0; y < i_height; y++ )
    {
        memcpy( dst, src, i_width * sizeof(pixel) );

        src += i_src_stride;
        dst += i_dst_stride;
    }
}

#define TAPFILTER(pix, d) ((pix)[x-2*d] + (pix)[x+3*d] - 5*((pix)[x-d] + (pix)[x+2*d]) + 20*((pix)[x] + (pix)[x+d]))
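/* TAPFILTER evaluates the H.264 6-tap kernel (1,-5,20,20,-5,1) along one axis
 * (d = stride for vertical, d = 1 for horizontal); the centre 'c' plane is
 * produced by running the same kernel over the 16-bit vertical intermediates
 * stored in buf. */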
static void hpel_filter( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src,
                         int stride, int width, int height, int16_t *buf )
{
    const int pad = (BIT_DEPTH > 9) ? (-10 * PIXEL_MAX) : 0;
    for( int y = 0; y < height; y++ )
    {
        for( int x = -2; x < width+3; x++ )
        {
            int v = TAPFILTER(src,stride);
            dstv[x] = x264_clip_pixel( (v + 16) >> 5 );
            /* transform v for storage in a 16-bit integer */
            buf[x+2] = v + pad;
        }
        for( int x = 0; x < width; x++ )
            dstc[x] = x264_clip_pixel( (TAPFILTER(buf+2,1) - 32*pad + 512) >> 10 );
        for( int x = 0; x < width; x++ )
            dsth[x] = x264_clip_pixel( (TAPFILTER(src,1) + 16) >> 5 );
        dsth += stride;
        dstv += stride;
        dstc += stride;
        src += stride;
    }
}

static const uint8_t hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
static const uint8_t hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
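
/* For each of the 16 quarter-pel positions ((mvy&3)<<2 | (mvx&3)), these
 * tables select the two half-pel planes (0 = full, 1 = horizontal,
 * 2 = vertical, 3 = centre) whose average yields the quarter-pel sample. */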

static void mc_luma( pixel *dst,    int i_dst_stride,
                     pixel *src[4], int i_src_stride,
                     int mvx, int mvy,
                     int i_width, int i_height, const x264_weight_t *weight )
{
    int qpel_idx = ((mvy&3)<<2) + (mvx&3);
    int offset = (mvy>>2)*i_src_stride + (mvx>>2);
    pixel *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;

    if( qpel_idx & 5 ) /* qpel interpolation needed */
    {
        pixel *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
        pixel_avg( dst, i_dst_stride, src1, i_src_stride,
                   src2, i_src_stride, i_width, i_height );
        if( weight->weightfn )
            mc_weight( dst, i_dst_stride, dst, i_dst_stride, weight, i_width, i_height );
    }
    else if( weight->weightfn )
        mc_weight( dst, i_dst_stride, src1, i_src_stride, weight, i_width, i_height );
    else
        mc_copy( src1, i_src_stride, dst, i_dst_stride, i_width, i_height );
}

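/* Same interpolation as mc_luma, but when no averaging or weighting is needed
 * it returns a pointer directly into the reference plane (updating the
 * caller's stride) instead of copying into dst. */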
static pixel *get_ref( pixel *dst,   int *i_dst_stride,
                       pixel *src[4], int i_src_stride,
                       int mvx, int mvy,
                       int i_width, int i_height, const x264_weight_t *weight )
{
    int qpel_idx = ((mvy&3)<<2) + (mvx&3);
    int offset = (mvy>>2)*i_src_stride + (mvx>>2);
    pixel *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;

    if( qpel_idx & 5 ) /* qpel interpolation needed */
    {
        pixel *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
        pixel_avg( dst, *i_dst_stride, src1, i_src_stride,
                   src2, i_src_stride, i_width, i_height );
        if( weight->weightfn )
            mc_weight( dst, *i_dst_stride, dst, *i_dst_stride, weight, i_width, i_height );
        return dst;
    }
    else if( weight->weightfn )
    {
        mc_weight( dst, *i_dst_stride, src1, i_src_stride, weight, i_width, i_height );
        return dst;
    }
    else
    {
        *i_dst_stride = i_src_stride;
        return src1;
    }
}

/* Full chroma MC (i.e. down to 1/8-pel precision). */
static void mc_chroma( pixel *dstu, pixel *dstv, int i_dst_stride,
                       pixel *src, int i_src_stride,
                       int mvx, int mvy,
                       int i_width, int i_height )
{
    pixel *srcp;

    int d8x = mvx&0x07;
    int d8y = mvy&0x07;
    int cA = (8-d8x)*(8-d8y);
    int cB = d8x    *(8-d8y);
    int cC = (8-d8x)*d8y;
    int cD = d8x    *d8y;
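
    /* cA..cD are the bilinear weights of the four surrounding chroma samples;
     * they always sum to 64, so the "+32 >> 6" below rounds to nearest. The
     * source is interleaved CbCr, hence the 2*x indexing. */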

    src += (mvy >> 3) * i_src_stride + (mvx >> 3)*2;
    srcp = &src[i_src_stride];

    for( int y = 0; y < i_height; y++ )
    {
        for( int x = 0; x < i_width; x++ )
        {
            dstu[x] = ( cA*src[2*x]  + cB*src[2*x+2] +
                        cC*srcp[2*x] + cD*srcp[2*x+2] + 32 ) >> 6;
            dstv[x] = ( cA*src[2*x+1]  + cB*src[2*x+3] +
                        cC*srcp[2*x+1] + cD*srcp[2*x+3] + 32 ) >> 6;
        }
        dstu += i_dst_stride;
        dstv += i_dst_stride;
        src   = srcp;
        srcp += i_src_stride;
    }
}

#define MC_COPY(W) \
static void mc_copy_w##W( pixel *dst, int i_dst, pixel *src, int i_src, int i_height ) \
{ \
    mc_copy( src, i_src, dst, i_dst, W, i_height ); \
}
MC_COPY( 16 )
MC_COPY( 8 )
MC_COPY( 4 )

void x264_plane_copy_c( pixel *dst, int i_dst,
                        pixel *src, int i_src, int w, int h )
{
    while( h-- )
    {
        memcpy( dst, src, w * sizeof(pixel) );
        dst += i_dst;
        src += i_src;
    }
}

void x264_plane_copy_interleave_c( pixel *dst, int i_dst,
                                   pixel *srcu, int i_srcu,
                                   pixel *srcv, int i_srcv, int w, int h )
{
    for( int y=0; y<h; y++, dst+=i_dst, srcu+=i_srcu, srcv+=i_srcv )
        for( int x=0; x<w; x++ )
        {
            dst[2*x]   = srcu[x];
            dst[2*x+1] = srcv[x];
        }
}

static void x264_plane_copy_deinterleave_c( pixel *dstu, int i_dstu,
                                            pixel *dstv, int i_dstv,
                                            pixel *src, int i_src, int w, int h )
{
    for( int y=0; y<h; y++, dstu+=i_dstu, dstv+=i_dstv, src+=i_src )
        for( int x=0; x<w; x++ )
        {
            dstu[x] = src[2*x];
            dstv[x] = src[2*x+1];
        }
}

static void x264_plane_copy_deinterleave_rgb_c( pixel *dsta, int i_dsta,
                                                pixel *dstb, int i_dstb,
                                                pixel *dstc, int i_dstc,
                                                pixel *src, int i_src, int pw, int w, int h )
{
    for( int y=0; y<h; y++, dsta+=i_dsta, dstb+=i_dstb, dstc+=i_dstc, src+=i_src )
    {
        for( int x=0; x<w; x++ )
        {
            dsta[x] = src[x*pw];
            dstb[x] = src[x*pw+1];
            dstc[x] = src[x*pw+2];
        }
    }
}

static void store_interleave_chroma( pixel *dst, int i_dst, pixel *srcu, pixel *srcv, int height )
{
    for( int y=0; y<height; y++, dst+=i_dst, srcu+=FDEC_STRIDE, srcv+=FDEC_STRIDE )
        for( int x=0; x<8; x++ )
        {
            dst[2*x]   = srcu[x];
            dst[2*x+1] = srcv[x];
        }
}

static void load_deinterleave_chroma_fenc( pixel *dst, pixel *src, int i_src, int height )
{
    x264_plane_copy_deinterleave_c( dst, FENC_STRIDE, dst+FENC_STRIDE/2, FENC_STRIDE, src, i_src, 8, height );
}

static void load_deinterleave_chroma_fdec( pixel *dst, pixel *src, int i_src, int height )
{
    x264_plane_copy_deinterleave_c( dst, FDEC_STRIDE, dst+FDEC_STRIDE/2, FDEC_STRIDE, src, i_src, 8, height );
}

static void prefetch_fenc_null( pixel *pix_y, int stride_y,
                                pixel *pix_uv, int stride_uv, int mb_x )
{}

static void prefetch_ref_null( pixel *pix, int stride, int parity )
{}

static void memzero_aligned( void *dst, int n )
{
    memset( dst, 0, n );
}

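/* Integral-image style column sums of 4/8-wide horizontal windows, built row
 * by row on top of the previous row; these feed the ADS successive
 * elimination prefilter. */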
static void integral_init4h( uint16_t *sum, pixel *pix, int stride )
{
    int v = pix[0]+pix[1]+pix[2]+pix[3];
    for( int x = 0; x < stride-4; x++ )
    {
        sum[x] = v + sum[x-stride];
        v += pix[x+4] - pix[x];
    }
}

static void integral_init8h( uint16_t *sum, pixel *pix, int stride )
{
    int v = pix[0]+pix[1]+pix[2]+pix[3]+pix[4]+pix[5]+pix[6]+pix[7];
    for( int x = 0; x < stride-8; x++ )
    {
        sum[x] = v + sum[x-stride];
        v += pix[x+8] - pix[x];
    }
}

static void integral_init4v( uint16_t *sum8, uint16_t *sum4, int stride )
{
    for( int x = 0; x < stride-8; x++ )
        sum4[x] = sum8[x+4*stride] - sum8[x];
    for( int x = 0; x < stride-8; x++ )
        sum8[x] = sum8[x+8*stride] + sum8[x+8*stride+4] - sum8[x] - sum8[x+4];
}

static void integral_init8v( uint16_t *sum8, int stride )
{
    for( int x = 0; x < stride-8; x++ )
        sum8[x] = sum8[x+8*stride] - sum8[x];
}

void x264_frame_init_lowres( x264_t *h, x264_frame_t *frame )
{
    pixel *src = frame->plane[0];
    int i_stride = frame->i_stride[0];
    int i_height = frame->i_lines[0];
    int i_width  = frame->i_width[0];

    // duplicate last row and column so that their interpolation doesn't have to be special-cased
    for( int y = 0; y < i_height; y++ )
        src[i_width+y*i_stride] = src[i_width-1+y*i_stride];
    memcpy( src+i_stride*i_height, src+i_stride*(i_height-1), (i_width+1) * sizeof(pixel) );
    h->mc.frame_init_lowres_core( src, frame->lowres[0], frame->lowres[1], frame->lowres[2], frame->lowres[3],
                                  i_stride, frame->i_stride_lowres, frame->i_width_lowres, frame->i_lines_lowres );
    x264_frame_expand_border_lowres( frame );

    memset( frame->i_cost_est, -1, sizeof(frame->i_cost_est) );

    for( int y = 0; y < h->param.i_bframe + 2; y++ )
        for( int x = 0; x < h->param.i_bframe + 2; x++ )
            frame->i_row_satds[y][x][0] = -1;

    for( int y = 0; y <= !!h->param.i_bframe; y++ )
        for( int x = 0; x <= h->param.i_bframe; x++ )
            frame->lowres_mvs[y][x][0][0] = 0x7FFF;
}

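/* Half-resolution downsample producing four planes at the (0,0), (+1/2,0),
 * (0,+1/2) and (+1/2,+1/2) phases, so the lookahead's motion search gets
 * free half-pel accuracy on the lowres frame. */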
static void frame_init_lowres_core( pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv, pixel *dstc,
                                    int src_stride, int dst_stride, int width, int height )
{
    for( int y = 0; y < height; y++ )
    {
        pixel *src1 = src0+src_stride;
        pixel *src2 = src1+src_stride;
        for( int x = 0; x<width; x++ )
        {
            // slower than naive bilinear, but matches asm
#define FILTER(a,b,c,d) ((((a+b+1)>>1)+((c+d+1)>>1)+1)>>1)
            dst0[x] = FILTER(src0[2*x  ], src1[2*x  ], src0[2*x+1], src1[2*x+1]);
            dsth[x] = FILTER(src0[2*x+1], src1[2*x+1], src0[2*x+2], src1[2*x+2]);
            dstv[x] = FILTER(src1[2*x  ], src2[2*x  ], src1[2*x+1], src2[2*x+1]);
            dstc[x] = FILTER(src1[2*x+1], src2[2*x+1], src1[2*x+2], src2[2*x+2]);
#undef FILTER
        }
        src0 += src_stride*2;
        dst0 += dst_stride;
        dsth += dst_stride;
        dstv += dst_stride;
        dstc += dst_stride;
    }
}

/* Estimate the total amount of influence on future quality that could be had if we
 * were to improve the reference samples used to inter predict any given macroblock. */
static void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
                                   uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len )
{
    float fps = *fps_factor / 256.f;
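    /* Each block passes on (incoming propagation + weighted intra cost),
     * scaled by the fraction of its cost that inter prediction failed to
     * remove: amount * (intra - inter) / intra. */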
    for( int i = 0; i < len; i++ )
    {
        float intra_cost       = intra_costs[i] * inv_qscales[i];
        float propagate_amount = propagate_in[i] + intra_cost*fps;
        float propagate_num    = intra_costs[i] - (inter_costs[i] & LOWRES_COST_MASK);
        float propagate_denom  = intra_costs[i];
        dst[i] = (int)(propagate_amount * propagate_num / propagate_denom + 0.5f);
    }
}
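
/* In effect: dst = (propagate_in + intra*fps) * (intra - inter) / intra.
 * The (intra - inter) / intra factor approximates the fraction of this
 * block's information that comes from its references (inter prediction being
 * that much cheaper than intra), so that share of the accumulated future
 * cost is what gets propagated back to them. */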

void x264_mc_init( int cpu, x264_mc_functions_t *pf )
{
    pf->mc_luma   = mc_luma;
    pf->get_ref   = get_ref;

    pf->mc_chroma = mc_chroma;

    pf->avg[PIXEL_16x16]= pixel_avg_16x16;
    pf->avg[PIXEL_16x8] = pixel_avg_16x8;
    pf->avg[PIXEL_8x16] = pixel_avg_8x16;
    pf->avg[PIXEL_8x8]  = pixel_avg_8x8;
    pf->avg[PIXEL_8x4]  = pixel_avg_8x4;
    pf->avg[PIXEL_4x16] = pixel_avg_4x16;
    pf->avg[PIXEL_4x8]  = pixel_avg_4x8;
    pf->avg[PIXEL_4x4]  = pixel_avg_4x4;
    pf->avg[PIXEL_4x2]  = pixel_avg_4x2;
    pf->avg[PIXEL_2x8]  = pixel_avg_2x8;
    pf->avg[PIXEL_2x4]  = pixel_avg_2x4;
    pf->avg[PIXEL_2x2]  = pixel_avg_2x2;

    pf->weight    = x264_mc_weight_wtab;
    pf->offsetadd = x264_mc_weight_wtab;
    pf->offsetsub = x264_mc_weight_wtab;
    pf->weight_cache = x264_weight_cache;

    pf->copy_16x16_unaligned = mc_copy_w16;
    pf->copy[PIXEL_16x16] = mc_copy_w16;
    pf->copy[PIXEL_8x8]   = mc_copy_w8;
    pf->copy[PIXEL_4x4]   = mc_copy_w4;

    pf->store_interleave_chroma       = store_interleave_chroma;
    pf->load_deinterleave_chroma_fenc = load_deinterleave_chroma_fenc;
    pf->load_deinterleave_chroma_fdec = load_deinterleave_chroma_fdec;

    pf->plane_copy = x264_plane_copy_c;
    pf->plane_copy_interleave = x264_plane_copy_interleave_c;
    pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_c;
    pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_c;

    pf->hpel_filter = hpel_filter;

    pf->prefetch_fenc_420 = prefetch_fenc_null;
    pf->prefetch_fenc_422 = prefetch_fenc_null;
    pf->prefetch_ref  = prefetch_ref_null;
    pf->memcpy_aligned = memcpy;
    pf->memzero_aligned = memzero_aligned;
    pf->frame_init_lowres_core = frame_init_lowres_core;

    pf->integral_init4h = integral_init4h;
    pf->integral_init8h = integral_init8h;
    pf->integral_init4v = integral_init4v;
    pf->integral_init8v = integral_init8v;

    pf->mbtree_propagate_cost = mbtree_propagate_cost;
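
    /* Everything above is the plain C baseline; the per-arch init calls
     * below overwrite individual entries with SIMD versions according to
     * the cpu flags, so each entry always has at least a C fallback. */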

#if HAVE_MMX
    x264_mc_init_mmx( cpu, pf );
#endif
#if HAVE_ALTIVEC
    if( cpu&X264_CPU_ALTIVEC )
        x264_mc_altivec_init( pf );
#endif
#if HAVE_ARMV6
    x264_mc_init_arm( cpu, pf );
#endif
}

void x264_frame_filter( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
{
    const int b_interlaced = PARAM_INTERLACED;
    int start = mb_y*16 - 8; // buffer = 4 for deblock + 3 for 6tap, rounded to 8
    int height = (b_end ? frame->i_lines[0] + 16*PARAM_INTERLACED : (mb_y+b_interlaced)*16) + 8;

    if( mb_y & b_interlaced )
        return;

    for( int p = 0; p < (CHROMA444 ? 3 : 1); p++ )
    {
        int stride = frame->i_stride[p];
        const int width = frame->i_width[p];
        int offs = start*stride - 8; // buffer = 3 for 6tap, aligned to 8 for simd

        if( !b_interlaced || h->mb.b_adaptive_mbaff )
            h->mc.hpel_filter(
                frame->filtered[p][1] + offs,
                frame->filtered[p][2] + offs,
                frame->filtered[p][3] + offs,
                frame->plane[p] + offs,
                stride, width + 16, height - start,
                h->scratch_buffer );

        if( b_interlaced )
        {
            /* MC must happen between pixels in the same field. */
            stride = frame->i_stride[p] << 1;
            start = (mb_y*16 >> 1) - 8;
            int height_fld = ((b_end ? frame->i_lines[p] : mb_y*16) >> 1) + 8;
            offs = start*stride - 8;
            for( int i = 0; i < 2; i++, offs += frame->i_stride[p] )
            {
                h->mc.hpel_filter(
                    frame->filtered_fld[p][1] + offs,
                    frame->filtered_fld[p][2] + offs,
                    frame->filtered_fld[p][3] + offs,
                    frame->plane_fld[p] + offs,
                    stride, width + 16, height_fld - start,
                    h->scratch_buffer );
            }
        }
    }

    /* generate integral image:
     * frame->integral contains 2 planes. in the upper plane, each element is
     * the sum of an 8x8 pixel region with top-left corner on that point.
     * in the lower plane, 4x4 sums (needed only with --partitions p4x4). */
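
    /* e.g. one table read, integral[y*stride+x], gives the pixel sum of the
     * 8x8 block whose top-left corner is (x,y); exhaustive search (ESA/TESA)
     * uses |sum(candidate) - sum(fenc)| as a cheap lower bound on SAD to
     * reject candidates early. */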

    if( frame->integral )
    {
        int stride = frame->i_stride[0];
        if( start < 0 )
        {
            memset( frame->integral - PADV * stride - PADH, 0, stride * sizeof(uint16_t) );
            start = -PADV;
        }
        if( b_end )
            height += PADV-9;
        for( int y = start; y < height; y++ )
        {
            pixel    *pix  = frame->plane[0] + y * stride - PADH;
            uint16_t *sum8 = frame->integral + (y+1) * stride - PADH;
            uint16_t *sum4;
            if( h->frames.b_have_sub8x8_esa )
            {
                h->mc.integral_init4h( sum8, pix, stride );
                sum8 -= 8*stride;
                sum4 = sum8 + stride * (frame->i_lines[0] + PADV*2);
                if( y >= 8-PADV )
                    h->mc.integral_init4v( sum8, sum4, stride );
            }
            else
            {
                h->mc.integral_init8h( sum8, pix, stride );
                if( y >= 8-PADV )
                    h->mc.integral_init8v( sum8-8*stride, stride );
            }
        }
    }
}
x264-snapshot-20120103-2245-stable/common/macroblock.h
/*****************************************************************************
 * macroblock.h: macroblock common functions
 *****************************************************************************
 * Copyright (C) 2005-2011 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *          Laurent Aimar <fenrir@via.ecp.fr>
 *          Jason Garrett-Glaser <darkshikari@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#ifndef X264_MACROBLOCK_H
#define X264_MACROBLOCK_H

enum macroblock_position_e
{
    MB_LEFT     = 0x01,
    MB_TOP      = 0x02,
    MB_TOPRIGHT = 0x04,
    MB_TOPLEFT  = 0x08,

    MB_PRIVATE  = 0x10,

    ALL_NEIGHBORS = 0xf,
};

static const uint8_t x264_pred_i4x4_neighbors[12] =
{
    MB_TOP,                         // I_PRED_4x4_V
    MB_LEFT,                        // I_PRED_4x4_H
    MB_LEFT | MB_TOP,               // I_PRED_4x4_DC
    MB_TOP  | MB_TOPRIGHT,          // I_PRED_4x4_DDL
    MB_LEFT | MB_TOPLEFT | MB_TOP,  // I_PRED_4x4_DDR
    MB_LEFT | MB_TOPLEFT | MB_TOP,  // I_PRED_4x4_VR
    MB_LEFT | MB_TOPLEFT | MB_TOP,  // I_PRED_4x4_HD
    MB_TOP  | MB_TOPRIGHT,          // I_PRED_4x4_VL
    MB_LEFT,                        // I_PRED_4x4_HU
    MB_LEFT,                        // I_PRED_4x4_DC_LEFT
    MB_TOP,                         // I_PRED_4x4_DC_TOP
    0                               // I_PRED_4x4_DC_128
};
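
/* Each entry lists the neighbours an intra 4x4 mode predicts from; a mode is
 * usable only if all of its listed neighbours are present in the block's
 * availability mask (h->mb.i_neighbour4[idx]). */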


/* XXX mb_type is not the value written to the bitstream -- internal use only */
#define IS_INTRA(type) ( (type) == I_4x4 || (type) == I_8x8 || (type) == I_16x16 || (type) == I_PCM )
#define IS_SKIP(type)  ( (type) == P_SKIP || (type) == B_SKIP )
#define IS_DIRECT(type)  ( (type) == B_DIRECT )
enum mb_class_e
{
    I_4x4           = 0,
    I_8x8           = 1,
    I_16x16         = 2,
    I_PCM           = 3,

    P_L0            = 4,
    P_8x8           = 5,
    P_SKIP          = 6,

    B_DIRECT        = 7,
    B_L0_L0         = 8,
    B_L0_L1         = 9,
    B_L0_BI         = 10,
    B_L1_L0         = 11,
    B_L1_L1         = 12,
    B_L1_BI         = 13,
    B_BI_L0         = 14,
    B_BI_L1         = 15,
    B_BI_BI         = 16,
    B_8x8           = 17,
    B_SKIP          = 18,

    X264_MBTYPE_MAX = 19
};
static const uint8_t x264_mb_type_fix[X264_MBTYPE_MAX] =
{
    I_4x4, I_4x4, I_16x16, I_PCM,
    P_L0, P_8x8, P_SKIP,
    B_DIRECT, B_L0_L0, B_L0_L1, B_L0_BI, B_L1_L0, B_L1_L1,
    B_L1_BI, B_BI_L0, B_BI_L1, B_BI_BI, B_8x8, B_SKIP
};
static const uint8_t x264_mb_type_list_table[X264_MBTYPE_MAX][2][2] =
{
    {{0,0},{0,0}}, {{0,0},{0,0}}, {{0,0},{0,0}}, {{0,0},{0,0}}, /* INTRA */
    {{1,1},{0,0}},                                              /* P_L0 */
    {{0,0},{0,0}},                                              /* P_8x8 */
    {{1,1},{0,0}},                                              /* P_SKIP */
    {{0,0},{0,0}},                                              /* B_DIRECT */
    {{1,1},{0,0}}, {{1,0},{0,1}}, {{1,1},{0,1}},                /* B_L0_* */
    {{0,1},{1,0}}, {{0,0},{1,1}}, {{0,1},{1,1}},                /* B_L1_* */
    {{1,1},{1,0}}, {{1,0},{1,1}}, {{1,1},{1,1}},                /* B_BI_* */
    {{0,0},{0,0}},                                              /* B_8x8 */
    {{0,0},{0,0}}                                               /* B_SKIP */
};
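
/* Indexed as [type][list][partition]: whether each of the two partitions of
 * a given mb type predicts from list0/list1. e.g. B_L0_BI is {{1,1},{0,1}}:
 * list0 feeds both partitions, list1 only the second. */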

#define IS_SUB4x4(type) ( (type ==D_L0_4x4)||(type ==D_L1_4x4)||(type ==D_BI_4x4))
#define IS_SUB4x8(type) ( (type ==D_L0_4x8)||(type ==D_L1_4x8)||(type ==D_BI_4x8))
#define IS_SUB8x4(type) ( (type ==D_L0_8x4)||(type ==D_L1_8x4)||(type ==D_BI_8x4))
#define IS_SUB8x8(type) ( (type ==D_L0_8x8)||(type ==D_L1_8x8)||(type ==D_BI_8x8)||(type ==D_DIRECT_8x8))
enum mb_partition_e
{
    /* sub partition type for P_8x8 and B_8x8 */
    D_L0_4x4          = 0,
    D_L0_8x4          = 1,
    D_L0_4x8          = 2,
    D_L0_8x8          = 3,

    /* sub partition type for B_8x8 only */
    D_L1_4x4          = 4,
    D_L1_8x4          = 5,
    D_L1_4x8          = 6,
    D_L1_8x8          = 7,

    D_BI_4x4          = 8,
    D_BI_8x4          = 9,
    D_BI_4x8          = 10,
    D_BI_8x8          = 11,
    D_DIRECT_8x8      = 12,

    /* partition */
    D_8x8             = 13,
    D_16x8            = 14,
    D_8x16            = 15,
    D_16x16           = 16,
    X264_PARTTYPE_MAX = 17,
};

static const uint8_t x264_mb_partition_listX_table[2][17] =
{{
    1, 1, 1, 1, /* D_L0_* */
    0, 0, 0, 0, /* D_L1_* */
    1, 1, 1, 1, /* D_BI_* */
    0,          /* D_DIRECT_8x8 */
    0, 0, 0, 0  /* 8x8 .. 16x16 */
},
{
    0, 0, 0, 0, /* D_L0_* */
    1, 1, 1, 1, /* D_L1_* */
    1, 1, 1, 1, /* D_BI_* */
    0,          /* D_DIRECT_8x8 */
    0, 0, 0, 0  /* 8x8 .. 16x16 */
}};
static const uint8_t x264_mb_partition_count_table[17] =
{
    /* sub L0 */
    4, 2, 2, 1,
    /* sub L1 */
    4, 2, 2, 1,
    /* sub BI */
    4, 2, 2, 1,
    /* Direct */
    1,
    /* Partition */
    4, 2, 2, 1
};
static const uint8_t x264_mb_partition_pixel_table[17] =
{
    PIXEL_4x4, PIXEL_8x4,  PIXEL_4x8,  PIXEL_8x8,   /* D_L0_* */
    PIXEL_4x4, PIXEL_8x4,  PIXEL_4x8,  PIXEL_8x8,   /* D_L1_* */
    PIXEL_4x4, PIXEL_8x4,  PIXEL_4x8,  PIXEL_8x8,   /* D_BI_* */
    PIXEL_8x8,                                      /* D_DIRECT_8x8 */
    PIXEL_8x8, PIXEL_16x8, PIXEL_8x16, PIXEL_16x16, /* 8x8 .. 16x16 */
};

/* zigzags are transposed with respect to the tables in the standard */
static const uint8_t x264_zigzag_scan4[2][16] =
{{ // frame
    0,  4,  1,  2,  5,  8, 12,  9,  6,  3,  7, 10, 13, 14, 11, 15
},
{  // field
    0,  1,  4,  2,  3,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
}};
static const uint8_t x264_zigzag_scan8[2][64] =
{{
    0,  8,  1,  2,  9, 16, 24, 17, 10,  3,  4, 11, 18, 25, 32, 40,
   33, 26, 19, 12,  5,  6, 13, 20, 27, 34, 41, 48, 56, 49, 42, 35,
   28, 21, 14,  7, 15, 22, 29, 36, 43, 50, 57, 58, 51, 44, 37, 30,
   23, 31, 38, 45, 52, 59, 60, 53, 46, 39, 47, 54, 61, 62, 55, 63
},
{
    0,  1,  2,  8,  9,  3,  4, 10, 16, 11,  5,  6,  7, 12, 17, 24,
   18, 13, 14, 15, 19, 25, 32, 26, 20, 21, 22, 23, 27, 33, 40, 34,
   28, 29, 30, 31, 35, 41, 48, 42, 36, 37, 38, 39, 43, 49, 50, 44,
   45, 46, 47, 51, 56, 57, 52, 53, 54, 55, 58, 59, 60, 61, 62, 63
}};
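
/* e.g. the spec's 4x4 frame scan begins 0, 1, 4, 8, ...; transposing each
 * position (swapping row and column) yields the 0, 4, 1, 2, ... order stored
 * here, matching the transposed coefficient layout x264 uses internally. */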

static const uint8_t block_idx_x[16] =
{
    0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3
};
static const uint8_t block_idx_y[16] =
{
    0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3
};
static const uint8_t block_idx_xy[4][4] =
{
    { 0, 2, 8,  10 },
    { 1, 3, 9,  11 },
    { 4, 6, 12, 14 },
    { 5, 7, 13, 15 }
};
static const uint8_t block_idx_xy_1d[16] =
{
    0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15
};
static const uint8_t block_idx_yx_1d[16] =
{
    0, 4, 1, 5, 8, 12, 9, 13, 2, 6, 3, 7, 10, 14, 11, 15
};
static const uint8_t block_idx_xy_fenc[16] =
{
    0*4 + 0*4*FENC_STRIDE, 1*4 + 0*4*FENC_STRIDE,
    0*4 + 1*4*FENC_STRIDE, 1*4 + 1*4*FENC_STRIDE,
    2*4 + 0*4*FENC_STRIDE, 3*4 + 0*4*FENC_STRIDE,
    2*4 + 1*4*FENC_STRIDE, 3*4 + 1*4*FENC_STRIDE,
    0*4 + 2*4*FENC_STRIDE, 1*4 + 2*4*FENC_STRIDE,
    0*4 + 3*4*FENC_STRIDE, 1*4 + 3*4*FENC_STRIDE,
    2*4 + 2*4*FENC_STRIDE, 3*4 + 2*4*FENC_STRIDE,
    2*4 + 3*4*FENC_STRIDE, 3*4 + 3*4*FENC_STRIDE
};
static const uint16_t block_idx_xy_fdec[16] =
{
    0*4 + 0*4*FDEC_STRIDE, 1*4 + 0*4*FDEC_STRIDE,
    0*4 + 1*4*FDEC_STRIDE, 1*4 + 1*4*FDEC_STRIDE,
    2*4 + 0*4*FDEC_STRIDE, 3*4 + 0*4*FDEC_STRIDE,
    2*4 + 1*4*FDEC_STRIDE, 3*4 + 1*4*FDEC_STRIDE,
    0*4 + 2*4*FDEC_STRIDE, 1*4 + 2*4*FDEC_STRIDE,
    0*4 + 3*4*FDEC_STRIDE, 1*4 + 3*4*FDEC_STRIDE,
    2*4 + 2*4*FDEC_STRIDE, 3*4 + 2*4*FDEC_STRIDE,
    2*4 + 3*4*FDEC_STRIDE, 3*4 + 3*4*FDEC_STRIDE
};
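
/* Conversions between the 4x4 luma block numbering (quadrant order: blocks
 * 0-3 fill the top-left 8x8, 4-7 the top-right, and so on) and raster
 * coordinates; the _fenc/_fdec variants additionally bake in each block's
 * pixel offset within the fenc/fdec macroblock buffers. */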

#define QP(qP) ( (qP)+QP_BD_OFFSET )
static const uint8_t i_chroma_qp_table[QP_MAX+1+12*2] =
{
         0,      0,      0,      0,      0,      0,
         0,      0,      0,      0,      0,      0,
#if BIT_DEPTH > 9
   QP(-12),QP(-11),QP(-10), QP(-9), QP(-8), QP(-7),
#endif
#if BIT_DEPTH > 8
    QP(-6), QP(-5), QP(-4), QP(-3), QP(-2), QP(-1),
#endif
     QP(0),  QP(1),  QP(2),  QP(3),  QP(4),  QP(5),
     QP(6),  QP(7),  QP(8),  QP(9), QP(10), QP(11),
    QP(12), QP(13), QP(14), QP(15), QP(16), QP(17),
    QP(18), QP(19), QP(20), QP(21), QP(22), QP(23),
    QP(24), QP(25), QP(26), QP(27), QP(28), QP(29),
    QP(29), QP(30), QP(31), QP(32), QP(32), QP(33),
    QP(34), QP(34), QP(35), QP(35), QP(36), QP(36),
    QP(37), QP(37), QP(37), QP(38), QP(38), QP(38),
    QP(39), QP(39), QP(39), QP(39),
    QP(39), QP(39), QP(39), QP(39), QP(39), QP(39),
    QP(39), QP(39), QP(39), QP(39), QP(39), QP(39),
};
#undef QP
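
/* Chroma QP as a function of offset-adjusted luma QP, per the spec's
 * Table 8-15: identity up to 29, then flattening so chroma is quantized
 * more gently at high QPs. The leading zeros and repeated QP(39) entries
 * pad the index range so qp + chroma_qp_offset (+/-12) needs no clamping. */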

enum cabac_ctx_block_cat_e
{
    DCT_LUMA_DC     = 0,
    DCT_LUMA_AC     = 1,
    DCT_LUMA_4x4    = 2,
    DCT_CHROMA_DC   = 3,
    DCT_CHROMA_AC   = 4,
    DCT_LUMA_8x8    = 5,
    DCT_CHROMAU_DC  = 6,
    DCT_CHROMAU_AC  = 7,
    DCT_CHROMAU_4x4 = 8,
    DCT_CHROMAU_8x8 = 9,
    DCT_CHROMAV_DC  = 10,
    DCT_CHROMAV_AC  = 11,
    DCT_CHROMAV_4x4 = 12,
    DCT_CHROMAV_8x8 = 13,
};

static const uint8_t ctx_cat_plane[6][3] =
{
    { DCT_LUMA_DC,  DCT_CHROMAU_DC,  DCT_CHROMAV_DC},
    { DCT_LUMA_AC,  DCT_CHROMAU_AC,  DCT_CHROMAV_AC},
    {DCT_LUMA_4x4, DCT_CHROMAU_4x4, DCT_CHROMAV_4x4},
    {0},
    {0},
    {DCT_LUMA_8x8, DCT_CHROMAU_8x8, DCT_CHROMAV_8x8}
};

/* Per-frame allocation: is allocated per-thread only in frame-threads mode. */
int  x264_macroblock_cache_allocate( x264_t *h );
void x264_macroblock_cache_free( x264_t *h );

/* Per-thread allocation: is allocated per-thread even in sliced-threads mode. */
int  x264_macroblock_thread_allocate( x264_t *h, int b_lookahead );
void x264_macroblock_thread_free( x264_t *h, int b_lookahead );

void x264_macroblock_slice_init( x264_t *h );
void x264_macroblock_thread_init( x264_t *h );
void x264_macroblock_cache_load_progressive( x264_t *h, int mb_x, int mb_y );
void x264_macroblock_cache_load_interlaced( x264_t *h, int mb_x, int mb_y );
void x264_macroblock_deblock_strength( x264_t *h );
void x264_macroblock_cache_save( x264_t *h );

void x264_macroblock_bipred_init( x264_t *h );

void x264_prefetch_fenc( x264_t *h, x264_frame_t *fenc, int i_mb_x, int i_mb_y );

void x264_copy_column8( pixel *dst, pixel *src );

/* x264_mb_predict_mv_16x16:
 *      set mvp to the predicted mv for a D_16x16 block
 *      h->mb needs only valid values from other blocks */
void x264_mb_predict_mv_16x16( x264_t *h, int i_list, int i_ref, int16_t mvp[2] );
/* x264_mb_predict_mv_pskip:
 *      set mvp to the predicted mv for P_SKIP
 *      h->mb needs only valid values from other blocks */
void x264_mb_predict_mv_pskip( x264_t *h, int16_t mv[2] );
/* x264_mb_predict_mv:
 *      set mvp to the predicted mv for all blocks except SKIP and DIRECT
 *      h->mb needs valid ref/partition/sub values for the current block
 *      and valid mv/ref values from other blocks */
void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int16_t mvp[2] );
/* x264_mb_predict_mv_direct16x16:
 *      set h->mb.cache.mv and h->mb.cache.ref for B_SKIP or B_DIRECT
 *      h->mb needs only valid values from other blocks.
 *      returns 1 on success, 0 on failure.
 *      if b_changed != NULL, it is set to whether refs or mvs differ from
 *      before this function call. */
int x264_mb_predict_mv_direct16x16( x264_t *h, int *b_changed );
/* x264_mb_predict_mv_ref16x16:
 *      set mvc to the D_16x16 prediction candidates.
 *      uses all neighbors, even those that didn't end up using this ref.
 *      h->mb needs only valid values from other blocks */
void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int16_t mvc[8][2], int *i_mvc );

void x264_mb_mc( x264_t *h );
void x264_mb_mc_8x8( x264_t *h, int i8 );

static ALWAYS_INLINE uint32_t pack16to32( uint32_t a, uint32_t b )
{
#if WORDS_BIGENDIAN
    return b + (a<<16);
#else
    return a + (b<<16);
#endif
}
static ALWAYS_INLINE uint32_t pack8to16( uint32_t a, uint32_t b )
{
#if WORDS_BIGENDIAN
    return b + (a<<8);
#else
    return a + (b<<8);
#endif
}
static ALWAYS_INLINE uint32_t pack8to32( uint32_t a, uint32_t b, uint32_t c, uint32_t d )
{
#if WORDS_BIGENDIAN
    return d + (c<<8) + (b<<16) + (a<<24);
#else
    return a + (b<<8) + (c<<16) + (d<<24);
#endif
}
static ALWAYS_INLINE uint32_t pack16to32_mask( int a, int b )
{
#if WORDS_BIGENDIAN
    return (b&0xFFFF) + (a<<16);
#else
    return (a&0xFFFF) + (b<<16);
#endif
}
static ALWAYS_INLINE uint64_t pack32to64( uint32_t a, uint32_t b )
{
#if WORDS_BIGENDIAN
    return b + ((uint64_t)a<<32);
#else
    return a + ((uint64_t)b<<32);
#endif
}

#if HIGH_BIT_DEPTH
#   define pack_pixel_1to2 pack16to32
#   define pack_pixel_2to4 pack32to64
#else
#   define pack_pixel_1to2 pack8to16
#   define pack_pixel_2to4 pack16to32
#endif
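
/* The pack* helpers combine small values into one machine word with a layout
 * that matches in-memory order on either endianness, so e.g. a 4x4 block's
 * two 16-bit mv components can be stored or compared with a single M32()
 * access; pack_pixel_* select the right width for the configured bit depth. */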

static ALWAYS_INLINE int x264_mb_predict_intra4x4_mode( x264_t *h, int idx )
{
    const int ma = h->mb.cache.intra4x4_pred_mode[x264_scan8[idx] - 1];
    const int mb = h->mb.cache.intra4x4_pred_mode[x264_scan8[idx] - 8];
    const int m  = X264_MIN( x264_mb_pred_mode4x4_fix(ma),
                             x264_mb_pred_mode4x4_fix(mb) );

    if( m < 0 )
        return I_PRED_4x4_DC;

    return m;
}
static ALWAYS_INLINE int x264_mb_predict_non_zero_code( x264_t *h, int idx )
{
    const int za = h->mb.cache.non_zero_count[x264_scan8[idx] - 1];
    const int zb = h->mb.cache.non_zero_count[x264_scan8[idx] - 8];

    int i_ret = za + zb;

    if( i_ret < 0x80 )
        i_ret = ( i_ret + 1 ) >> 1;
    return i_ret & 0x7f;
}
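
/* 0x80 marks "unavailable" in the nnz cache: when either neighbour is
 * missing, za+zb >= 0x80 skips the rounded average (za+zb+1)>>1 and the
 * final &0x7f strips the marker bit, leaving the available neighbour's count
 * (or 0 when both are missing), as CAVLC's coeff_token nC derivation requires. */
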
/* x264_mb_transform_8x8_allowed:
 *      check whether any partition is smaller than 8x8 (or at least
 *      might be, judging by partition type alone);
 *      doesn't check the cbp */
static ALWAYS_INLINE int x264_mb_transform_8x8_allowed( x264_t *h )
{
    // intra and skip are disallowed
    // large partitions are allowed
    // direct and 8x8 are conditional
    static const uint8_t partition_tab[X264_MBTYPE_MAX] = {
        0,0,0,0,1,2,0,1,1,1,1,1,1,1,1,1,1,1,0,
    };

    if( !h->pps->b_transform_8x8_mode )
        return 0;
    if( h->mb.i_type != P_8x8 )
        return partition_tab[h->mb.i_type];
    return M32( h->mb.i_sub_partition ) == D_L0_8x8*0x01010101;
}
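
/* (In the P_8x8 case above, M32() loads the four int8_t sub_partition values
 * as a single word and compares against D_L0_8x8 replicated into every byte,
 * i.e. the transform is allowed only if all four sub-blocks are exactly 8x8.) */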

#endif

x264-snapshot-20120103-2245-stable/common/macroblock.c
/*****************************************************************************
 * macroblock.c: macroblock common functions
 *****************************************************************************
 * Copyright (C) 2003-2011 x264 project
 *
 * Authors: Jason Garrett-Glaser <darkshikari@gmail.com>
 *          Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
 *          Henrik Gramner <hengar-6@student.ltu.se>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "common.h"
#include "encoder/me.h"

#define MC_LUMA(list,p) \
    h->mc.mc_luma( &h->mb.pic.p_fdec[p][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE, \
                   &h->mb.pic.p_fref[list][i_ref][p*4], h->mb.pic.i_stride[p], \
                   mvx, mvy, 4*width, 4*height, \
                   list ? x264_weight_none : &h->sh.weight[i_ref][p] );

static NOINLINE void x264_mb_mc_0xywh( x264_t *h, int x, int y, int width, int height )
{
    int i8    = x264_scan8[0]+x+8*y;
    int i_ref = h->mb.cache.ref[0][i8];
    int mvx   = x264_clip3( h->mb.cache.mv[0][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] ) + 4*4*x;
    int mvy   = x264_clip3( h->mb.cache.mv[0][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y;

    MC_LUMA( 0, 0 );

    if( CHROMA444 )
    {
        MC_LUMA( 0, 1 );
        MC_LUMA( 0, 2 );
    }
    else
    {
        int v_shift = CHROMA_V_SHIFT;
        // Chroma in 4:2:0 is offset if MCing from a field of opposite parity
        if( v_shift & MB_INTERLACED & i_ref )
            mvy += (h->mb.i_mb_y & 1)*4 - 2;

        int offset = (4*FDEC_STRIDE>>v_shift)*y + 2*x;
        height = 4*height >> v_shift;

        h->mc.mc_chroma( &h->mb.pic.p_fdec[1][offset],
                         &h->mb.pic.p_fdec[2][offset], FDEC_STRIDE,
                         h->mb.pic.p_fref[0][i_ref][4], h->mb.pic.i_stride[1],
                         mvx, 2*mvy>>v_shift, 2*width, height );

        if( h->sh.weight[i_ref][1].weightfn )
            h->sh.weight[i_ref][1].weightfn[width>>1]( &h->mb.pic.p_fdec[1][offset], FDEC_STRIDE,
                                                       &h->mb.pic.p_fdec[1][offset], FDEC_STRIDE,
                                                       &h->sh.weight[i_ref][1], height );
        if( h->sh.weight[i_ref][2].weightfn )
            h->sh.weight[i_ref][2].weightfn[width>>1]( &h->mb.pic.p_fdec[2][offset], FDEC_STRIDE,
                                                       &h->mb.pic.p_fdec[2][offset], FDEC_STRIDE,
                                                       &h->sh.weight[i_ref][2], height );
    }
}
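
/* Note on units: cached mvs are quarter-pel, while x/y/width/height count
 * 4x4 blocks. The ref pointers handed to mc point at the mb's top-left, so
 * the +4*4*x / +4*4*y terms fold the sub-block's offset into the mv
 * (16 quarter-pel units per 4-pixel block step). */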
static NOINLINE void x264_mb_mc_1xywh( x264_t *h, int x, int y, int width, int height )
{
    int i8    = x264_scan8[0]+x+8*y;
    int i_ref = h->mb.cache.ref[1][i8];
    int mvx   = x264_clip3( h->mb.cache.mv[1][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] ) + 4*4*x;
    int mvy   = x264_clip3( h->mb.cache.mv[1][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y;

    MC_LUMA( 1, 0 );

    if( CHROMA444 )
    {
        MC_LUMA( 1, 1 );
        MC_LUMA( 1, 2 );
    }
    else
    {
        int v_shift = CHROMA_V_SHIFT;
        if( v_shift & MB_INTERLACED & i_ref )
            mvy += (h->mb.i_mb_y & 1)*4 - 2;

        int offset = (4*FDEC_STRIDE>>v_shift)*y + 2*x;
        h->mc.mc_chroma( &h->mb.pic.p_fdec[1][offset],
                         &h->mb.pic.p_fdec[2][offset], FDEC_STRIDE,
                         h->mb.pic.p_fref[1][i_ref][4], h->mb.pic.i_stride[1],
                         mvx, 2*mvy>>v_shift, 2*width, 4*height>>v_shift );
    }
}

#define MC_LUMA_BI(p) \
    src0 = h->mc.get_ref( tmp0, &i_stride0, &h->mb.pic.p_fref[0][i_ref0][p*4], h->mb.pic.i_stride[p], \
                          mvx0, mvy0, 4*width, 4*height, x264_weight_none ); \
    src1 = h->mc.get_ref( tmp1, &i_stride1, &h->mb.pic.p_fref[1][i_ref1][p*4], h->mb.pic.i_stride[p], \
                          mvx1, mvy1, 4*width, 4*height, x264_weight_none ); \
    h->mc.avg[i_mode]( &h->mb.pic.p_fdec[p][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE, \
                       src0, i_stride0, src1, i_stride1, weight );

static NOINLINE void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int height )
{
    int i8 = x264_scan8[0]+x+8*y;
    int i_ref0 = h->mb.cache.ref[0][i8];
    int i_ref1 = h->mb.cache.ref[1][i8];
    int weight = h->mb.bipred_weight[i_ref0][i_ref1];
    int mvx0   = x264_clip3( h->mb.cache.mv[0][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] ) + 4*4*x;
    int mvx1   = x264_clip3( h->mb.cache.mv[1][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] ) + 4*4*x;
    int mvy0   = x264_clip3( h->mb.cache.mv[0][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y;
    int mvy1   = x264_clip3( h->mb.cache.mv[1][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y;
    int i_mode = x264_size2pixel[height][width];
    int i_stride0 = 16, i_stride1 = 16;
    ALIGNED_ARRAY_16( pixel, tmp0,[16*16] );
    ALIGNED_ARRAY_16( pixel, tmp1,[16*16] );
    pixel *src0, *src1;

    MC_LUMA_BI( 0 );

    if( CHROMA444 )
    {
        MC_LUMA_BI( 1 );
        MC_LUMA_BI( 2 );
    }
    else
    {
        int v_shift = CHROMA_V_SHIFT;
        if( v_shift & MB_INTERLACED & i_ref0 )
            mvy0 += (h->mb.i_mb_y & 1)*4 - 2;
        if( v_shift & MB_INTERLACED & i_ref1 )
            mvy1 += (h->mb.i_mb_y & 1)*4 - 2;

        h->mc.mc_chroma( tmp0, tmp0+8, 16, h->mb.pic.p_fref[0][i_ref0][4], h->mb.pic.i_stride[1],
                         mvx0, 2*mvy0>>v_shift, 2*width, 4*height>>v_shift );
        h->mc.mc_chroma( tmp1, tmp1+8, 16, h->mb.pic.p_fref[1][i_ref1][4], h->mb.pic.i_stride[1],
                         mvx1, 2*mvy1>>v_shift, 2*width, 4*height>>v_shift );

        int chromapix = h->luma2chroma_pixel[i_mode];
        int offset = (4*FDEC_STRIDE>>v_shift)*y + 2*x;
        h->mc.avg[chromapix]( &h->mb.pic.p_fdec[1][offset], FDEC_STRIDE, tmp0,   16, tmp1,   16, weight );
        h->mc.avg[chromapix]( &h->mb.pic.p_fdec[2][offset], FDEC_STRIDE, tmp0+8, 16, tmp1+8, 16, weight );
    }
}
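
/* Bi-prediction fetches both references into the scratch buffers (get_ref /
 * mc_chroma) and then merges them with the weighted averaging function;
 * mc_chroma writes U and V 8 pixels apart, hence the tmp+8 calls for the
 * second plane. */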

#undef MC_LUMA
#undef MC_LUMA_BI

void x264_mb_mc_8x8( x264_t *h, int i8 )
{
    int x = 2*(i8&1);
    int y = 2*(i8>>1);

    if( h->sh.i_type == SLICE_TYPE_P )
    {
        switch( h->mb.i_sub_partition[i8] )
        {
            case D_L0_8x8:
                x264_mb_mc_0xywh( h, x, y, 2, 2 );
                break;
            case D_L0_8x4:
                x264_mb_mc_0xywh( h, x, y+0, 2, 1 );
                x264_mb_mc_0xywh( h, x, y+1, 2, 1 );
                break;
            case D_L0_4x8:
                x264_mb_mc_0xywh( h, x+0, y, 1, 2 );
                x264_mb_mc_0xywh( h, x+1, y, 1, 2 );
                break;
            case D_L0_4x4:
                x264_mb_mc_0xywh( h, x+0, y+0, 1, 1 );
                x264_mb_mc_0xywh( h, x+1, y+0, 1, 1 );
                x264_mb_mc_0xywh( h, x+0, y+1, 1, 1 );
                x264_mb_mc_0xywh( h, x+1, y+1, 1, 1 );
                break;
        }
    }
    else
    {
        int scan8 = x264_scan8[0] + x + 8*y;

        if( h->mb.cache.ref[0][scan8] >= 0 )
            if( h->mb.cache.ref[1][scan8] >= 0 )
                x264_mb_mc_01xywh( h, x, y, 2, 2 );
            else
                x264_mb_mc_0xywh( h, x, y, 2, 2 );
        else
            x264_mb_mc_1xywh( h, x, y, 2, 2 );
    }
}

void x264_mb_mc( x264_t *h )
{
    if( h->mb.i_partition == D_8x8 )
    {
        for( int i = 0; i < 4; i++ )
            x264_mb_mc_8x8( h, i );
    }
    else
    {
        int ref0a = h->mb.cache.ref[0][x264_scan8[ 0]];
        int ref0b = h->mb.cache.ref[0][x264_scan8[12]];
        int ref1a = h->mb.cache.ref[1][x264_scan8[ 0]];
        int ref1b = h->mb.cache.ref[1][x264_scan8[12]];

        if( h->mb.i_partition == D_16x16 )
        {
            if( ref0a >= 0 )
                if( ref1a >= 0 ) x264_mb_mc_01xywh( h, 0, 0, 4, 4 );
                else             x264_mb_mc_0xywh ( h, 0, 0, 4, 4 );
            else                 x264_mb_mc_1xywh ( h, 0, 0, 4, 4 );
        }
        else if( h->mb.i_partition == D_16x8 )
        {
            if( ref0a >= 0 )
                if( ref1a >= 0 ) x264_mb_mc_01xywh( h, 0, 0, 4, 2 );
                else             x264_mb_mc_0xywh ( h, 0, 0, 4, 2 );
            else                 x264_mb_mc_1xywh ( h, 0, 0, 4, 2 );

            if( ref0b >= 0 )
                if( ref1b >= 0 ) x264_mb_mc_01xywh( h, 0, 2, 4, 2 );
                else             x264_mb_mc_0xywh ( h, 0, 2, 4, 2 );
            else                 x264_mb_mc_1xywh ( h, 0, 2, 4, 2 );
        }
        else if( h->mb.i_partition == D_8x16 )
        {
            if( ref0a >= 0 )
                if( ref1a >= 0 ) x264_mb_mc_01xywh( h, 0, 0, 2, 4 );
                else             x264_mb_mc_0xywh ( h, 0, 0, 2, 4 );
            else                 x264_mb_mc_1xywh ( h, 0, 0, 2, 4 );

            if( ref0b >= 0 )
                if( ref1b >= 0 ) x264_mb_mc_01xywh( h, 2, 0, 2, 4 );
                else             x264_mb_mc_0xywh ( h, 2, 0, 2, 4 );
            else                 x264_mb_mc_1xywh ( h, 2, 0, 2, 4 );
        }
    }
}

int x264_macroblock_cache_allocate( x264_t *h )
{
    int i_mb_count = h->mb.i_mb_count;

    h->mb.i_mb_stride = h->mb.i_mb_width;
    h->mb.i_b8_stride = h->mb.i_mb_width * 2;
    h->mb.i_b4_stride = h->mb.i_mb_width * 4;

    h->mb.b_interlaced = PARAM_INTERLACED;

    CHECKED_MALLOC( h->mb.qp, i_mb_count * sizeof(int8_t) );
    CHECKED_MALLOC( h->mb.cbp, i_mb_count * sizeof(int16_t) );
    CHECKED_MALLOC( h->mb.mb_transform_size, i_mb_count * sizeof(int8_t) );
    CHECKED_MALLOC( h->mb.slice_table, i_mb_count * sizeof(uint16_t) );
    memset( h->mb.slice_table, -1, i_mb_count * sizeof(uint16_t) );

    /* indices 0-3: the 4 top predictors; indices 4-6: the 3 left predictors */
    CHECKED_MALLOC( h->mb.intra4x4_pred_mode, i_mb_count * 8 * sizeof(int8_t) );

    /* all coeffs */
    CHECKED_MALLOC( h->mb.non_zero_count, i_mb_count * 48 * sizeof(uint8_t) );

    if( h->param.b_cabac )
    {
        CHECKED_MALLOC( h->mb.skipbp, i_mb_count * sizeof(int8_t) );
        CHECKED_MALLOC( h->mb.chroma_pred_mode, i_mb_count * sizeof(int8_t) );
        CHECKED_MALLOC( h->mb.mvd[0], i_mb_count * sizeof( **h->mb.mvd ) );
        if( h->param.i_bframe )
            CHECKED_MALLOC( h->mb.mvd[1], i_mb_count * sizeof( **h->mb.mvd ) );
    }

    for( int i = 0; i < 2; i++ )
    {
        int i_refs = X264_MIN(X264_REF_MAX, (i ? 1 + !!h->param.i_bframe_pyramid : h->param.i_frame_reference) ) << PARAM_INTERLACED;
        if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
            i_refs = X264_MIN(X264_REF_MAX, i_refs + 1 + (BIT_DEPTH == 8)); // smart weights add two duplicate frames in 8-bit, one in >8-bit

        for( int j = !i; j < i_refs; j++ )
        {
            CHECKED_MALLOC( h->mb.mvr[i][j], 2 * (i_mb_count + 1) * sizeof(int16_t) );
            M32( h->mb.mvr[i][j][0] ) = 0;
            h->mb.mvr[i][j]++;
        }
    }

    if( h->param.analyse.i_weighted_pred )
    {
        int i_padv = PADV << PARAM_INTERLACED;
        int luma_plane_size = 0;
        int numweightbuf;

        if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE )
        {
            // only need buffer for lookahead
            if( !h->param.i_sync_lookahead || h == h->thread[h->param.i_threads] )
            {
                // Fake analysis only works on lowres
                luma_plane_size = h->fdec->i_stride_lowres * (h->mb.i_mb_height*8+2*i_padv);
                // Only need 1 buffer for analysis
                numweightbuf = 1;
            }
            else
                numweightbuf = 0;
        }
        else
        {
            /* Both ref and fenc are stored for 4:2:0 and 4:2:2, which means that 4:2:0 and 4:4:4
             * need the same amount of space and 4:2:2 needs twice as much */
            luma_plane_size = h->fdec->i_stride[0] * (h->mb.i_mb_height*(16<<(CHROMA_FORMAT==CHROMA_422))+2*i_padv);

            if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
                // smart can weight one ref, plus one offset -1 duplicate in 8-bit
                numweightbuf = 1 + (BIT_DEPTH == 8);
            else
                //simple only has one weighted ref
                numweightbuf = 1;
        }

        for( int i = 0; i < numweightbuf; i++ )
            CHECKED_MALLOC( h->mb.p_weight_buf[i], luma_plane_size * sizeof(pixel) );
    }

    return 0;
fail:
    return -1;
}
void x264_macroblock_cache_free( x264_t *h )
{
    for( int i = 0; i < 2; i++ )
        for( int j = !i; j < X264_REF_MAX*2; j++ )
            if( h->mb.mvr[i][j] )
                x264_free( h->mb.mvr[i][j]-1 );
    for( int i = 0; i < X264_REF_MAX; i++ )
        x264_free( h->mb.p_weight_buf[i] );

    if( h->param.b_cabac )
    {
        x264_free( h->mb.skipbp );
        x264_free( h->mb.chroma_pred_mode );
        x264_free( h->mb.mvd[0] );
        x264_free( h->mb.mvd[1] );
    }
    x264_free( h->mb.slice_table );
    x264_free( h->mb.intra4x4_pred_mode );
    x264_free( h->mb.non_zero_count );
    x264_free( h->mb.mb_transform_size );
    x264_free( h->mb.cbp );
    x264_free( h->mb.qp );
}

int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead )
{
    if( !b_lookahead )
    {
        for( int i = 0; i <= 4*PARAM_INTERLACED; i++ )
            for( int j = 0; j < (CHROMA444 ? 3 : 2); j++ )
            {
                CHECKED_MALLOC( h->intra_border_backup[i][j], (h->sps->i_mb_width*16+32) * sizeof(pixel) );
                h->intra_border_backup[i][j] += 16;
                if( !PARAM_INTERLACED )
                    h->intra_border_backup[1][j] = h->intra_border_backup[i][j];
            }
        for( int i = 0; i <= PARAM_INTERLACED; i++ )
        {
            CHECKED_MALLOC( h->deblock_strength[i], sizeof(**h->deblock_strength) * h->mb.i_mb_width );
            h->deblock_strength[1] = h->deblock_strength[i];
        }
    }

    /* Allocate the scratch buffer, sized to the largest of its users
     * (hpel filtering, SSIM, TESA, mb-tree), which never need it at the same time */
    int scratch_size = 0;
    if( !b_lookahead )
    {
        int buf_hpel = (h->thread[0]->fdec->i_width[0]+48) * sizeof(int16_t);
        int buf_ssim = h->param.analyse.b_ssim * 8 * (h->param.i_width/4+3) * sizeof(int);
        int me_range = X264_MIN(h->param.analyse.i_me_range, h->param.analyse.i_mv_range);
        int buf_tesa = (h->param.analyse.i_me_method >= X264_ME_ESA) *
            ((me_range*2+24) * sizeof(int16_t) + (me_range+4) * (me_range+1) * 4 * sizeof(mvsad_t));
        scratch_size = X264_MAX3( buf_hpel, buf_ssim, buf_tesa );
    }
    int buf_mbtree = h->param.rc.b_mb_tree * ((h->mb.i_mb_width+7)&~7) * sizeof(int);
    scratch_size = X264_MAX( scratch_size, buf_mbtree );
    if( scratch_size )
        CHECKED_MALLOC( h->scratch_buffer, scratch_size );
    else
        h->scratch_buffer = NULL;

    return 0;
fail:
    return -1;
}

void x264_macroblock_thread_free( x264_t *h, int b_lookahead )
{
    if( !b_lookahead )
    {
        for( int i = 0; i <= PARAM_INTERLACED; i++ )
            x264_free( h->deblock_strength[i] );
        for( int i = 0; i <= 4*PARAM_INTERLACED; i++ )
            for( int j = 0; j < (CHROMA444 ? 3 : 2); j++ )
                x264_free( h->intra_border_backup[i][j] - 16 );
    }
    x264_free( h->scratch_buffer );
}

void x264_macroblock_slice_init( x264_t *h )
{
    h->mb.mv[0] = h->fdec->mv[0];
    h->mb.mv[1] = h->fdec->mv[1];
    h->mb.mvr[0][0] = h->fdec->mv16x16;
    h->mb.ref[0] = h->fdec->ref[0];
    h->mb.ref[1] = h->fdec->ref[1];
    h->mb.type = h->fdec->mb_type;
    h->mb.partition = h->fdec->mb_partition;
    h->mb.field = h->fdec->field;

    h->fdec->i_ref[0] = h->i_ref[0];
    h->fdec->i_ref[1] = h->i_ref[1];
    for( int i = 0; i < h->i_ref[0]; i++ )
        h->fdec->ref_poc[0][i] = h->fref[0][i]->i_poc;
    if( h->sh.i_type == SLICE_TYPE_B )
    {
        for( int i = 0; i < h->i_ref[1]; i++ )
            h->fdec->ref_poc[1][i] = h->fref[1][i]->i_poc;

        map_col_to_list0(-1) = -1;
        map_col_to_list0(-2) = -2;
        for( int i = 0; i < h->fref[1][0]->i_ref[0]; i++ )
        {
            int poc = h->fref[1][0]->ref_poc[0][i];
            map_col_to_list0(i) = -2;
            for( int j = 0; j < h->i_ref[0]; j++ )
                if( h->fref[0][j]->i_poc == poc )
                {
                    map_col_to_list0(i) = j;
                    break;
                }
        }
    }
    else if( h->sh.i_type == SLICE_TYPE_P )
    {
        if( h->sh.i_disable_deblocking_filter_idc != 1 && h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
        {
            deblock_ref_table(-2) = -2;
            deblock_ref_table(-1) = -1;
            for( int i = 0; i < h->i_ref[0] << SLICE_MBAFF; i++ )
            {
                /* Mask off high bits to avoid frame num collisions with -1/-2.
                 * In current x264 frame num values don't cover a range of more
                 * than 32, so 6 bits is enough for uniqueness. */
                if( !MB_INTERLACED )
                    deblock_ref_table(i) = h->fref[0][i]->i_frame_num&63;
                else
                    deblock_ref_table(i) = ((h->fref[0][i>>1]->i_frame_num&63)<<1) + (i&1);
            }
        }
    }

    /* init with not available (for top right idx=7,15) */
    memset( h->mb.cache.ref, -2, sizeof( h->mb.cache.ref ) );
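
    /* inv_ref_poc: a rounded fixed-point reciprocal (1/256 units) of the poc
     * distance to the first reference, precomputed so later mv scaling can
     * multiply instead of dividing per macroblock. */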

    if( h->i_ref[0] > 0 )
        for( int field = 0; field <= SLICE_MBAFF; field++ )
        {
            int curpoc = h->fdec->i_poc + h->fdec->i_delta_poc[field];
            int refpoc = h->fref[0][0]->i_poc + h->fref[0][0]->i_delta_poc[field];
            int delta = curpoc - refpoc;

            h->fdec->inv_ref_poc[field] = (256 + delta/2) / delta;
        }

    h->mb.i_neighbour4[6] =
    h->mb.i_neighbour4[9] =
    h->mb.i_neighbour4[12] =
    h->mb.i_neighbour4[14] = MB_LEFT|MB_TOP|MB_TOPLEFT|MB_TOPRIGHT;
    h->mb.i_neighbour4[3] =
    h->mb.i_neighbour4[7] =
    h->mb.i_neighbour4[11] =
    h->mb.i_neighbour4[13] =
    h->mb.i_neighbour4[15] =
    h->mb.i_neighbour8[3] = MB_LEFT|MB_TOP|MB_TOPLEFT;
}

void x264_macroblock_thread_init( x264_t *h )
{
    h->mb.i_me_method = h->param.analyse.i_me_method;
    h->mb.i_subpel_refine = h->param.analyse.i_subpel_refine;
    if( h->sh.i_type == SLICE_TYPE_B && (h->mb.i_subpel_refine == 6 || h->mb.i_subpel_refine == 8) )
        h->mb.i_subpel_refine--;
    h->mb.b_chroma_me = h->param.analyse.b_chroma_me &&
                        ((h->sh.i_type == SLICE_TYPE_P && h->mb.i_subpel_refine >= 5) ||
                         (h->sh.i_type == SLICE_TYPE_B && h->mb.i_subpel_refine >= 9));
    h->mb.b_dct_decimate = h->sh.i_type == SLICE_TYPE_B ||
                          (h->param.analyse.b_dct_decimate && h->sh.i_type != SLICE_TYPE_I);
    h->mb.i_mb_prev_xy = -1;

    /*          4:2:0                      4:2:2                      4:4:4
     * fdec            fenc       fdec            fenc       fdec            fenc
     * y y y y y y y   Y Y Y Y    y y y y y y y   Y Y Y Y    y y y y y y y   Y Y Y Y
     * y Y Y Y Y       Y Y Y Y    y Y Y Y Y       Y Y Y Y    y Y Y Y Y       Y Y Y Y
     * y Y Y Y Y       Y Y Y Y    y Y Y Y Y       Y Y Y Y    y Y Y Y Y       Y Y Y Y
     * y Y Y Y Y       Y Y Y Y    y Y Y Y Y       Y Y Y Y    y Y Y Y Y       Y Y Y Y
     * y Y Y Y Y       U U V V    y Y Y Y Y       U U V V    y Y Y Y Y       U U U U
     * u u u   v v v   U U V V    u u u   v v v   U U V V    u u u u u u u   U U U U
     * u U U   v V V              u U U   v V V   U U V V    u U U U U       U U U U
     * u U U   v V V              u U U   v V V   U U V V    u U U U U       U U U U
     *                            u U U   v V V              u U U U U       V V V V
     *                            u U U   v V V              u U U U U       V V V V
     *                                                       v v v v v v v   V V V V
     *                                                       v V V V V       V V V V
     *                                                       v V V V V
     *                                                       v V V V V
     *                                                       v V V V V
     */
    h->mb.pic.p_fenc[0] = h->mb.pic.fenc_buf;
    h->mb.pic.p_fdec[0] = h->mb.pic.fdec_buf + 2*FDEC_STRIDE;
    h->mb.pic.p_fenc[1] = h->mb.pic.fenc_buf + 16*FENC_STRIDE;
    h->mb.pic.p_fdec[1] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE;
    if( CHROMA444 )
    {
        h->mb.pic.p_fenc[2] = h->mb.pic.fenc_buf + 32*FENC_STRIDE;
        h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 36*FDEC_STRIDE;
    }
    else
    {
        h->mb.pic.p_fenc[2] = h->mb.pic.fenc_buf + 16*FENC_STRIDE + 8;
        h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE + 16;
    }
}

void x264_prefetch_fenc( x264_t *h, x264_frame_t *fenc, int i_mb_x, int i_mb_y )
{
    int stride_y  = fenc->i_stride[0];
    int stride_uv = fenc->i_stride[1];
    int off_y  = 16 * i_mb_x + 16 * i_mb_y * stride_y;
    int off_uv = 16 * i_mb_x + (16 * i_mb_y * stride_uv >> CHROMA_V_SHIFT);
    h->mc.prefetch_fenc( fenc->plane[0]+off_y, stride_y,
                         fenc->plane[1]+off_uv, stride_uv, i_mb_x );
}

NOINLINE void x264_copy_column8( pixel *dst, pixel *src )
{
    // input pointers are offset by 4 rows because that's faster (smaller instruction size on x86)
    for( int i = -4; i < 4; i++ )
        dst[i*FDEC_STRIDE] = src[i*FDEC_STRIDE];
}

static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int mb_x, int mb_y, int i, int b_chroma, int b_mbaff )
{
    int mb_interlaced = b_mbaff && MB_INTERLACED;
    int height = b_chroma ? 16 >> CHROMA_V_SHIFT : 16;
    int i_stride = h->fdec->i_stride[i];
    int i_stride2 = i_stride << mb_interlaced;
    int i_pix_offset = mb_interlaced
                     ? 16 * mb_x + height * (mb_y&~1) * i_stride + (mb_y&1) * i_stride
                     : 16 * mb_x + height * mb_y * i_stride;
    pixel *plane_fdec = &h->fdec->plane[i][i_pix_offset];
    int fdec_idx = b_mbaff ? (mb_interlaced ? (3 + (mb_y&1)) : (mb_y&1) ? 2 : 4) : 0;
    pixel *intra_fdec = &h->intra_border_backup[fdec_idx][i][mb_x*16];
    int ref_pix_offset[2] = { i_pix_offset, i_pix_offset };
    /* ref_pix_offset[0] references the current field and [1] the opposite field. */
    if( mb_interlaced )
        ref_pix_offset[1] += (1-2*(mb_y&1)) * i_stride;
    h->mb.pic.i_stride[i] = i_stride2;
    h->mb.pic.p_fenc_plane[i] = &h->fenc->plane[i][i_pix_offset];
    if( b_chroma )
    {
        h->mc.load_deinterleave_chroma_fenc( h->mb.pic.p_fenc[1], h->mb.pic.p_fenc_plane[1], i_stride2, height );
        memcpy( h->mb.pic.p_fdec[1]-FDEC_STRIDE, intra_fdec, 8*sizeof(pixel) );
        memcpy( h->mb.pic.p_fdec[2]-FDEC_STRIDE, intra_fdec+8, 8*sizeof(pixel) );
        if( b_mbaff )
        {
            h->mb.pic.p_fdec[1][-FDEC_STRIDE-1] = intra_fdec[-1-8];
            h->mb.pic.p_fdec[2][-FDEC_STRIDE-1] = intra_fdec[-1];
        }
    }
    else
    {
        h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fenc[i], FENC_STRIDE, h->mb.pic.p_fenc_plane[i], i_stride2, 16 );
        memcpy( h->mb.pic.p_fdec[i]-FDEC_STRIDE, intra_fdec, 24*sizeof(pixel) );
        if( b_mbaff )
            h->mb.pic.p_fdec[i][-FDEC_STRIDE-1] = intra_fdec[-1];
    }
    if( b_mbaff )
    {
        for( int j = 0; j < height; j++ )
            if( b_chroma )
            {
                h->mb.pic.p_fdec[1][-1+j*FDEC_STRIDE] = plane_fdec[-2+j*i_stride2];
                h->mb.pic.p_fdec[2][-1+j*FDEC_STRIDE] = plane_fdec[-1+j*i_stride2];
            }
            else
                h->mb.pic.p_fdec[i][-1+j*FDEC_STRIDE] = plane_fdec[-1+j*i_stride2];
    }
    pixel *plane_src, **filtered_src;
    for( int j = 0; j < h->mb.pic.i_fref[0]; j++ )
    {
        // Interpolate between pixels in same field.
        if( mb_interlaced )
        {
            plane_src = h->fref[0][j>>1]->plane_fld[i];
            filtered_src = h->fref[0][j>>1]->filtered_fld[i];
        }
        else
        {
            plane_src = h->fref[0][j]->plane[i];
            filtered_src = h->fref[0][j]->filtered[i];
        }
        h->mb.pic.p_fref[0][j][i*4] = plane_src + ref_pix_offset[j&1];

        if( !b_chroma )
        {
            for( int k = 1; k < 4; k++ )
                h->mb.pic.p_fref[0][j][i*4+k] = filtered_src[k] + ref_pix_offset[j&1];
            if( !i )
            {
                if( h->sh.weight[j][0].weightfn )
                    h->mb.pic.p_fref_w[j] = &h->fenc->weighted[j >> mb_interlaced][ref_pix_offset[j&1]];
                else
                    h->mb.pic.p_fref_w[j] = h->mb.pic.p_fref[0][j][0];
            }
        }
    }
    if( h->sh.i_type == SLICE_TYPE_B )
        for( int j = 0; j < h->mb.pic.i_fref[1]; j++ )
        {
            if( mb_interlaced )
            {
                plane_src = h->fref[1][j>>1]->plane_fld[i];
                filtered_src = h->fref[1][j>>1]->filtered_fld[i];
            }
            else
            {
                plane_src = h->fref[1][j]->plane[i];
                filtered_src = h->fref[1][j]->filtered[i];
            }
            h->mb.pic.p_fref[1][j][i*4] = plane_src + ref_pix_offset[j&1];

            if( !b_chroma )
                for( int k = 1; k < 4; k++ )
                    h->mb.pic.p_fref[1][j][i*4+k] = filtered_src[k] + ref_pix_offset[j&1];
        }
}

static const x264_left_table_t left_indices[4] =
{
    /* Current is progressive */
    {{ 4, 4, 5, 5}, { 3,  3,  7,  7}, {16+1, 16+1, 32+1, 32+1}, {0, 0, 1, 1}, {0, 0, 0, 0}},
    {{ 6, 6, 3, 3}, {11, 11, 15, 15}, {16+5, 16+5, 32+5, 32+5}, {2, 2, 3, 3}, {1, 1, 1, 1}},
    /* Current is interlaced */
    {{ 4, 6, 4, 6}, { 3, 11,  3, 11}, {16+1, 16+1, 32+1, 32+1}, {0, 2, 0, 2}, {0, 1, 0, 1}},
    /* Both same */
    {{ 4, 5, 6, 3}, { 3,  7, 11, 15}, {16+1, 16+5, 32+1, 32+5}, {0, 1, 2, 3}, {0, 0, 1, 1}}
};
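
/* One row per MBAFF pairing of the current and left mbs' field parities (see
 * the labels above); each row holds the index tables (intra, nnz, nnz_chroma,
 * mv, ...) used to address the left neighbour's caches. cache_load_neighbours
 * selects the row, and the left-edge loads in cache_load index through it. */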

static void ALWAYS_INLINE x264_macroblock_cache_load_neighbours( x264_t *h, int mb_x, int mb_y, int b_interlaced )
{
    const int mb_interlaced = b_interlaced && MB_INTERLACED;
    int top_y = mb_y - (1 << mb_interlaced);
    int top = top_y * h->mb.i_mb_stride + mb_x;

    h->mb.i_mb_x = mb_x;
    h->mb.i_mb_y = mb_y;
    h->mb.i_mb_xy = mb_y * h->mb.i_mb_stride + mb_x;
    h->mb.i_b8_xy = 2*(mb_y * h->mb.i_b8_stride + mb_x);
    h->mb.i_b4_xy = 4*(mb_y * h->mb.i_b4_stride + mb_x);
    h->mb.left_b8[0] =
    h->mb.left_b8[1] = -1;
    h->mb.left_b4[0] =
    h->mb.left_b4[1] = -1;
    h->mb.i_neighbour = 0;
    h->mb.i_neighbour_intra = 0;
    h->mb.i_neighbour_frame = 0;
    h->mb.i_mb_top_xy = -1;
    h->mb.i_mb_top_y = -1;
    h->mb.i_mb_left_xy[0] = h->mb.i_mb_left_xy[1] = -1;
    h->mb.i_mb_topleft_xy = -1;
    h->mb.i_mb_topright_xy = -1;
    h->mb.i_mb_type_top = -1;
    h->mb.i_mb_type_left[0] = h->mb.i_mb_type_left[1] = -1;
    h->mb.i_mb_type_topleft = -1;
    h->mb.i_mb_type_topright = -1;
    h->mb.left_index_table = &left_indices[3];
    h->mb.topleft_partition = 0;

    int topleft_y = top_y;
    int topright_y = top_y;
    int left[2];

    left[0] = left[1] = h->mb.i_mb_xy - 1;
    h->mb.left_b8[0] = h->mb.left_b8[1] = h->mb.i_b8_xy - 2;
    h->mb.left_b4[0] = h->mb.left_b4[1] = h->mb.i_b4_xy - 4;

    if( b_interlaced )
    {
        h->mb.i_mb_top_mbpair_xy = h->mb.i_mb_xy - 2*h->mb.i_mb_stride;
        h->mb.i_mb_topleft_y = -1;
        h->mb.i_mb_topright_y = -1;

        if( mb_y&1 )
        {
            if( mb_x && mb_interlaced != h->mb.field[h->mb.i_mb_xy-1] )
            {
                left[0] = left[1] = h->mb.i_mb_xy - 1 - h->mb.i_mb_stride;
                h->mb.left_b8[0] = h->mb.left_b8[1] = h->mb.i_b8_xy - 2 - 2*h->mb.i_b8_stride;
                h->mb.left_b4[0] = h->mb.left_b4[1] = h->mb.i_b4_xy - 4 - 4*h->mb.i_b4_stride;

                if( mb_interlaced )
                {
                    h->mb.left_index_table = &left_indices[2];
                    left[1] += h->mb.i_mb_stride;
                    h->mb.left_b8[1] += 2*h->mb.i_b8_stride;
                    h->mb.left_b4[1] += 4*h->mb.i_b4_stride;
                }
                else
                {
                    h->mb.left_index_table = &left_indices[1];
                    topleft_y++;
                    h->mb.topleft_partition = 1;
                }
            }
            if( !mb_interlaced )
                topright_y = -1;
        }
        else
        {
            if( mb_interlaced && top >= 0 )
            {
                if( !h->mb.field[top] )
                {
                    top += h->mb.i_mb_stride;
                    top_y++;
                }
                if( mb_x )
                    topleft_y += !h->mb.field[h->mb.i_mb_stride*topleft_y + mb_x - 1];
                if( mb_x < h->mb.i_mb_width-1 )
                    topright_y += !h->mb.field[h->mb.i_mb_stride*topright_y + mb_x + 1];
            }
            if( mb_x && mb_interlaced != h->mb.field[h->mb.i_mb_xy-1] )
            {
                if( mb_interlaced )
                {
                    h->mb.left_index_table = &left_indices[2];
                    left[1] += h->mb.i_mb_stride;
                    h->mb.left_b8[1] += 2*h->mb.i_b8_stride;
                    h->mb.left_b4[1] += 4*h->mb.i_b4_stride;
                }
                else
                    h->mb.left_index_table = &left_indices[0];
            }
        }
    }

    if( mb_x > 0 )
    {
        h->mb.i_neighbour_frame |= MB_LEFT;
        h->mb.i_mb_left_xy[0] = left[0];
        h->mb.i_mb_left_xy[1] = left[1];
        h->mb.i_mb_type_left[0] = h->mb.type[h->mb.i_mb_left_xy[0]];
        h->mb.i_mb_type_left[1] = h->mb.type[h->mb.i_mb_left_xy[1]];
        if( h->mb.slice_table[left[0]] == h->sh.i_first_mb )
        {
            h->mb.i_neighbour |= MB_LEFT;

            // FIXME: We don't currently support constrained intra + mbaff.
            if( !h->param.b_constrained_intra || IS_INTRA( h->mb.i_mb_type_left[0] ) )
                h->mb.i_neighbour_intra |= MB_LEFT;
        }
    }

    /* We can't predict from the previous threadslice since it hasn't been encoded yet. */
    if( (h->i_threadslice_start >> mb_interlaced) != (mb_y >> mb_interlaced) )
    {
        if( top >= 0 )
        {
            h->mb.i_neighbour_frame |= MB_TOP;
            h->mb.i_mb_top_xy = top;
            h->mb.i_mb_top_y = top_y;
            h->mb.i_mb_type_top = h->mb.type[h->mb.i_mb_top_xy];
            if( h->mb.slice_table[top] == h->sh.i_first_mb )
            {
                h->mb.i_neighbour |= MB_TOP;

                if( !h->param.b_constrained_intra || IS_INTRA( h->mb.i_mb_type_top ) )
                    h->mb.i_neighbour_intra |= MB_TOP;

                /* We only need to prefetch the top blocks because the left was just written
                 * to as part of the previous cache_save.  Since most target CPUs use write-allocate
                 * caches, left blocks are near-guaranteed to be in L1 cache.  Top--not so much. */
                x264_prefetch( &h->mb.cbp[top] );
                x264_prefetch( h->mb.intra4x4_pred_mode[top] );
                x264_prefetch( &h->mb.non_zero_count[top][12] );
                /* These aren't always allocated, but prefetching an invalid address can't hurt. */
                x264_prefetch( &h->mb.mb_transform_size[top] );
                x264_prefetch( &h->mb.skipbp[top] );
            }
        }

        if( mb_x > 0 && topleft_y >= 0  )
        {
            h->mb.i_neighbour_frame |= MB_TOPLEFT;
            h->mb.i_mb_topleft_xy = h->mb.i_mb_stride*topleft_y + mb_x - 1;
            h->mb.i_mb_topleft_y = topleft_y;
            h->mb.i_mb_type_topleft = h->mb.type[h->mb.i_mb_topleft_xy];
            if( h->mb.slice_table[h->mb.i_mb_topleft_xy] == h->sh.i_first_mb )
            {
                h->mb.i_neighbour |= MB_TOPLEFT;

                if( !h->param.b_constrained_intra || IS_INTRA( h->mb.i_mb_type_topleft ) )
                    h->mb.i_neighbour_intra |= MB_TOPLEFT;
            }
        }

        if( mb_x < h->mb.i_mb_width - 1 && topright_y >= 0 )
        {
            h->mb.i_neighbour_frame |= MB_TOPRIGHT;
            h->mb.i_mb_topright_xy = h->mb.i_mb_stride*topright_y + mb_x + 1;
            h->mb.i_mb_topright_y = topright_y;
            h->mb.i_mb_type_topright = h->mb.type[h->mb.i_mb_topright_xy];
            if( h->mb.slice_table[h->mb.i_mb_topright_xy] == h->sh.i_first_mb )
            {
                h->mb.i_neighbour |= MB_TOPRIGHT;

                if( !h->param.b_constrained_intra || IS_INTRA( h->mb.i_mb_type_topright ) )
                    h->mb.i_neighbour_intra |= MB_TOPRIGHT;
            }
        }
    }
}

#define LTOP 0
#if HAVE_INTERLACED
#   define LBOT 1
#else
#   define LBOT 0
#endif

static void ALWAYS_INLINE x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y, int b_mbaff )
{
    x264_macroblock_cache_load_neighbours( h, mb_x, mb_y, b_mbaff );

    int *left = h->mb.i_mb_left_xy;
    int top  = h->mb.i_mb_top_xy;
    int top_y = h->mb.i_mb_top_y;
    int s8x8 = h->mb.i_b8_stride;
    int s4x4 = h->mb.i_b4_stride;
    int top_8x8 = (2*top_y+1) * s8x8 + 2*mb_x;
    int top_4x4 = (4*top_y+3) * s4x4 + 4*mb_x;
    int lists = (1 << h->sh.i_type) & 3;

    /* GCC pessimizes direct loads from heap-allocated arrays due to aliasing. */
    /* By only dereferencing them once, we avoid this issue. */
    int8_t (*i4x4)[8] = h->mb.intra4x4_pred_mode;
    uint8_t (*nnz)[48] = h->mb.non_zero_count;
    int16_t *cbp = h->mb.cbp;

    const x264_left_table_t *left_index_table = h->mb.left_index_table;

    /* load cache */
    if( h->mb.i_neighbour & MB_TOP )
    {
        h->mb.cache.i_cbp_top = cbp[top];
        /* load intra4x4 */
        CP32( &h->mb.cache.intra4x4_pred_mode[x264_scan8[0] - 8], &i4x4[top][0] );

        /* load non_zero_count */
        CP32( &h->mb.cache.non_zero_count[x264_scan8[ 0] - 8], &nnz[top][12] );
        CP32( &h->mb.cache.non_zero_count[x264_scan8[16] - 8], &nnz[top][16-4 + (16>>CHROMA_V_SHIFT)] );
        CP32( &h->mb.cache.non_zero_count[x264_scan8[32] - 8], &nnz[top][32-4 + (16>>CHROMA_V_SHIFT)] );

        /* Finish the prefetching */
        for( int l = 0; l < lists; l++ )
        {
            x264_prefetch( &h->mb.mv[l][top_4x4-1] );
            /* Top right being not in the same cacheline as top left will happen
             * once every 4 MBs, so one extra prefetch is worthwhile */
            x264_prefetch( &h->mb.mv[l][top_4x4+4] );
            x264_prefetch( &h->mb.ref[l][top_8x8-1] );
            x264_prefetch( &h->mb.mvd[l][top] );
        }
    }
    else
    {
        h->mb.cache.i_cbp_top = -1;

        /* load intra4x4 */
        M32( &h->mb.cache.intra4x4_pred_mode[x264_scan8[0] - 8] ) = 0xFFFFFFFFU;

        /* load non_zero_count */
        M32( &h->mb.cache.non_zero_count[x264_scan8[ 0] - 8] ) = 0x80808080U;
        M32( &h->mb.cache.non_zero_count[x264_scan8[16] - 8] ) = 0x80808080U;
        M32( &h->mb.cache.non_zero_count[x264_scan8[32] - 8] ) = 0x80808080U;
    }

    if( h->mb.i_neighbour & MB_LEFT )
    {
        int ltop = left[LTOP];
        int lbot = b_mbaff ? left[LBOT] : ltop;
        if( b_mbaff )
        {
            const int16_t top_luma = (cbp[ltop] >> (left_index_table->mv[0]&(~1))) & 2;
            const int16_t bot_luma = (cbp[lbot] >> (left_index_table->mv[2]&(~1))) & 2;
            h->mb.cache.i_cbp_left = (cbp[ltop] & 0xfff0) | (bot_luma<<2) | top_luma;
        }
        else
            h->mb.cache.i_cbp_left = cbp[ltop];

        /* load intra4x4 */
        h->mb.cache.intra4x4_pred_mode[x264_scan8[ 0] - 1] = i4x4[ltop][left_index_table->intra[0]];
        h->mb.cache.intra4x4_pred_mode[x264_scan8[ 2] - 1] = i4x4[ltop][left_index_table->intra[1]];
        h->mb.cache.intra4x4_pred_mode[x264_scan8[ 8] - 1] = i4x4[lbot][left_index_table->intra[2]];
        h->mb.cache.intra4x4_pred_mode[x264_scan8[10] - 1] = i4x4[lbot][left_index_table->intra[3]];

        /* load non_zero_count */
        h->mb.cache.non_zero_count[x264_scan8[ 0] - 1] = nnz[ltop][left_index_table->nnz[0]];
        h->mb.cache.non_zero_count[x264_scan8[ 2] - 1] = nnz[ltop][left_index_table->nnz[1]];
        h->mb.cache.non_zero_count[x264_scan8[ 8] - 1] = nnz[lbot][left_index_table->nnz[2]];
        h->mb.cache.non_zero_count[x264_scan8[10] - 1] = nnz[lbot][left_index_table->nnz[3]];

        if( CHROMA_FORMAT >= CHROMA_422 )
        {
            int offset = (4>>CHROMA_H_SHIFT) - 4;
            h->mb.cache.non_zero_count[x264_scan8[16+ 0] - 1] = nnz[ltop][left_index_table->nnz[0]+16+offset];
            h->mb.cache.non_zero_count[x264_scan8[16+ 2] - 1] = nnz[ltop][left_index_table->nnz[1]+16+offset];
            h->mb.cache.non_zero_count[x264_scan8[16+ 8] - 1] = nnz[lbot][left_index_table->nnz[2]+16+offset];
            h->mb.cache.non_zero_count[x264_scan8[16+10] - 1] = nnz[lbot][left_index_table->nnz[3]+16+offset];
            h->mb.cache.non_zero_count[x264_scan8[32+ 0] - 1] = nnz[ltop][left_index_table->nnz[0]+32+offset];
            h->mb.cache.non_zero_count[x264_scan8[32+ 2] - 1] = nnz[ltop][left_index_table->nnz[1]+32+offset];
            h->mb.cache.non_zero_count[x264_scan8[32+ 8] - 1] = nnz[lbot][left_index_table->nnz[2]+32+offset];
            h->mb.cache.non_zero_count[x264_scan8[32+10] - 1] = nnz[lbot][left_index_table->nnz[3]+32+offset];
        }
        else
        {
            h->mb.cache.non_zero_count[x264_scan8[16+ 0] - 1] = nnz[ltop][left_index_table->nnz_chroma[0]];
            h->mb.cache.non_zero_count[x264_scan8[16+ 2] - 1] = nnz[lbot][left_index_table->nnz_chroma[1]];
            h->mb.cache.non_zero_count[x264_scan8[32+ 0] - 1] = nnz[ltop][left_index_table->nnz_chroma[2]];
            h->mb.cache.non_zero_count[x264_scan8[32+ 2] - 1] = nnz[lbot][left_index_table->nnz_chroma[3]];
        }
    }
    else
    {
        h->mb.cache.i_cbp_left = -1;

        h->mb.cache.intra4x4_pred_mode[x264_scan8[ 0] - 1] =
        h->mb.cache.intra4x4_pred_mode[x264_scan8[ 2] - 1] =
        h->mb.cache.intra4x4_pred_mode[x264_scan8[ 8] - 1] =
        h->mb.cache.intra4x4_pred_mode[x264_scan8[10] - 1] = -1;

        /* load non_zero_count */
        h->mb.cache.non_zero_count[x264_scan8[ 0] - 1] =
        h->mb.cache.non_zero_count[x264_scan8[ 2] - 1] =
        h->mb.cache.non_zero_count[x264_scan8[ 8] - 1] =
        h->mb.cache.non_zero_count[x264_scan8[10] - 1] =
        h->mb.cache.non_zero_count[x264_scan8[16+ 0] - 1] =
        h->mb.cache.non_zero_count[x264_scan8[16+ 2] - 1] =
        h->mb.cache.non_zero_count[x264_scan8[32+ 0] - 1] =
        h->mb.cache.non_zero_count[x264_scan8[32+ 2] - 1] = 0x80;
        if( CHROMA_FORMAT >= CHROMA_422 )
        {
            h->mb.cache.non_zero_count[x264_scan8[16+ 8] - 1] =
            h->mb.cache.non_zero_count[x264_scan8[16+10] - 1] =
            h->mb.cache.non_zero_count[x264_scan8[32+ 8] - 1] =
            h->mb.cache.non_zero_count[x264_scan8[32+10] - 1] = 0x80;
        }
    }

    if( h->pps->b_transform_8x8_mode )
    {
        h->mb.cache.i_neighbour_transform_size =
            ( (h->mb.i_neighbour & MB_LEFT) && h->mb.mb_transform_size[left[0]] )
          + ( (h->mb.i_neighbour & MB_TOP) && h->mb.mb_transform_size[top]  );
    }

    if( b_mbaff )
    {
        h->mb.pic.i_fref[0] = h->i_ref[0] << MB_INTERLACED;
        h->mb.pic.i_fref[1] = h->i_ref[1] << MB_INTERLACED;
    }

    if( !b_mbaff )
    {
        x264_copy_column8( h->mb.pic.p_fdec[0]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[0]+15+ 4*FDEC_STRIDE );
        x264_copy_column8( h->mb.pic.p_fdec[0]-1+12*FDEC_STRIDE, h->mb.pic.p_fdec[0]+15+12*FDEC_STRIDE );
        x264_macroblock_load_pic_pointers( h, mb_x, mb_y, 0, 0, 0 );
        if( CHROMA444 )
        {
            x264_copy_column8( h->mb.pic.p_fdec[1]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[1]+15+ 4*FDEC_STRIDE );
            x264_copy_column8( h->mb.pic.p_fdec[1]-1+12*FDEC_STRIDE, h->mb.pic.p_fdec[1]+15+12*FDEC_STRIDE );
            x264_copy_column8( h->mb.pic.p_fdec[2]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[2]+15+ 4*FDEC_STRIDE );
            x264_copy_column8( h->mb.pic.p_fdec[2]-1+12*FDEC_STRIDE, h->mb.pic.p_fdec[2]+15+12*FDEC_STRIDE );
            x264_macroblock_load_pic_pointers( h, mb_x, mb_y, 1, 0, 0 );
            x264_macroblock_load_pic_pointers( h, mb_x, mb_y, 2, 0, 0 );
        }
        else
        {
            x264_copy_column8( h->mb.pic.p_fdec[1]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[1]+ 7+ 4*FDEC_STRIDE );
            x264_copy_column8( h->mb.pic.p_fdec[2]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[2]+ 7+ 4*FDEC_STRIDE );
            if( CHROMA_FORMAT == CHROMA_422 )
            {
                x264_copy_column8( h->mb.pic.p_fdec[1]-1+12*FDEC_STRIDE, h->mb.pic.p_fdec[1]+ 7+12*FDEC_STRIDE );
                x264_copy_column8( h->mb.pic.p_fdec[2]-1+12*FDEC_STRIDE, h->mb.pic.p_fdec[2]+ 7+12*FDEC_STRIDE );
            }
            x264_macroblock_load_pic_pointers( h, mb_x, mb_y, 1, 1, 0 );
        }
    }
    else
    {
        x264_macroblock_load_pic_pointers( h, mb_x, mb_y, 0, 0, 1 );
        if( CHROMA444 )
        {
            x264_macroblock_load_pic_pointers( h, mb_x, mb_y, 1, 0, 1 );
            x264_macroblock_load_pic_pointers( h, mb_x, mb_y, 2, 0, 1 );
        }
        else
            x264_macroblock_load_pic_pointers( h, mb_x, mb_y, 1, 1, 1 );
    }

    if( h->fdec->integral )
    {
        int offset = 16 * (mb_x + mb_y * h->fdec->i_stride[0]);
        for( int list = 0; list < 2; list++ )
            for( int i = 0; i < h->mb.pic.i_fref[list]; i++ )
                h->mb.pic.p_integral[list][i] = &h->fref[list][i]->integral[offset];
    }

    x264_prefetch_fenc( h, h->fenc, mb_x, mb_y );

    /* load ref/mv/mvd */
    for( int l = 0; l < lists; l++ )
    {
        int16_t (*mv)[2] = h->mb.mv[l];
        int8_t *ref = h->mb.ref[l];

        int i8 = x264_scan8[0] - 1 - 1*8;
        if( h->mb.i_neighbour & MB_TOPLEFT )
        {
            int ir = b_mbaff ? 2*(s8x8*h->mb.i_mb_topleft_y + mb_x-1)+1+s8x8 : top_8x8 - 1;
            int iv = b_mbaff ? 4*(s4x4*h->mb.i_mb_topleft_y + mb_x-1)+3+3*s4x4 : top_4x4 - 1;
            if( b_mbaff && h->mb.topleft_partition )
            {
                /* Take motion vector from the middle of macroblock instead of
                 * the bottom right as usual. */
                iv -= 2*s4x4;
                ir -= s8x8;
            }
            h->mb.cache.ref[l][i8] = ref[ir];
            CP32( h->mb.cache.mv[l][i8], mv[iv] );
        }
        else
        {
            h->mb.cache.ref[l][i8] = -2;
            M32( h->mb.cache.mv[l][i8] ) = 0;
        }

        i8 = x264_scan8[0] - 8;
        if( h->mb.i_neighbour & MB_TOP )
        {
            h->mb.cache.ref[l][i8+0] =
            h->mb.cache.ref[l][i8+1] = ref[top_8x8 + 0];
            h->mb.cache.ref[l][i8+2] =
            h->mb.cache.ref[l][i8+3] = ref[top_8x8 + 1];
            CP128( h->mb.cache.mv[l][i8], mv[top_4x4] );
        }
        else
        {
            M128( h->mb.cache.mv[l][i8] ) = M128_ZERO;
            M32( &h->mb.cache.ref[l][i8] ) = (uint8_t)(-2) * 0x01010101U;
        }

        i8 = x264_scan8[0] + 4 - 1*8;
        if( h->mb.i_neighbour & MB_TOPRIGHT )
        {
            int ir = b_mbaff ? 2*(s8x8*h->mb.i_mb_topright_y + (mb_x+1))+s8x8 : top_8x8 + 2;
            int iv = b_mbaff ? 4*(s4x4*h->mb.i_mb_topright_y + (mb_x+1))+3*s4x4 : top_4x4 + 4;
            h->mb.cache.ref[l][i8] = ref[ir];
            CP32( h->mb.cache.mv[l][i8], mv[iv] );
        }
        else
            h->mb.cache.ref[l][i8] = -2;

        i8 = x264_scan8[0] - 1;
        if( h->mb.i_neighbour & MB_LEFT )
        {
            if( b_mbaff )
            {
                h->mb.cache.ref[l][i8+0*8] = ref[h->mb.left_b8[LTOP] + 1 + s8x8*left_index_table->ref[0]];
                h->mb.cache.ref[l][i8+1*8] = ref[h->mb.left_b8[LTOP] + 1 + s8x8*left_index_table->ref[1]];
                h->mb.cache.ref[l][i8+2*8] = ref[h->mb.left_b8[LBOT] + 1 + s8x8*left_index_table->ref[2]];
                h->mb.cache.ref[l][i8+3*8] = ref[h->mb.left_b8[LBOT] + 1 + s8x8*left_index_table->ref[3]];

                CP32( h->mb.cache.mv[l][i8+0*8], mv[h->mb.left_b4[LTOP] + 3 + s4x4*left_index_table->mv[0]] );
                CP32( h->mb.cache.mv[l][i8+1*8], mv[h->mb.left_b4[LTOP] + 3 + s4x4*left_index_table->mv[1]] );
                CP32( h->mb.cache.mv[l][i8+2*8], mv[h->mb.left_b4[LBOT] + 3 + s4x4*left_index_table->mv[2]] );
                CP32( h->mb.cache.mv[l][i8+3*8], mv[h->mb.left_b4[LBOT] + 3 + s4x4*left_index_table->mv[3]] );
            }
            else
            {
                const int ir = h->mb.i_b8_xy - 1;
                const int iv = h->mb.i_b4_xy - 1;
                h->mb.cache.ref[l][i8+0*8] =
                h->mb.cache.ref[l][i8+1*8] = ref[ir + 0*s8x8];
                h->mb.cache.ref[l][i8+2*8] =
                h->mb.cache.ref[l][i8+3*8] = ref[ir + 1*s8x8];

                CP32( h->mb.cache.mv[l][i8+0*8], mv[iv + 0*s4x4] );
                CP32( h->mb.cache.mv[l][i8+1*8], mv[iv + 1*s4x4] );
                CP32( h->mb.cache.mv[l][i8+2*8], mv[iv + 2*s4x4] );
                CP32( h->mb.cache.mv[l][i8+3*8], mv[iv + 3*s4x4] );
            }
        }
        else
        {
            for( int i = 0; i < 4; i++ )
            {
                h->mb.cache.ref[l][i8+i*8] = -2;
                M32( h->mb.cache.mv[l][i8+i*8] ) = 0;
            }
        }

        /* Extra logic for top right mv in mbaff.
         * . . . d  . . a .
         * . . . e  . . . .
         * . . . f  b . c .
         * . . . .  . . . .
         *
         * If the top right of the 4x4 partitions labeled a, b and c in the
         * above diagram do not exist, but the entries d, e and f exist (in
         * the macroblock to the left) then use those instead.
         */
        if( b_mbaff && (h->mb.i_neighbour & MB_LEFT) )
        {
            if( MB_INTERLACED && !h->mb.field[h->mb.i_mb_xy-1] )
            {
                h->mb.cache.topright_ref[l][0] = ref[h->mb.left_b8[0] + 1 + s8x8*0];
                h->mb.cache.topright_ref[l][1] = ref[h->mb.left_b8[0] + 1 + s8x8*1];
                h->mb.cache.topright_ref[l][2] = ref[h->mb.left_b8[1] + 1 + s8x8*0];
                CP32( h->mb.cache.topright_mv[l][0], mv[h->mb.left_b4[0] + 3 + s4x4*(left_index_table->mv[0]+1)] );
                CP32( h->mb.cache.topright_mv[l][1], mv[h->mb.left_b4[0] + 3 + s4x4*(left_index_table->mv[1]+1)] );
                CP32( h->mb.cache.topright_mv[l][2], mv[h->mb.left_b4[1] + 3 + s4x4*(left_index_table->mv[2]+1)] );
            }
            else if( !MB_INTERLACED && h->mb.field[h->mb.i_mb_xy-1] )
            {
                // Looking at the bottom field so always take the bottom macroblock of the pair.
                h->mb.cache.topright_ref[l][0] = ref[h->mb.left_b8[0] + 1 + s8x8*2 + s8x8*left_index_table->ref[0]];
                h->mb.cache.topright_ref[l][1] = ref[h->mb.left_b8[0] + 1 + s8x8*2 + s8x8*left_index_table->ref[1]];
                h->mb.cache.topright_ref[l][2] = ref[h->mb.left_b8[0] + 1 + s8x8*2 + s8x8*left_index_table->ref[2]];
                CP32( h->mb.cache.topright_mv[l][0], mv[h->mb.left_b4[0] + 3 + s4x4*4 + s4x4*left_index_table->mv[0]] );
                CP32( h->mb.cache.topright_mv[l][1], mv[h->mb.left_b4[0] + 3 + s4x4*4 + s4x4*left_index_table->mv[1]] );
                CP32( h->mb.cache.topright_mv[l][2], mv[h->mb.left_b4[0] + 3 + s4x4*4 + s4x4*left_index_table->mv[2]] );
            }
        }

        if( h->param.b_cabac )
        {
            uint8_t (*mvd)[8][2] = h->mb.mvd[l];
            if( h->mb.i_neighbour & MB_TOP )
                CP64( h->mb.cache.mvd[l][x264_scan8[0] - 8], mvd[top][0] );
            else
                M64( h->mb.cache.mvd[l][x264_scan8[0] - 8] ) = 0;

            if( h->mb.i_neighbour & MB_LEFT && (!b_mbaff || h->mb.cache.ref[l][x264_scan8[0]-1] >= 0) )
            {
                CP16( h->mb.cache.mvd[l][x264_scan8[0 ] - 1], mvd[left[LTOP]][left_index_table->intra[0]] );
                CP16( h->mb.cache.mvd[l][x264_scan8[2 ] - 1], mvd[left[LTOP]][left_index_table->intra[1]] );
            }
            else
            {
                M16( h->mb.cache.mvd[l][x264_scan8[0]-1+0*8] ) = 0;
                M16( h->mb.cache.mvd[l][x264_scan8[0]-1+1*8] ) = 0;
            }
            if( h->mb.i_neighbour & MB_LEFT && (!b_mbaff || h->mb.cache.ref[l][x264_scan8[0]-1+2*8] >=0) )
            {
                CP16( h->mb.cache.mvd[l][x264_scan8[8 ] - 1], mvd[left[LBOT]][left_index_table->intra[2]] );
                CP16( h->mb.cache.mvd[l][x264_scan8[10] - 1], mvd[left[LBOT]][left_index_table->intra[3]] );
            }
            else
            {
                M16( h->mb.cache.mvd[l][x264_scan8[0]-1+2*8] ) = 0;
                M16( h->mb.cache.mvd[l][x264_scan8[0]-1+3*8] ) = 0;
            }
        }

        /* If motion vectors are cached from frame macroblocks but this
         * macroblock is a field macroblock then the motion vector must be
         * halved. Similarly, motion vectors from field macroblocks are doubled. */
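        /* e.g. a frame-MB vertical mv of +16 (four luma rows in quarter-pel
         * units) spans two field rows, so it maps to +8 in a field MB. */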
        if( b_mbaff )
        {
#define MAP_MVS\
                if( FIELD_DIFFERENT(h->mb.i_mb_topleft_xy) )\
                    MAP_F2F(mv, ref, x264_scan8[0] - 1 - 1*8)\
                if( FIELD_DIFFERENT(top) )\
                {\
                    MAP_F2F(mv, ref, x264_scan8[0] + 0 - 1*8)\
                    MAP_F2F(mv, ref, x264_scan8[0] + 1 - 1*8)\
                    MAP_F2F(mv, ref, x264_scan8[0] + 2 - 1*8)\
                    MAP_F2F(mv, ref, x264_scan8[0] + 3 - 1*8)\
                }\
                if( FIELD_DIFFERENT(h->mb.i_mb_topright_xy) )\
                    MAP_F2F(mv, ref, x264_scan8[0] + 4 - 1*8)\
                if( FIELD_DIFFERENT(left[0]) )\
                {\
                    MAP_F2F(mv, ref, x264_scan8[0] - 1 + 0*8)\
                    MAP_F2F(mv, ref, x264_scan8[0] - 1 + 1*8)\
                    MAP_F2F(mv, ref, x264_scan8[0] - 1 + 2*8)\
                    MAP_F2F(mv, ref, x264_scan8[0] - 1 + 3*8)\
                    MAP_F2F(topright_mv, topright_ref, 0)\
                    MAP_F2F(topright_mv, topright_ref, 1)\
                    MAP_F2F(topright_mv, topright_ref, 2)\
                }

            if( MB_INTERLACED )
            {
#define FIELD_DIFFERENT(macroblock) (macroblock >= 0 && !h->mb.field[macroblock])
#define MAP_F2F(varmv, varref, index)\
                if( h->mb.cache.varref[l][index] >= 0 )\
                {\
                    h->mb.cache.varref[l][index] <<= 1;\
                    h->mb.cache.varmv[l][index][1] /= 2;\
                    h->mb.cache.mvd[l][index][1] >>= 1;\
                }
                MAP_MVS
#undef MAP_F2F
#undef FIELD_DIFFERENT
            }
            else
            {
#define FIELD_DIFFERENT(macroblock) (macroblock >= 0 && h->mb.field[macroblock])
#define MAP_F2F(varmv, varref, index)\
                if( h->mb.cache.varref[l][index] >= 0 )\
                {\
                    h->mb.cache.varref[l][index] >>= 1;\
                    h->mb.cache.varmv[l][index][1] <<= 1;\
                    h->mb.cache.mvd[l][index][1] <<= 1;\
                }
                MAP_MVS
#undef MAP_F2F
#undef FIELD_DIFFERENT
            }
        }
    }

    if( b_mbaff && mb_x == 0 && !(mb_y&1) && mb_y > 0 )
        h->mb.field_decoding_flag = h->mb.field[h->mb.i_mb_xy - h->mb.i_mb_stride];

    /* Check whether skip here would cause decoder to predict interlace mode incorrectly.
     * FIXME: It might be better to change the interlace type rather than forcing a skip to be non-skip. */
    h->mb.b_allow_skip = 1;
    if( b_mbaff )
    {
        if( MB_INTERLACED != h->mb.field_decoding_flag &&
            h->mb.i_mb_prev_xy >= 0 && IS_SKIP(h->mb.type[h->mb.i_mb_prev_xy]) )
            h->mb.b_allow_skip = 0;
        if( (mb_y&1) && IS_SKIP(h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride]) )
        {
            if( h->mb.i_neighbour & MB_LEFT )
            {
                if( h->mb.field[h->mb.i_mb_xy - 1] != MB_INTERLACED )
                    h->mb.b_allow_skip = 0;
            }
            else if( h->mb.i_neighbour & MB_TOP )
            {
                if( h->mb.field[h->mb.i_mb_top_xy] != MB_INTERLACED )
                    h->mb.b_allow_skip = 0;
            }
            else // Frame mb pair is predicted
            {
                if( MB_INTERLACED )
                    h->mb.b_allow_skip = 0;
            }
        }
    }

    if( h->param.b_cabac )
    {
        if( b_mbaff )
        {
            int left_xy, top_xy;
            /* Neighbours here are calculated based on field_decoding_flag */
            int mb_xy = mb_x + (mb_y&~1)*h->mb.i_mb_stride;
            left_xy = mb_xy - 1;
            if( (mb_y&1) && mb_x > 0 && h->mb.field_decoding_flag == h->mb.field[left_xy] )
                left_xy += h->mb.i_mb_stride;
            if( h->mb.field_decoding_flag )
            {
                top_xy = mb_xy - h->mb.i_mb_stride;
                if( !(mb_y&1) && top_xy >= 0 && h->mb.slice_table[top_xy] == h->sh.i_first_mb && h->mb.field[top_xy] )
                    top_xy -= h->mb.i_mb_stride;
            }
            else
                top_xy = mb_x + (mb_y-1)*h->mb.i_mb_stride;

            h->mb.cache.i_neighbour_skip =   (mb_x >  0 && h->mb.slice_table[left_xy] == h->sh.i_first_mb && !IS_SKIP( h->mb.type[left_xy] ))
                                         + (top_xy >= 0 && h->mb.slice_table[top_xy]  == h->sh.i_first_mb && !IS_SKIP( h->mb.type[top_xy] ));
        }
        else
        {
            h->mb.cache.i_neighbour_skip = ((h->mb.i_neighbour & MB_LEFT) && !IS_SKIP( h->mb.i_mb_type_left[0] ))
                                         + ((h->mb.i_neighbour & MB_TOP)  && !IS_SKIP( h->mb.i_mb_type_top ));
        }
    }

    /* load skip */
    if( h->sh.i_type == SLICE_TYPE_B )
    {
        h->mb.bipred_weight = h->mb.bipred_weight_buf[MB_INTERLACED][MB_INTERLACED&(mb_y&1)];
        h->mb.dist_scale_factor = h->mb.dist_scale_factor_buf[MB_INTERLACED][MB_INTERLACED&(mb_y&1)];
        if( h->param.b_cabac )
        {
            uint8_t skipbp;
            x264_macroblock_cache_skip( h, 0, 0, 4, 4, 0 );
            if( b_mbaff )
            {
                skipbp = (h->mb.i_neighbour & MB_LEFT) ? h->mb.skipbp[left[LTOP]] : 0;
                h->mb.cache.skip[x264_scan8[0] - 1] = (skipbp >> (1+(left_index_table->mv[0]&~1))) & 1;
                skipbp = (h->mb.i_neighbour & MB_LEFT) ? h->mb.skipbp[left[LBOT]] : 0;
                h->mb.cache.skip[x264_scan8[8] - 1] = (skipbp >> (1+(left_index_table->mv[2]&~1))) & 1;
            }
            else
            {
                skipbp = (h->mb.i_neighbour & MB_LEFT) ? h->mb.skipbp[left[0]] : 0;
                h->mb.cache.skip[x264_scan8[0] - 1] = skipbp & 0x2;
                h->mb.cache.skip[x264_scan8[8] - 1] = skipbp & 0x8;
            }
            skipbp = (h->mb.i_neighbour & MB_TOP) ? h->mb.skipbp[top] : 0;
            h->mb.cache.skip[x264_scan8[0] - 8] = skipbp & 0x4;
            h->mb.cache.skip[x264_scan8[4] - 8] = skipbp & 0x8;
        }
    }

    if( h->sh.i_type == SLICE_TYPE_P )
        x264_mb_predict_mv_pskip( h, h->mb.cache.pskip_mv );

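    /* Per-block intra neighbour availability, by scan8 block index: e.g.
     * block 1 always has a left neighbour (block 0 of this MB), while its
     * top/topleft/topright exist only if the MB above is available. */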
    h->mb.i_neighbour4[0] =
    h->mb.i_neighbour8[0] = (h->mb.i_neighbour_intra & (MB_TOP|MB_LEFT|MB_TOPLEFT))
                            | ((h->mb.i_neighbour_intra & MB_TOP) ? MB_TOPRIGHT : 0);
    h->mb.i_neighbour4[4] =
    h->mb.i_neighbour4[1] = MB_LEFT | ((h->mb.i_neighbour_intra & MB_TOP) ? (MB_TOP|MB_TOPLEFT|MB_TOPRIGHT) : 0);
    h->mb.i_neighbour4[2] =
    h->mb.i_neighbour4[8] =
    h->mb.i_neighbour4[10] =
    h->mb.i_neighbour8[2] = MB_TOP|MB_TOPRIGHT | ((h->mb.i_neighbour_intra & MB_LEFT) ? (MB_LEFT|MB_TOPLEFT) : 0);
    h->mb.i_neighbour4[5] =
    h->mb.i_neighbour8[1] = MB_LEFT | (h->mb.i_neighbour_intra & MB_TOPRIGHT)
                            | ((h->mb.i_neighbour_intra & MB_TOP) ? MB_TOP|MB_TOPLEFT : 0);
}

void x264_macroblock_cache_load_progressive( x264_t *h, int mb_x, int mb_y )
{
    x264_macroblock_cache_load( h, mb_x, mb_y, 0 );
}

void x264_macroblock_cache_load_interlaced( x264_t *h, int mb_x, int mb_y )
{
    x264_macroblock_cache_load( h, mb_x, mb_y, 1 );
}

static void x264_macroblock_deblock_strength_mbaff( x264_t *h, uint8_t (*bs)[8][4] )
{
    if( (h->mb.i_neighbour & MB_LEFT) && h->mb.field[h->mb.i_mb_left_xy[0]] != MB_INTERLACED )
    {
        static const uint8_t offset[2][2][8] =
        {   {   { 0, 0, 0, 0, 1, 1, 1, 1 },
                { 2, 2, 2, 2, 3, 3, 3, 3 }, },
            {   { 0, 1, 2, 3, 0, 1, 2, 3 },
                { 0, 1, 2, 3, 0, 1, 2, 3 }, }
        };
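
        /* off[i] picks which 4x4 row of the left neighbour (read at column 3,
         * its right edge) borders left-edge segment i; which rows those are
         * depends on whether the current pair or the left pair is the
         * field-coded one, and on top/bottom parity. */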
        ALIGNED_ARRAY_8( uint8_t, tmpbs, [8] );

        const uint8_t *off = offset[MB_INTERLACED][h->mb.i_mb_y&1];
        uint8_t (*nnz)[48] = h->mb.non_zero_count;

        for( int i = 0; i < 8; i++ )
        {
            int left = h->mb.i_mb_left_xy[MB_INTERLACED ? i>>2 : i&1];
            int nnz_this = h->mb.cache.non_zero_count[x264_scan8[0]+8*(i>>1)];
            int nnz_left = nnz[left][3 + 4*off[i]];
            if( !h->param.b_cabac && h->pps->b_transform_8x8_mode )
            {
                int j = off[i]&~1;
                if( h->mb.mb_transform_size[left] )
                    nnz_left = !!(M16( &nnz[left][2+4*j] ) | M16( &nnz[left][2+4*(1+j)] ));
            }
            tmpbs[i] = (nnz_left || nnz_this) ? 2 : 1;
        }

        if( MB_INTERLACED )
        {
            CP32( bs[0][0], &tmpbs[0] );
            CP32( bs[0][4], &tmpbs[4] );
        }
        else
        {
            for( int i = 0; i < 4; i++ ) bs[0][0][i] = tmpbs[2*i];
            for( int i = 0; i < 4; i++ ) bs[0][4][i] = tmpbs[1+2*i];
        }
    }

    if( (h->mb.i_neighbour & MB_TOP) && MB_INTERLACED != h->mb.field[h->mb.i_mb_top_xy] )
    {
        if( !(h->mb.i_mb_y&1) && !MB_INTERLACED )
        {
            /* Need to filter both fields (even for frame macroblocks).
             * Filter top two rows using the top macroblock of the above
             * pair and then the bottom one. */
            int mbn_xy = h->mb.i_mb_xy - 2 * h->mb.i_mb_stride;
            uint8_t *nnz_cur = &h->mb.cache.non_zero_count[x264_scan8[0]];

            for( int j = 0; j < 2; j++, mbn_xy += h->mb.i_mb_stride )
            {
                uint8_t (*nnz)[48] = h->mb.non_zero_count;

                ALIGNED_4( uint8_t nnz_top[4] );
                CP32( nnz_top, &nnz[mbn_xy][3*4] );

                if( !h->param.b_cabac && h->pps->b_transform_8x8_mode && h->mb.mb_transform_size[mbn_xy] )
                {
                    nnz_top[0] = nnz_top[1] = M16( &nnz[mbn_xy][ 8] ) || M16( &nnz[mbn_xy][12] );
                    nnz_top[2] = nnz_top[3] = M16( &nnz[mbn_xy][10] ) || M16( &nnz[mbn_xy][14] );
                }

                for( int i = 0; i < 4; i++ )
                    bs[1][4*j][i] = (nnz_cur[i] || nnz_top[i]) ? 2 : 1;
            }
        }
        else
            for( int i = 0; i < 4; i++ )
                bs[1][0][i] = X264_MAX( bs[1][0][i], 1 );
    }
}

void x264_macroblock_deblock_strength( x264_t *h )
{
    uint8_t (*bs)[8][4] = h->deblock_strength[h->mb.i_mb_y&1][h->mb.i_mb_x];
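    /* Strengths stored here are 0-3: 3 for intra edges (the bs=4 strong
     * filter on macroblock boundaries is derived later in the deblock loop),
     * 2 when either side has coded residual, 1 for mv/ref mismatches. */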
    if( IS_INTRA( h->mb.i_type ) )
    {
        memset( bs[0][1], 3, 3*4*sizeof(uint8_t) );
        memset( bs[1][1], 3, 3*4*sizeof(uint8_t) );
        return;
    }

    /* Early termination: in this case, nnz guarantees all edges use strength 2. */
    if( h->mb.b_transform_8x8 && !CHROMA444 )
    {
        int cbp_mask = 0xf >> CHROMA_V_SHIFT;
        if( (h->mb.i_cbp_luma&cbp_mask) == cbp_mask )
        {
            M32( bs[0][0] ) = 0x02020202;
            M32( bs[0][2] ) = 0x02020202;
            M32( bs[0][4] ) = 0x02020202;
            memset( bs[1][0], 2, 5*4*sizeof(uint8_t) ); /* [1][1] and [1][3] have to be set for 4:2:2 */
            return;
        }
    }

    int neighbour_changed = 0;
    if( h->sh.i_disable_deblocking_filter_idc != 2 )
    {
        neighbour_changed = h->mb.i_neighbour_frame&~h->mb.i_neighbour;
        h->mb.i_neighbour = h->mb.i_neighbour_frame;
    }

    /* MBAFF deblock uses different left neighbors from encoding */
    if( SLICE_MBAFF && (h->mb.i_neighbour & MB_LEFT) && (h->mb.field[h->mb.i_mb_xy - 1] != MB_INTERLACED) )
    {
        h->mb.i_mb_left_xy[1] =
        h->mb.i_mb_left_xy[0] = h->mb.i_mb_xy - 1;
        if( h->mb.i_mb_y&1 )
            h->mb.i_mb_left_xy[0] -= h->mb.i_mb_stride;
        else
            h->mb.i_mb_left_xy[1] += h->mb.i_mb_stride;
    }

    /* If we have multiple slices and we're deblocking on slice edges, we
     * have to reload neighbour data. */
    if( neighbour_changed )
    {
        int top_y = h->mb.i_mb_top_y;
        int top_8x8 = (2*top_y+1) * h->mb.i_b8_stride + 2*h->mb.i_mb_x;
        int top_4x4 = (4*top_y+3) * h->mb.i_b4_stride + 4*h->mb.i_mb_x;
        int s8x8 = h->mb.i_b8_stride;
        int s4x4 = h->mb.i_b4_stride;

        uint8_t (*nnz)[48] = h->mb.non_zero_count;
        const x264_left_table_t *left_index_table = SLICE_MBAFF ? h->mb.left_index_table : &left_indices[3];

        if( neighbour_changed & MB_TOP )
            CP32( &h->mb.cache.non_zero_count[x264_scan8[0] - 8], &nnz[h->mb.i_mb_top_xy][12] );

        if( neighbour_changed & MB_LEFT )
        {
            int *left = h->mb.i_mb_left_xy;
            h->mb.cache.non_zero_count[x264_scan8[0 ] - 1] = nnz[left[0]][left_index_table->nnz[0]];
            h->mb.cache.non_zero_count[x264_scan8[2 ] - 1] = nnz[left[0]][left_index_table->nnz[1]];
            h->mb.cache.non_zero_count[x264_scan8[8 ] - 1] = nnz[left[1]][left_index_table->nnz[2]];
            h->mb.cache.non_zero_count[x264_scan8[10] - 1] = nnz[left[1]][left_index_table->nnz[3]];
        }

        for( int l = 0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ )
        {
            int16_t (*mv)[2] = h->mb.mv[l];
            int8_t *ref = h->mb.ref[l];

            int i8 = x264_scan8[0] - 8;
            if( neighbour_changed & MB_TOP )
            {
                h->mb.cache.ref[l][i8+0] =
                h->mb.cache.ref[l][i8+1] = ref[top_8x8 + 0];
                h->mb.cache.ref[l][i8+2] =
                h->mb.cache.ref[l][i8+3] = ref[top_8x8 + 1];
                CP128( h->mb.cache.mv[l][i8], mv[top_4x4] );
            }

            i8 = x264_scan8[0] - 1;
            if( neighbour_changed & MB_LEFT )
            {
                h->mb.cache.ref[l][i8+0*8] =
                h->mb.cache.ref[l][i8+1*8] = ref[h->mb.left_b8[0] + 1 + s8x8*left_index_table->ref[0]];
                h->mb.cache.ref[l][i8+2*8] =
                h->mb.cache.ref[l][i8+3*8] = ref[h->mb.left_b8[1] + 1 + s8x8*left_index_table->ref[2]];

                CP32( h->mb.cache.mv[l][i8+0*8], mv[h->mb.left_b4[0] + 3 + s4x4*left_index_table->mv[0]] );
                CP32( h->mb.cache.mv[l][i8+1*8], mv[h->mb.left_b4[0] + 3 + s4x4*left_index_table->mv[1]] );
                CP32( h->mb.cache.mv[l][i8+2*8], mv[h->mb.left_b4[1] + 3 + s4x4*left_index_table->mv[2]] );
                CP32( h->mb.cache.mv[l][i8+3*8], mv[h->mb.left_b4[1] + 3 + s4x4*left_index_table->mv[3]] );
            }
        }
    }

    if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART && h->sh.i_type == SLICE_TYPE_P )
    {
        /* Handle reference frame duplicates */
        int i8 = x264_scan8[0] - 8;
        h->mb.cache.ref[0][i8+0] =
        h->mb.cache.ref[0][i8+1] = deblock_ref_table(h->mb.cache.ref[0][i8+0]);
        h->mb.cache.ref[0][i8+2] =
        h->mb.cache.ref[0][i8+3] = deblock_ref_table(h->mb.cache.ref[0][i8+2]);

        i8 = x264_scan8[0] - 1;
        h->mb.cache.ref[0][i8+0*8] =
        h->mb.cache.ref[0][i8+1*8] = deblock_ref_table(h->mb.cache.ref[0][i8+0*8]);
        h->mb.cache.ref[0][i8+2*8] =
        h->mb.cache.ref[0][i8+3*8] = deblock_ref_table(h->mb.cache.ref[0][i8+2*8]);

        int ref0 = deblock_ref_table(h->mb.cache.ref[0][x264_scan8[ 0]]);
        int ref1 = deblock_ref_table(h->mb.cache.ref[0][x264_scan8[ 4]]);
        int ref2 = deblock_ref_table(h->mb.cache.ref[0][x264_scan8[ 8]]);
        int ref3 = deblock_ref_table(h->mb.cache.ref[0][x264_scan8[12]]);
        uint32_t reftop = pack16to32( (uint8_t)ref0, (uint8_t)ref1 ) * 0x0101;
        uint32_t refbot = pack16to32( (uint8_t)ref2, (uint8_t)ref3 ) * 0x0101;

        M32( &h->mb.cache.ref[0][x264_scan8[0]+8*0] ) = reftop;
        M32( &h->mb.cache.ref[0][x264_scan8[0]+8*1] ) = reftop;
        M32( &h->mb.cache.ref[0][x264_scan8[0]+8*2] ) = refbot;
        M32( &h->mb.cache.ref[0][x264_scan8[0]+8*3] ) = refbot;
    }

    /* Munge NNZ for cavlc + 8x8dct */
    if( !h->param.b_cabac && h->pps->b_transform_8x8_mode )
    {
        uint8_t (*nnz)[48] = h->mb.non_zero_count;
        int top = h->mb.i_mb_top_xy;
        int *left = h->mb.i_mb_left_xy;

        if( (h->mb.i_neighbour & MB_TOP) && h->mb.mb_transform_size[top] )
        {
            int i8 = x264_scan8[0] - 8;
            int nnz_top0 = M16( &nnz[top][8] ) | M16( &nnz[top][12] );
            int nnz_top1 = M16( &nnz[top][10] ) | M16( &nnz[top][14] );
            M16( &h->mb.cache.non_zero_count[i8+0] ) = nnz_top0 ? 0x0101 : 0;
            M16( &h->mb.cache.non_zero_count[i8+2] ) = nnz_top1 ? 0x0101 : 0;
        }

        if( h->mb.i_neighbour & MB_LEFT )
        {
            int i8 = x264_scan8[0] - 1;
            if( h->mb.mb_transform_size[left[0]] )
            {
                int nnz_left0 = M16( &nnz[left[0]][2] ) | M16( &nnz[left[0]][6] );
                h->mb.cache.non_zero_count[i8+8*0] = !!nnz_left0;
                h->mb.cache.non_zero_count[i8+8*1] = !!nnz_left0;
            }
            if( h->mb.mb_transform_size[left[1]] )
            {
                int nnz_left1 = M16( &nnz[left[1]][10] ) | M16( &nnz[left[1]][14] );
                h->mb.cache.non_zero_count[i8+8*2] = !!nnz_left1;
                h->mb.cache.non_zero_count[i8+8*3] = !!nnz_left1;
            }
        }

        if( h->mb.b_transform_8x8 )
        {
            int nnz0 = M16( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) | M16( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
            int nnz1 = M16( &h->mb.cache.non_zero_count[x264_scan8[ 4]] ) | M16( &h->mb.cache.non_zero_count[x264_scan8[ 6]] );
            int nnz2 = M16( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) | M16( &h->mb.cache.non_zero_count[x264_scan8[10]] );
            int nnz3 = M16( &h->mb.cache.non_zero_count[x264_scan8[12]] ) | M16( &h->mb.cache.non_zero_count[x264_scan8[14]] );
            uint32_t nnztop = pack16to32( !!nnz0, !!nnz1 ) * 0x0101;
            uint32_t nnzbot = pack16to32( !!nnz2, !!nnz3 ) * 0x0101;

            M32( &h->mb.cache.non_zero_count[x264_scan8[0]+8*0] ) = nnztop;
            M32( &h->mb.cache.non_zero_count[x264_scan8[0]+8*1] ) = nnztop;
            M32( &h->mb.cache.non_zero_count[x264_scan8[0]+8*2] ) = nnzbot;
            M32( &h->mb.cache.non_zero_count[x264_scan8[0]+8*3] ) = nnzbot;
        }
    }

    h->loopf.deblock_strength( h->mb.cache.non_zero_count, h->mb.cache.ref, h->mb.cache.mv,
                               bs, 4 >> MB_INTERLACED, h->sh.i_type == SLICE_TYPE_B );

    if( SLICE_MBAFF )
        x264_macroblock_deblock_strength_mbaff( h, bs );
}

static void ALWAYS_INLINE x264_macroblock_store_pic( x264_t *h, int mb_x, int mb_y, int i, int b_chroma, int b_mbaff )
{
    int height = b_chroma ? 16>>CHROMA_V_SHIFT : 16;
    int i_stride = h->fdec->i_stride[i];
    int i_stride2 = i_stride << (b_mbaff && MB_INTERLACED);
    int i_pix_offset = (b_mbaff && MB_INTERLACED)
                     ? 16 * mb_x + height * (mb_y&~1) * i_stride + (mb_y&1) * i_stride
                     : 16 * mb_x + height * mb_y * i_stride;
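    /* With MBAFF+field coding the two fields of a pair interleave line by
     * line: the pair's base row is (mb_y&~1), the bottom field starts one
     * line lower, and the doubled stride writes each field to every other
     * line of the frame. */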
    if( b_chroma )
        h->mc.store_interleave_chroma( &h->fdec->plane[1][i_pix_offset], i_stride2, h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2], height );
    else
        h->mc.copy[PIXEL_16x16]( &h->fdec->plane[i][i_pix_offset], i_stride2, h->mb.pic.p_fdec[i], FDEC_STRIDE, 16 );
}

static void ALWAYS_INLINE x264_macroblock_backup_intra( x264_t *h, int mb_x, int mb_y, int b_mbaff )
{
    /* In MBAFF we store the last two rows in intra_border_backup[0] and [1].
     * For progressive mbs this is the bottom two rows, and for interlaced the
     * bottom row of each field. We also store samples needed for the next
     * mbpair in intra_border_backup[2]. */
    int backup_dst = !b_mbaff ? 0 : (mb_y&1) ? 1 : MB_INTERLACED ? 0 : 2;
    memcpy( &h->intra_border_backup[backup_dst][0][mb_x*16  ], h->mb.pic.p_fdec[0]+FDEC_STRIDE*15, 16*sizeof(pixel) );
    if( CHROMA444 )
    {
        memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16  ], h->mb.pic.p_fdec[1]+FDEC_STRIDE*15, 16*sizeof(pixel) );
        memcpy( &h->intra_border_backup[backup_dst][2][mb_x*16  ], h->mb.pic.p_fdec[2]+FDEC_STRIDE*15, 16*sizeof(pixel) );
    }
    else
    {
        int backup_src = (15>>CHROMA_V_SHIFT) * FDEC_STRIDE;
        memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16  ], h->mb.pic.p_fdec[1]+backup_src, 8*sizeof(pixel) );
        memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16+8], h->mb.pic.p_fdec[2]+backup_src, 8*sizeof(pixel) );
    }
    if( b_mbaff )
    {
        if( mb_y&1 )
        {
            int backup_src = (MB_INTERLACED ? 7 : 14) * FDEC_STRIDE;
            backup_dst = MB_INTERLACED ? 2 : 0;
            memcpy( &h->intra_border_backup[backup_dst][0][mb_x*16  ], h->mb.pic.p_fdec[0]+backup_src, 16*sizeof(pixel) );
            if( CHROMA444 )
            {
                memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16  ], h->mb.pic.p_fdec[1]+backup_src, 16*sizeof(pixel) );
                memcpy( &h->intra_border_backup[backup_dst][2][mb_x*16  ], h->mb.pic.p_fdec[2]+backup_src, 16*sizeof(pixel) );
            }
            else
            {
                if( CHROMA_FORMAT == CHROMA_420 )
                    backup_src = (MB_INTERLACED ? 3 : 6) * FDEC_STRIDE;
                memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16  ], h->mb.pic.p_fdec[1]+backup_src,  8*sizeof(pixel) );
                memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16+8], h->mb.pic.p_fdec[2]+backup_src,  8*sizeof(pixel) );
            }
        }
    }
    else
    {
        /* In progressive we update intra_border_backup in-place, so the topleft neighbor will
         * no longer exist there when load_pic_pointers wants it. Move it within p_fdec instead. */
        h->mb.pic.p_fdec[0][-FDEC_STRIDE-1] = h->mb.pic.p_fdec[0][-FDEC_STRIDE+15];
        h->mb.pic.p_fdec[1][-FDEC_STRIDE-1] = h->mb.pic.p_fdec[1][-FDEC_STRIDE+(15>>CHROMA_H_SHIFT)];
        h->mb.pic.p_fdec[2][-FDEC_STRIDE-1] = h->mb.pic.p_fdec[2][-FDEC_STRIDE+(15>>CHROMA_H_SHIFT)];
    }
}

void x264_macroblock_cache_save( x264_t *h )
{
    const int i_mb_xy = h->mb.i_mb_xy;
    const int i_mb_type = x264_mb_type_fix[h->mb.i_type];
    const int s8x8 = h->mb.i_b8_stride;
    const int s4x4 = h->mb.i_b4_stride;
    const int i_mb_4x4 = h->mb.i_b4_xy;
    const int i_mb_8x8 = h->mb.i_b8_xy;

    /* GCC pessimizes direct stores to heap-allocated arrays due to aliasing. */
    /* By only dereferencing them once, we avoid this issue. */
    int8_t *i4x4 = h->mb.intra4x4_pred_mode[i_mb_xy];
    uint8_t *nnz = h->mb.non_zero_count[i_mb_xy];

    if( SLICE_MBAFF )
    {
        x264_macroblock_backup_intra( h, h->mb.i_mb_x, h->mb.i_mb_y, 1 );
        x264_macroblock_store_pic( h, h->mb.i_mb_x, h->mb.i_mb_y, 0, 0, 1 );
        if( CHROMA444 )
        {
            x264_macroblock_store_pic( h, h->mb.i_mb_x, h->mb.i_mb_y, 1, 0, 1 );
            x264_macroblock_store_pic( h, h->mb.i_mb_x, h->mb.i_mb_y, 2, 0, 1 );
        }
        else
            x264_macroblock_store_pic( h, h->mb.i_mb_x, h->mb.i_mb_y, 1, 1, 1 );
    }
    else
    {
        x264_macroblock_backup_intra( h, h->mb.i_mb_x, h->mb.i_mb_y, 0 );
        x264_macroblock_store_pic( h, h->mb.i_mb_x, h->mb.i_mb_y, 0, 0, 0 );
        if( CHROMA444 )
        {
            x264_macroblock_store_pic( h, h->mb.i_mb_x, h->mb.i_mb_y, 1, 0, 0 );
            x264_macroblock_store_pic( h, h->mb.i_mb_x, h->mb.i_mb_y, 2, 0, 0 );
        }
        else
            x264_macroblock_store_pic( h, h->mb.i_mb_x, h->mb.i_mb_y, 1, 1, 0 );
    }

    x264_prefetch_fenc( h, h->fdec, h->mb.i_mb_x, h->mb.i_mb_y );

    h->mb.type[i_mb_xy] = i_mb_type;
    h->mb.slice_table[i_mb_xy] = h->sh.i_first_mb;
    h->mb.partition[i_mb_xy] = IS_INTRA( i_mb_type ) ? D_16x16 : h->mb.i_partition;
    h->mb.i_mb_prev_xy = i_mb_xy;

    /* save intra4x4 */
    if( i_mb_type == I_4x4 )
    {
        CP32( &i4x4[0], &h->mb.cache.intra4x4_pred_mode[x264_scan8[10]] );
        M32( &i4x4[4] ) = pack8to32( h->mb.cache.intra4x4_pred_mode[x264_scan8[5] ],
                                     h->mb.cache.intra4x4_pred_mode[x264_scan8[7] ],
                                     h->mb.cache.intra4x4_pred_mode[x264_scan8[13] ], 0);
    }
    else if( !h->param.b_constrained_intra || IS_INTRA(i_mb_type) )
        M64( i4x4 ) = I_PRED_4x4_DC * 0x0101010101010101ULL;
    else
        M64( i4x4 ) = (uint8_t)(-1) * 0x0101010101010101ULL;


    if( i_mb_type == I_PCM )
    {
        h->mb.qp[i_mb_xy] = 0;
        h->mb.i_last_dqp = 0;
        h->mb.i_cbp_chroma = CHROMA444 ? 0 : 2;
        h->mb.i_cbp_luma = 0xf;
        h->mb.cbp[i_mb_xy] = (h->mb.i_cbp_chroma << 4) | h->mb.i_cbp_luma | 0x700;
        h->mb.b_transform_8x8 = 0;
        for( int i = 0; i < 48; i++ )
            h->mb.cache.non_zero_count[x264_scan8[i]] = h->param.b_cabac ? 1 : 16;
    }
    else
    {
        if( h->mb.i_type != I_16x16 && h->mb.i_cbp_luma == 0 && h->mb.i_cbp_chroma == 0 )
            h->mb.i_qp = h->mb.i_last_qp;
        h->mb.qp[i_mb_xy] = h->mb.i_qp;
        h->mb.i_last_dqp = h->mb.i_qp - h->mb.i_last_qp;
        h->mb.i_last_qp = h->mb.i_qp;
    }

    /* save non zero count */
    CP32( &nnz[ 0+0*4], &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
    CP32( &nnz[ 0+1*4], &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
    CP32( &nnz[ 0+2*4], &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
    CP32( &nnz[ 0+3*4], &h->mb.cache.non_zero_count[x264_scan8[10]] );
    CP32( &nnz[16+0*4], &h->mb.cache.non_zero_count[x264_scan8[16+0]] );
    CP32( &nnz[16+1*4], &h->mb.cache.non_zero_count[x264_scan8[16+2]] );
    CP32( &nnz[32+0*4], &h->mb.cache.non_zero_count[x264_scan8[32+0]] );
    CP32( &nnz[32+1*4], &h->mb.cache.non_zero_count[x264_scan8[32+2]] );
    if( CHROMA_FORMAT >= CHROMA_422 )
    {
        CP32( &nnz[16+2*4], &h->mb.cache.non_zero_count[x264_scan8[16+ 8]] );
        CP32( &nnz[16+3*4], &h->mb.cache.non_zero_count[x264_scan8[16+10]] );
        CP32( &nnz[32+2*4], &h->mb.cache.non_zero_count[x264_scan8[32+ 8]] );
        CP32( &nnz[32+3*4], &h->mb.cache.non_zero_count[x264_scan8[32+10]] );
    }

    if( h->mb.i_cbp_luma == 0 && h->mb.i_type != I_8x8 )
        h->mb.b_transform_8x8 = 0;
    h->mb.mb_transform_size[i_mb_xy] = h->mb.b_transform_8x8;

    if( h->sh.i_type != SLICE_TYPE_I )
    {
        int16_t (*mv0)[2] = &h->mb.mv[0][i_mb_4x4];
        int16_t (*mv1)[2] = &h->mb.mv[1][i_mb_4x4];
        int8_t *ref0 = &h->mb.ref[0][i_mb_8x8];
        int8_t *ref1 = &h->mb.ref[1][i_mb_8x8];
        if( !IS_INTRA( i_mb_type ) )
        {
            ref0[0+0*s8x8] = h->mb.cache.ref[0][x264_scan8[0]];
            ref0[1+0*s8x8] = h->mb.cache.ref[0][x264_scan8[4]];
            ref0[0+1*s8x8] = h->mb.cache.ref[0][x264_scan8[8]];
            ref0[1+1*s8x8] = h->mb.cache.ref[0][x264_scan8[12]];
            CP128( &mv0[0*s4x4], h->mb.cache.mv[0][x264_scan8[0]+8*0] );
            CP128( &mv0[1*s4x4], h->mb.cache.mv[0][x264_scan8[0]+8*1] );
            CP128( &mv0[2*s4x4], h->mb.cache.mv[0][x264_scan8[0]+8*2] );
            CP128( &mv0[3*s4x4], h->mb.cache.mv[0][x264_scan8[0]+8*3] );
            if( h->sh.i_type == SLICE_TYPE_B )
            {
                ref1[0+0*s8x8] = h->mb.cache.ref[1][x264_scan8[0]];
                ref1[1+0*s8x8] = h->mb.cache.ref[1][x264_scan8[4]];
                ref1[0+1*s8x8] = h->mb.cache.ref[1][x264_scan8[8]];
                ref1[1+1*s8x8] = h->mb.cache.ref[1][x264_scan8[12]];
                CP128( &mv1[0*s4x4], h->mb.cache.mv[1][x264_scan8[0]+8*0] );
                CP128( &mv1[1*s4x4], h->mb.cache.mv[1][x264_scan8[0]+8*1] );
                CP128( &mv1[2*s4x4], h->mb.cache.mv[1][x264_scan8[0]+8*2] );
                CP128( &mv1[3*s4x4], h->mb.cache.mv[1][x264_scan8[0]+8*3] );
            }
        }
        else
        {
            M16( &ref0[0*s8x8] ) = (uint8_t)(-1) * 0x0101;
            M16( &ref0[1*s8x8] ) = (uint8_t)(-1) * 0x0101;
            M128( &mv0[0*s4x4] ) = M128_ZERO;
            M128( &mv0[1*s4x4] ) = M128_ZERO;
            M128( &mv0[2*s4x4] ) = M128_ZERO;
            M128( &mv0[3*s4x4] ) = M128_ZERO;
            if( h->sh.i_type == SLICE_TYPE_B )
            {
                M16( &ref1[0*s8x8] ) = (uint8_t)(-1) * 0x0101;
                M16( &ref1[1*s8x8] ) = (uint8_t)(-1) * 0x0101;
                M128( &mv1[0*s4x4] ) = M128_ZERO;
                M128( &mv1[1*s4x4] ) = M128_ZERO;
                M128( &mv1[2*s4x4] ) = M128_ZERO;
                M128( &mv1[3*s4x4] ) = M128_ZERO;
            }
        }
    }

    if( h->param.b_cabac )
    {
        uint8_t (*mvd0)[2] = h->mb.mvd[0][i_mb_xy];
        uint8_t (*mvd1)[2] = h->mb.mvd[1][i_mb_xy];
        if( IS_INTRA(i_mb_type) && i_mb_type != I_PCM )
            h->mb.chroma_pred_mode[i_mb_xy] = x264_mb_chroma_pred_mode_fix[h->mb.i_chroma_pred_mode];
        else
            h->mb.chroma_pred_mode[i_mb_xy] = I_PRED_CHROMA_DC;

        if( (0x3FF30 >> i_mb_type) & 1 ) /* !INTRA && !SKIP && !DIRECT */
        {
            CP64( mvd0[0], h->mb.cache.mvd[0][x264_scan8[10]] );
            CP16( mvd0[4], h->mb.cache.mvd[0][x264_scan8[5 ]] );
            CP16( mvd0[5], h->mb.cache.mvd[0][x264_scan8[7 ]] );
            CP16( mvd0[6], h->mb.cache.mvd[0][x264_scan8[13]] );
            if( h->sh.i_type == SLICE_TYPE_B )
            {
                CP64( mvd1[0], h->mb.cache.mvd[1][x264_scan8[10]] );
                CP16( mvd1[4], h->mb.cache.mvd[1][x264_scan8[5 ]] );
                CP16( mvd1[5], h->mb.cache.mvd[1][x264_scan8[7 ]] );
                CP16( mvd1[6], h->mb.cache.mvd[1][x264_scan8[13]] );
            }
        }
        else
        {
            M128( mvd0[0] ) = M128_ZERO;
            if( h->sh.i_type == SLICE_TYPE_B )
                M128( mvd1[0] ) = M128_ZERO;
        }

        if( h->sh.i_type == SLICE_TYPE_B )
        {
            if( i_mb_type == B_SKIP || i_mb_type == B_DIRECT )
                h->mb.skipbp[i_mb_xy] = 0xf;
            else if( i_mb_type == B_8x8 )
            {
                int skipbp = ( h->mb.i_sub_partition[0] == D_DIRECT_8x8 ) << 0;
                skipbp    |= ( h->mb.i_sub_partition[1] == D_DIRECT_8x8 ) << 1;
                skipbp    |= ( h->mb.i_sub_partition[2] == D_DIRECT_8x8 ) << 2;
                skipbp    |= ( h->mb.i_sub_partition[3] == D_DIRECT_8x8 ) << 3;
                h->mb.skipbp[i_mb_xy] = skipbp;
            }
            else
                h->mb.skipbp[i_mb_xy] = 0;
        }
    }
}


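/* Precomputes the H.264 temporal scaling tables (implicit weighted bipred
 * and temporal direct).  Worked example with equal POC distances: poc0 =
 * cur-2, poc1 = cur+2 gives td = 4, tb = 2, tx = (16384+2)/4 = 4096, so
 * dist_scale_factor = (2*4096+32)>>6 = 128; after >>2 the bipred weight is
 * 64-32 = 32, i.e. both references are averaged equally. */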
void x264_macroblock_bipred_init( x264_t *h )
{
    for( int mbfield = 0; mbfield <= SLICE_MBAFF; mbfield++ )
        for( int field = 0; field <= SLICE_MBAFF; field++ )
            for( int i_ref0 = 0; i_ref0 < (h->i_ref[0]<<mbfield); i_ref0++ )
            {
                x264_frame_t *l0 = h->fref[0][i_ref0>>mbfield];
                int poc0 = l0->i_poc + mbfield*l0->i_delta_poc[field^(i_ref0&1)];
                for( int i_ref1 = 0; i_ref1 < (h->i_ref[1]<<mbfield); i_ref1++ )
                {
                    int dist_scale_factor;
                    x264_frame_t *l1 = h->fref[1][i_ref1>>mbfield];
                    int cur_poc = h->fdec->i_poc + mbfield*h->fdec->i_delta_poc[field];
                    int poc1 = l1->i_poc + mbfield*l1->i_delta_poc[field^(i_ref1&1)];
                    int td = x264_clip3( poc1 - poc0, -128, 127 );
                    if( td == 0 /* || pic0 is a long-term ref */ )
                        dist_scale_factor = 256;
                    else
                    {
                        int tb = x264_clip3( cur_poc - poc0, -128, 127 );
                        int tx = (16384 + (abs(td) >> 1)) / td;
                        dist_scale_factor = x264_clip3( (tb * tx + 32) >> 6, -1024, 1023 );
                    }

                    h->mb.dist_scale_factor_buf[mbfield][field][i_ref0][i_ref1] = dist_scale_factor;

                    dist_scale_factor >>= 2;
                    if( h->param.analyse.b_weighted_bipred
                          && dist_scale_factor >= -64
                          && dist_scale_factor <= 128 )
                    {
                        h->mb.bipred_weight_buf[mbfield][field][i_ref0][i_ref1] = 64 - dist_scale_factor;
                        // ssse3 implementation of biweight doesn't support the extrema.
                        // if we ever generate them, we'll have to drop that optimization.
                        assert( dist_scale_factor >= -63 && dist_scale_factor <= 127 );
                    }
                    else
                        h->mb.bipred_weight_buf[mbfield][field][i_ref0][i_ref1] = 32;
                }
            }
}

x264-snapshot-20120103-2245-stable/common/display.h0000644000175000001440000000347611700673342021011 0ustar  videolanusers/*****************************************************************************
 * display.h: x11 visualization interface
 *****************************************************************************
 * Copyright (C) 2005-2011 x264 project
 *
 * Authors: Tuukka Toivonen <tuukkat@ee.oulu.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#ifndef X264_DISPLAY_H
#define X264_DISPLAY_H

void disp_sync(void);
void disp_setcolor(unsigned char *name);
/* Display a region of byte wide memory as a grayscale image.
 * num is the window to use for displaying. */
void disp_gray(int num, char *data, int width, int height,
               int stride, const unsigned char *title);
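/* Typical debugging use (a sketch; assumes an 8-bit build where pixel data
 * is byte-wide): show the current reconstructed luma block in window 0:
 *     disp_gray( 0, (char*)h->mb.pic.p_fdec[0], 16, 16, FDEC_STRIDE,
 *                (const unsigned char*)"fdec luma" );
 */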
void disp_gray_zoom(int num, char *data, int width, int height,
               int stride, const unsigned char *title, int zoom);
void disp_point(int num, int x1, int y1);
void disp_line(int num, int x1, int y1, int x2, int y2);
void disp_rect(int num, int x1, int y1, int x2, int y2);

#endif
x264-snapshot-20120103-2245-stable/common/display-x11.c0000644000175000001440000001754211700673342021412 0ustar  videolanusers/*****************************************************************************
 * display-x11.c: x11 interface
 *****************************************************************************
 * Copyright (C) 2005-2011 x264 project
 *
 * Authors: Tuukka Toivonen <tuukkat@ee.oulu.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include <X11/Xlib.h>
#include <X11/Xutil.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "common.h"
#include "display.h"

static long event_mask = ExposureMask|KeyPressMask|ButtonPressMask|StructureNotifyMask|ResizeRedirectMask;

static Display *disp_display = NULL;
static struct disp_window
{
    int init;
    Window window;
} disp_window[10];

static inline void disp_chkerror( int cond, char *e )
{
    if( !cond )
        return;
    fprintf( stderr, "error: %s\n", e ? e : "?" );
    abort();
}

static void disp_init_display()
{
    Visual *visual;
    int dpy_class;
    int screen;
    int dpy_depth;

    if( disp_display )
        return;
    memset( &disp_window, 0, sizeof(disp_window) );
    disp_display = XOpenDisplay( "" );
    disp_chkerror( !disp_display, "no display" );
    screen = DefaultScreen( disp_display );
    visual = DefaultVisual( disp_display, screen );
    dpy_class = visual->class;
    dpy_depth = DefaultDepth( disp_display, screen );
    disp_chkerror( !((dpy_class == TrueColor && dpy_depth == 32)
        || (dpy_class == TrueColor && dpy_depth == 24)
        || (dpy_class == TrueColor && dpy_depth == 16)
        || (dpy_class == PseudoColor && dpy_depth == 8)),
        "requires 8 bit PseudoColor or 16/24/32 bit TrueColor display" );
}

static void disp_init_window( int num, int width, int height, const unsigned char *title )
{
    XSetWindowAttributes xswa;
    XEvent xev;
    int screen = DefaultScreen(disp_display);
    Visual *visual = DefaultVisual (disp_display, screen);
    char buf[200];
    Window window;

    if( title )
        snprintf( buf, 200, "%s: %i/disp", title, num );
    else
        snprintf( buf, 200, "%i/disp", num );

    XSizeHints *shint = XAllocSizeHints();
    disp_chkerror( !shint, "memerror" );
    shint->min_width = shint->max_width = shint->width = width;
    shint->min_height = shint->max_height = shint->height = height;
    shint->flags = PSize | PMinSize | PMaxSize;
    disp_chkerror( num < 0 || num >= 10, "bad win num" );
    if( !disp_window[num].init )
    {
        unsigned int mask = 0;
        disp_window[num].init = 1;
        unsigned int bg = WhitePixel( disp_display, screen );
        unsigned int fg = BlackPixel( disp_display, screen );
        int dpy_depth = DefaultDepth( disp_display, screen );
        if( dpy_depth==32 || dpy_depth==24 || dpy_depth==16 )
        {
            mask |= CWColormap;
            xswa.colormap = XCreateColormap( disp_display, DefaultRootWindow( disp_display ), visual, AllocNone );
        }
        xswa.background_pixel = bg;
        xswa.border_pixel = fg;
        xswa.backing_store = Always;
        xswa.backing_planes = -1;
        xswa.bit_gravity = NorthWestGravity;
        mask |= CWBackPixel | CWBorderPixel | CWBackingStore | CWBackingPlanes | CWBitGravity;
        window = XCreateWindow( disp_display, DefaultRootWindow( disp_display ),
                                shint->x, shint->y, shint->width, shint->height,
                                1, dpy_depth, InputOutput, visual, mask, &xswa );
        disp_window[num].window = window;

        XSelectInput( disp_display, window, event_mask );
        XSetStandardProperties( disp_display, window, buf, buf, None, NULL, 0, shint );
        XMapWindow( disp_display, window );

        do {
            XNextEvent( disp_display, &xev );
        } while( xev.type != MapNotify || xev.xmap.event != window );
    }
    window = disp_window[num].window;
    XSetStandardProperties( disp_display, window, buf, buf, None, NULL, 0, shint );
    XResizeWindow( disp_display, window, width, height );
    XSync( disp_display, 1 );
    XFree( shint );
}

void disp_sync()
{
    XSync( disp_display, 1 );
}

void disp_setcolor( unsigned char *name )
{
    XColor c_exact, c_nearest;

    int screen = DefaultScreen( disp_display );
    GC gc = DefaultGC( disp_display, screen );
    Colormap cm = DefaultColormap( disp_display, screen );
    Status st = XAllocNamedColor( disp_display, cm, name, &c_nearest, &c_exact );
    disp_chkerror( st != 1, "XAllocNamedColor error" );
    XSetForeground( disp_display, gc, c_nearest.pixel );
}

void disp_gray( int num, char *data, int width, int height, int stride, const unsigned char *title )
{
    char dummy;

    disp_init_display();
    disp_init_window( num, width, height, title );
    int screen = DefaultScreen( disp_display );
    Visual *visual = DefaultVisual( disp_display, screen );
    int dpy_depth = DefaultDepth( disp_display, screen );
    XImage *ximage = XCreateImage( disp_display, visual, dpy_depth, ZPixmap, 0, &dummy, width, height, 8, 0 );
    disp_chkerror( !ximage, "no ximage" );
#if WORDS_BIGENDIAN
    ximage->byte_order = MSBFirst;
    ximage->bitmap_bit_order = MSBFirst;
#else
    ximage->byte_order = LSBFirst;
    ximage->bitmap_bit_order = LSBFirst;
#endif

    int pixelsize = dpy_depth>8 ? sizeof(int) : sizeof(unsigned char);
    uint8_t *image = malloc( width * height * pixelsize );
    disp_chkerror( !image, "malloc failed" );
    for( int y = 0; y < height; y++ )
        for( int x = 0; x < width; x++ )
            memset( &image[(width*y + x)*pixelsize], data[y*stride+x], pixelsize );
    ximage->data = (char*)image;
    GC gc = DefaultGC( disp_display, screen );

    XPutImage( disp_display, disp_window[num].window, gc, ximage, 0, 0, 0, 0, width, height );

    XDestroyImage( ximage );
    XSync( disp_display, 1 );

}

void disp_gray_zoom(int num, char *data, int width, int height, int stride, const unsigned char *title, int zoom)
{
    unsigned char *dataz = malloc( width*zoom * height*zoom );
    disp_chkerror( !dataz, "malloc" );
    for( int y = 0; y < height; y++ )
        for( int x = 0; x < width; x++ )
            for( int y0 = 0; y0 < zoom; y0++ )
                for( int x0 = 0; x0 < zoom; x0++ )
                    dataz[(y*zoom + y0)*width*zoom + x*zoom + x0] = data[y*stride+x];
    disp_gray( num, (char*)dataz, width*zoom, height*zoom, width*zoom, title );
    free( dataz );
}

void disp_point( int num, int x1, int y1 )
{
    int screen = DefaultScreen( disp_display );
    GC gc = DefaultGC( disp_display, screen );
    XDrawPoint( disp_display, disp_window[num].window, gc, x1, y1 );
}

void disp_line( int num, int x1, int y1, int x2, int y2 )
{
    int screen = DefaultScreen( disp_display );
    GC gc = DefaultGC( disp_display, screen );
    XDrawLine( disp_display, disp_window[num].window, gc, x1, y1, x2, y2 );
}

void disp_rect( int num, int x1, int y1, int x2, int y2 )
{
    int screen = DefaultScreen( disp_display );
    GC gc = DefaultGC( disp_display, screen );
    XDrawRectangle( disp_display, disp_window[num].window, gc, x1, y1, x2-x1, y2-y1 );
}
x264-snapshot-20120103-2245-stable/common/dct.h0000644000175000001440000001245011700673342020106 0ustar  videolanusers/*****************************************************************************
 * dct.h: transform and zigzag
 *****************************************************************************
 * Copyright (C) 2004-2011 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#ifndef X264_DCT_H
#define X264_DCT_H

/* the inverse of the scaling factors introduced by 8x8 fdct */
#define W(i) (i==0 ? FIX8(1.0000) :\
              i==1 ? FIX8(0.8859) :\
              i==2 ? FIX8(1.6000) :\
              i==3 ? FIX8(0.9415) :\
              i==4 ? FIX8(1.2651) :\
              i==5 ? FIX8(1.1910) :0)
static const uint16_t x264_dct8_weight_tab[64] = {
    W(0), W(3), W(4), W(3),  W(0), W(3), W(4), W(3),
    W(3), W(1), W(5), W(1),  W(3), W(1), W(5), W(1),
    W(4), W(5), W(2), W(5),  W(4), W(5), W(2), W(5),
    W(3), W(1), W(5), W(1),  W(3), W(1), W(5), W(1),

    W(0), W(3), W(4), W(3),  W(0), W(3), W(4), W(3),
    W(3), W(1), W(5), W(1),  W(3), W(1), W(5), W(1),
    W(4), W(5), W(2), W(5),  W(4), W(5), W(2), W(5),
    W(3), W(1), W(5), W(1),  W(3), W(1), W(5), W(1)
};
#undef W

#define W(i) (i==0 ? FIX8(1.76777) :\
              i==1 ? FIX8(1.11803) :\
              i==2 ? FIX8(0.70711) :0)
static const uint16_t x264_dct4_weight_tab[16] = {
    W(0), W(1), W(0), W(1),
    W(1), W(2), W(1), W(2),
    W(0), W(1), W(0), W(1),
    W(1), W(2), W(1), W(2)
};
#undef W

/* inverse squared */
#define W(i) (i==0 ? FIX8(3.125) :\
              i==1 ? FIX8(1.25) :\
              i==2 ? FIX8(0.5) :0)
static const uint16_t x264_dct4_weight2_tab[16] = {
    W(0), W(1), W(0), W(1),
    W(1), W(2), W(1), W(2),
    W(0), W(1), W(0), W(1),
    W(1), W(2), W(1), W(2)
};
#undef W

#define W(i) (i==0 ? FIX8(1.00000) :\
              i==1 ? FIX8(0.78487) :\
              i==2 ? FIX8(2.56132) :\
              i==3 ? FIX8(0.88637) :\
              i==4 ? FIX8(1.60040) :\
              i==5 ? FIX8(1.41850) :0)
static const uint16_t x264_dct8_weight2_tab[64] = {
    W(0), W(3), W(4), W(3),  W(0), W(3), W(4), W(3),
    W(3), W(1), W(5), W(1),  W(3), W(1), W(5), W(1),
    W(4), W(5), W(2), W(5),  W(4), W(5), W(2), W(5),
    W(3), W(1), W(5), W(1),  W(3), W(1), W(5), W(1),

    W(0), W(3), W(4), W(3),  W(0), W(3), W(4), W(3),
    W(3), W(1), W(5), W(1),  W(3), W(1), W(5), W(1),
    W(4), W(5), W(2), W(5),  W(4), W(5), W(2), W(5),
    W(3), W(1), W(5), W(1),  W(3), W(1), W(5), W(1)
};
#undef W
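
/* Note (added for clarity): FIX8 is assumed to be x264's usual Q8 fixed-point
 * helper from common.h, roughly
 *
 *     #define FIX8(f) ((int)((f)*(1<<8)+.5))
 *
 * so W(0) = FIX8(1.0000) = 256 and, in the first 8x8 table, W(2) =
 * FIX8(1.6000) = 410 (1.6*256 = 409.6, rounded). The tables thus store the
 * per-position gains of the integer transforms in Q8. */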

extern uint16_t x264_dct4_weight2_zigzag[2][16]; // [2] = {frame, field}
extern uint16_t x264_dct8_weight2_zigzag[2][64];

typedef struct
{
    // pix1  stride = FENC_STRIDE
    // pix2  stride = FDEC_STRIDE
    // p_dst stride = FDEC_STRIDE
    void (*sub4x4_dct)   ( dctcoef dct[16], pixel *pix1, pixel *pix2 );
    void (*add4x4_idct)  ( pixel *p_dst, dctcoef dct[16] );

    void (*sub8x8_dct)   ( dctcoef dct[4][16], pixel *pix1, pixel *pix2 );
    void (*sub8x8_dct_dc)( dctcoef dct[4], pixel *pix1, pixel *pix2 );
    void (*add8x8_idct)  ( pixel *p_dst, dctcoef dct[4][16] );
    void (*add8x8_idct_dc) ( pixel *p_dst, dctcoef dct[4] );

    void (*sub8x16_dct_dc)( dctcoef dct[8], pixel *pix1, pixel *pix2 );

    void (*sub16x16_dct) ( dctcoef dct[16][16], pixel *pix1, pixel *pix2 );
    void (*add16x16_idct)( pixel *p_dst, dctcoef dct[16][16] );
    void (*add16x16_idct_dc) ( pixel *p_dst, dctcoef dct[16] );

    void (*sub8x8_dct8)  ( dctcoef dct[64], pixel *pix1, pixel *pix2 );
    void (*add8x8_idct8) ( pixel *p_dst, dctcoef dct[64] );

    void (*sub16x16_dct8) ( dctcoef dct[4][64], pixel *pix1, pixel *pix2 );
    void (*add16x16_idct8)( pixel *p_dst, dctcoef dct[4][64] );

    void (*dct4x4dc) ( dctcoef d[16] );
    void (*idct4x4dc)( dctcoef d[16] );

    void (*dct2x4dc)( dctcoef dct[8], dctcoef dct4x4[8][16] );

} x264_dct_function_t;

typedef struct
{
    void (*scan_8x8)( dctcoef level[64], dctcoef dct[64] );
    void (*scan_4x4)( dctcoef level[16], dctcoef dct[16] );
    int  (*sub_8x8)  ( dctcoef level[64], const pixel *p_src, pixel *p_dst );
    int  (*sub_4x4)  ( dctcoef level[16], const pixel *p_src, pixel *p_dst );
    int  (*sub_4x4ac)( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc );
    void (*interleave_8x8_cavlc)( dctcoef *dst, dctcoef *src, uint8_t *nnz );

} x264_zigzag_function_t;

void x264_dct_init( int cpu, x264_dct_function_t *dctf );
void x264_dct_init_weights( void );
void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zigzag_function_t *pf_interlaced );

#endif
File: x264-snapshot-20120103-2245-stable/common/dct.c
/*****************************************************************************
 * dct.c: transform and zigzag
 *****************************************************************************
 * Copyright (C) 2003-2011 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *          Laurent Aimar <fenrir@via.ecp.fr>
 *          Henrik Gramner <hengar-6@student.ltu.se>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "common.h"
#if HAVE_MMX
#   include "x86/dct.h"
#endif
#if ARCH_PPC
#   include "ppc/dct.h"
#endif
#if ARCH_ARM
#   include "arm/dct.h"
#endif

uint16_t x264_dct4_weight2_zigzag[2][16];
uint16_t x264_dct8_weight2_zigzag[2][64];

static void dct4x4dc( dctcoef d[16] )
{
    dctcoef tmp[16];

    for( int i = 0; i < 4; i++ )
    {
        int s01 = d[i*4+0] + d[i*4+1];
        int d01 = d[i*4+0] - d[i*4+1];
        int s23 = d[i*4+2] + d[i*4+3];
        int d23 = d[i*4+2] - d[i*4+3];

        tmp[0*4+i] = s01 + s23;
        tmp[1*4+i] = s01 - s23;
        tmp[2*4+i] = d01 - d23;
        tmp[3*4+i] = d01 + d23;
    }

    for( int i = 0; i < 4; i++ )
    {
        int s01 = tmp[i*4+0] + tmp[i*4+1];
        int d01 = tmp[i*4+0] - tmp[i*4+1];
        int s23 = tmp[i*4+2] + tmp[i*4+3];
        int d23 = tmp[i*4+2] - tmp[i*4+3];

        d[i*4+0] = ( s01 + s23 + 1 ) >> 1;
        d[i*4+1] = ( s01 - s23 + 1 ) >> 1;
        d[i*4+2] = ( d01 - d23 + 1 ) >> 1;
        d[i*4+3] = ( d01 + d23 + 1 ) >> 1;
    }
}

static void idct4x4dc( dctcoef d[16] )
{
    dctcoef tmp[16];

    for( int i = 0; i < 4; i++ )
    {
        int s01 = d[i*4+0] + d[i*4+1];
        int d01 = d[i*4+0] - d[i*4+1];
        int s23 = d[i*4+2] + d[i*4+3];
        int d23 = d[i*4+2] - d[i*4+3];

        tmp[0*4+i] = s01 + s23;
        tmp[1*4+i] = s01 - s23;
        tmp[2*4+i] = d01 - d23;
        tmp[3*4+i] = d01 + d23;
    }

    for( int i = 0; i < 4; i++ )
    {
        int s01 = tmp[i*4+0] + tmp[i*4+1];
        int d01 = tmp[i*4+0] - tmp[i*4+1];
        int s23 = tmp[i*4+2] + tmp[i*4+3];
        int d23 = tmp[i*4+2] - tmp[i*4+3];

        d[i*4+0] = s01 + s23;
        d[i*4+1] = s01 - s23;
        d[i*4+2] = d01 - d23;
        d[i*4+3] = d01 + d23;
    }
}
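
/* Added note: dct4x4dc/idct4x4dc are the 4x4 Hadamard pair applied to the
 * sixteen luma DC coefficients of an Intra-16x16 macroblock. The forward
 * version halves the result with rounding ((x+1)>>1), as the H.264 scaling
 * expects; the remaining scale factor is absorbed by quantization. */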

static void dct2x4dc( dctcoef dct[8], dctcoef dct4x4[8][16] )
{
    int a0 = dct4x4[0][0] + dct4x4[1][0];
    int a1 = dct4x4[2][0] + dct4x4[3][0];
    int a2 = dct4x4[4][0] + dct4x4[5][0];
    int a3 = dct4x4[6][0] + dct4x4[7][0];
    int a4 = dct4x4[0][0] - dct4x4[1][0];
    int a5 = dct4x4[2][0] - dct4x4[3][0];
    int a6 = dct4x4[4][0] - dct4x4[5][0];
    int a7 = dct4x4[6][0] - dct4x4[7][0];
    int b0 = a0 + a1;
    int b1 = a2 + a3;
    int b2 = a4 + a5;
    int b3 = a6 + a7;
    int b4 = a0 - a1;
    int b5 = a2 - a3;
    int b6 = a4 - a5;
    int b7 = a6 - a7;
    dct[0] = b0 + b1;
    dct[1] = b2 + b3;
    dct[2] = b0 - b1;
    dct[3] = b2 - b3;
    dct[4] = b4 - b5;
    dct[5] = b6 - b7;
    dct[6] = b4 + b5;
    dct[7] = b6 + b7;
    dct4x4[0][0] = 0;
    dct4x4[1][0] = 0;
    dct4x4[2][0] = 0;
    dct4x4[3][0] = 0;
    dct4x4[4][0] = 0;
    dct4x4[5][0] = 0;
    dct4x4[6][0] = 0;
    dct4x4[7][0] = 0;
}

static inline void pixel_sub_wxh( dctcoef *diff, int i_size,
                                  pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 )
{
    for( int y = 0; y < i_size; y++ )
    {
        for( int x = 0; x < i_size; x++ )
            diff[x + y*i_size] = pix1[x] - pix2[x];
        pix1 += i_pix1;
        pix2 += i_pix2;
    }
}

static void sub4x4_dct( dctcoef dct[16], pixel *pix1, pixel *pix2 )
{
    dctcoef d[16];
    dctcoef tmp[16];

    pixel_sub_wxh( d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );

    for( int i = 0; i < 4; i++ )
    {
        int s03 = d[i*4+0] + d[i*4+3];
        int s12 = d[i*4+1] + d[i*4+2];
        int d03 = d[i*4+0] - d[i*4+3];
        int d12 = d[i*4+1] - d[i*4+2];

        tmp[0*4+i] =   s03 +   s12;
        tmp[1*4+i] = 2*d03 +   d12;
        tmp[2*4+i] =   s03 -   s12;
        tmp[3*4+i] =   d03 - 2*d12;
    }

    for( int i = 0; i < 4; i++ )
    {
        int s03 = tmp[i*4+0] + tmp[i*4+3];
        int s12 = tmp[i*4+1] + tmp[i*4+2];
        int d03 = tmp[i*4+0] - tmp[i*4+3];
        int d12 = tmp[i*4+1] - tmp[i*4+2];

        dct[i*4+0] =   s03 +   s12;
        dct[i*4+1] = 2*d03 +   d12;
        dct[i*4+2] =   s03 -   s12;
        dct[i*4+3] =   d03 - 2*d12;
    }
}

static void sub8x8_dct( dctcoef dct[4][16], pixel *pix1, pixel *pix2 )
{
    sub4x4_dct( dct[0], &pix1[0], &pix2[0] );
    sub4x4_dct( dct[1], &pix1[4], &pix2[4] );
    sub4x4_dct( dct[2], &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
    sub4x4_dct( dct[3], &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
}

static void sub16x16_dct( dctcoef dct[16][16], pixel *pix1, pixel *pix2 )
{
    sub8x8_dct( &dct[ 0], &pix1[0], &pix2[0] );
    sub8x8_dct( &dct[ 4], &pix1[8], &pix2[8] );
    sub8x8_dct( &dct[ 8], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
    sub8x8_dct( &dct[12], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
}

static int sub4x4_dct_dc( pixel *pix1, pixel *pix2 )
{
    int sum = 0;
    for( int i=0; i<4; i++, pix1 += FENC_STRIDE, pix2 += FDEC_STRIDE )
        sum += pix1[0] + pix1[1] + pix1[2] + pix1[3]
             - pix2[0] - pix2[1] - pix2[2] - pix2[3];
    return sum;
}

static void sub8x8_dct_dc( dctcoef dct[4], pixel *pix1, pixel *pix2 )
{
    dct[0] = sub4x4_dct_dc( &pix1[0], &pix2[0] );
    dct[1] = sub4x4_dct_dc( &pix1[4], &pix2[4] );
    dct[2] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
    dct[3] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );

    /* 2x2 DC transform */
    int d0 = dct[0] + dct[1];
    int d1 = dct[2] + dct[3];
    int d2 = dct[0] - dct[1];
    int d3 = dct[2] - dct[3];
    dct[0] = d0 + d1;
    dct[1] = d0 - d1;
    dct[2] = d2 + d3;
    dct[3] = d2 - d3;
}
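
/* Added worked example: the "2x2 DC transform" above is a 2x2 Hadamard on
 * the four per-block DC sums. For dct[] = {1, 2, 3, 4}:
 *     d0 = 1+2 = 3,  d1 = 3+4 = 7,  d2 = 1-2 = -1,  d3 = 3-4 = -1
 * so dct[] becomes {d0+d1, d0-d1, d2+d3, d2-d3} = {10, -4, -2, 0}.
 * No normalization happens here; the scale is handled by DC quantization. */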

static void sub8x16_dct_dc( dctcoef dct[8], pixel *pix1, pixel *pix2 )
{
    int a0 = sub4x4_dct_dc( &pix1[ 0*FENC_STRIDE+0], &pix2[ 0*FDEC_STRIDE+0] );
    int a1 = sub4x4_dct_dc( &pix1[ 0*FENC_STRIDE+4], &pix2[ 0*FDEC_STRIDE+4] );
    int a2 = sub4x4_dct_dc( &pix1[ 4*FENC_STRIDE+0], &pix2[ 4*FDEC_STRIDE+0] );
    int a3 = sub4x4_dct_dc( &pix1[ 4*FENC_STRIDE+4], &pix2[ 4*FDEC_STRIDE+4] );
    int a4 = sub4x4_dct_dc( &pix1[ 8*FENC_STRIDE+0], &pix2[ 8*FDEC_STRIDE+0] );
    int a5 = sub4x4_dct_dc( &pix1[ 8*FENC_STRIDE+4], &pix2[ 8*FDEC_STRIDE+4] );
    int a6 = sub4x4_dct_dc( &pix1[12*FENC_STRIDE+0], &pix2[12*FDEC_STRIDE+0] );
    int a7 = sub4x4_dct_dc( &pix1[12*FENC_STRIDE+4], &pix2[12*FDEC_STRIDE+4] );

    /* 2x4 DC transform */
    int b0 = a0 + a1;
    int b1 = a2 + a3;
    int b2 = a4 + a5;
    int b3 = a6 + a7;
    int b4 = a0 - a1;
    int b5 = a2 - a3;
    int b6 = a4 - a5;
    int b7 = a6 - a7;
    a0 = b0 + b1;
    a1 = b2 + b3;
    a2 = b4 + b5;
    a3 = b6 + b7;
    a4 = b0 - b1;
    a5 = b2 - b3;
    a6 = b4 - b5;
    a7 = b6 - b7;
    dct[0] = a0 + a1;
    dct[1] = a2 + a3;
    dct[2] = a0 - a1;
    dct[3] = a2 - a3;
    dct[4] = a4 - a5;
    dct[5] = a6 - a7;
    dct[6] = a4 + a5;
    dct[7] = a6 + a7;
}

static void add4x4_idct( pixel *p_dst, dctcoef dct[16] )
{
    dctcoef d[16];
    dctcoef tmp[16];

    for( int i = 0; i < 4; i++ )
    {
        int s02 =  dct[0*4+i]     +  dct[2*4+i];
        int d02 =  dct[0*4+i]     -  dct[2*4+i];
        int s13 =  dct[1*4+i]     + (dct[3*4+i]>>1);
        int d13 = (dct[1*4+i]>>1) -  dct[3*4+i];

        tmp[i*4+0] = s02 + s13;
        tmp[i*4+1] = d02 + d13;
        tmp[i*4+2] = d02 - d13;
        tmp[i*4+3] = s02 - s13;
    }

    for( int i = 0; i < 4; i++ )
    {
        int s02 =  tmp[0*4+i]     +  tmp[2*4+i];
        int d02 =  tmp[0*4+i]     -  tmp[2*4+i];
        int s13 =  tmp[1*4+i]     + (tmp[3*4+i]>>1);
        int d13 = (tmp[1*4+i]>>1) -  tmp[3*4+i];

        d[0*4+i] = ( s02 + s13 + 32 ) >> 6;
        d[1*4+i] = ( d02 + d13 + 32 ) >> 6;
        d[2*4+i] = ( d02 - d13 + 32 ) >> 6;
        d[3*4+i] = ( s02 - s13 + 32 ) >> 6;
    }

    for( int y = 0; y < 4; y++ )
    {
        for( int x = 0; x < 4; x++ )
            p_dst[x] = x264_clip_pixel( p_dst[x] + d[y*4+x] );
        p_dst += FDEC_STRIDE;
    }
}

static void add8x8_idct( pixel *p_dst, dctcoef dct[4][16] )
{
    add4x4_idct( &p_dst[0],               dct[0] );
    add4x4_idct( &p_dst[4],               dct[1] );
    add4x4_idct( &p_dst[4*FDEC_STRIDE+0], dct[2] );
    add4x4_idct( &p_dst[4*FDEC_STRIDE+4], dct[3] );
}

static void add16x16_idct( pixel *p_dst, dctcoef dct[16][16] )
{
    add8x8_idct( &p_dst[0],               &dct[0] );
    add8x8_idct( &p_dst[8],               &dct[4] );
    add8x8_idct( &p_dst[8*FDEC_STRIDE+0], &dct[8] );
    add8x8_idct( &p_dst[8*FDEC_STRIDE+8], &dct[12] );
}

/****************************************************************************
 * 8x8 transform:
 ****************************************************************************/

#define DCT8_1D {\
    int s07 = SRC(0) + SRC(7);\
    int s16 = SRC(1) + SRC(6);\
    int s25 = SRC(2) + SRC(5);\
    int s34 = SRC(3) + SRC(4);\
    int a0 = s07 + s34;\
    int a1 = s16 + s25;\
    int a2 = s07 - s34;\
    int a3 = s16 - s25;\
    int d07 = SRC(0) - SRC(7);\
    int d16 = SRC(1) - SRC(6);\
    int d25 = SRC(2) - SRC(5);\
    int d34 = SRC(3) - SRC(4);\
    int a4 = d16 + d25 + (d07 + (d07>>1));\
    int a5 = d07 - d34 - (d25 + (d25>>1));\
    int a6 = d07 + d34 - (d16 + (d16>>1));\
    int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0) =  a0 + a1     ;\
    DST(1) =  a4 + (a7>>2);\
    DST(2) =  a2 + (a3>>1);\
    DST(3) =  a5 + (a6>>2);\
    DST(4) =  a0 - a1     ;\
    DST(5) =  a6 - (a5>>2);\
    DST(6) = (a2>>1) - a3 ;\
    DST(7) = (a4>>2) - a7 ;\
}
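
/* Added note: DCT8_1D is one pass of the 8x8 integer transform written as
 * butterflies; the >>1 and >>2 terms are the integer approximation of the
 * odd-symmetric basis functions, keeping everything in integer arithmetic. */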

static void sub8x8_dct8( dctcoef dct[64], pixel *pix1, pixel *pix2 )
{
    dctcoef tmp[64];

    pixel_sub_wxh( tmp, 8, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );

#define SRC(x) tmp[x*8+i]
#define DST(x) tmp[x*8+i]
    for( int i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

#define SRC(x) tmp[i*8+x]
#define DST(x) dct[x*8+i]
    for( int i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
}

static void sub16x16_dct8( dctcoef dct[4][64], pixel *pix1, pixel *pix2 )
{
    sub8x8_dct8( dct[0], &pix1[0],               &pix2[0] );
    sub8x8_dct8( dct[1], &pix1[8],               &pix2[8] );
    sub8x8_dct8( dct[2], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
    sub8x8_dct8( dct[3], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
}

#define IDCT8_1D {\
    int a0 =  SRC(0) + SRC(4);\
    int a2 =  SRC(0) - SRC(4);\
    int a4 = (SRC(2)>>1) - SRC(6);\
    int a6 = (SRC(6)>>1) + SRC(2);\
    int b0 = a0 + a6;\
    int b2 = a2 + a4;\
    int b4 = a2 - a4;\
    int b6 = a0 - a6;\
    int a1 = -SRC(3) + SRC(5) - SRC(7) - (SRC(7)>>1);\
    int a3 =  SRC(1) + SRC(7) - SRC(3) - (SRC(3)>>1);\
    int a5 = -SRC(1) + SRC(7) + SRC(5) + (SRC(5)>>1);\
    int a7 =  SRC(3) + SRC(5) + SRC(1) + (SRC(1)>>1);\
    int b1 = (a7>>2) + a1;\
    int b3 =  a3 + (a5>>2);\
    int b5 = (a3>>2) - a5;\
    int b7 =  a7 - (a1>>2);\
    DST(0, b0 + b7);\
    DST(1, b2 + b5);\
    DST(2, b4 + b3);\
    DST(3, b6 + b1);\
    DST(4, b6 - b1);\
    DST(5, b4 - b3);\
    DST(6, b2 - b5);\
    DST(7, b0 - b7);\
}

static void add8x8_idct8( pixel *dst, dctcoef dct[64] )
{
    dct[0] += 32; // rounding for the >>6 at the end

#define SRC(x)     dct[x*8+i]
#define DST(x,rhs) dct[x*8+i] = (rhs)
    for( int i = 0; i < 8; i++ )
        IDCT8_1D
#undef SRC
#undef DST

#define SRC(x)     dct[i*8+x]
#define DST(x,rhs) dst[i + x*FDEC_STRIDE] = x264_clip_pixel( dst[i + x*FDEC_STRIDE] + ((rhs) >> 6) );
    for( int i = 0; i < 8; i++ )
        IDCT8_1D
#undef SRC
#undef DST
}

static void add16x16_idct8( pixel *dst, dctcoef dct[4][64] )
{
    add8x8_idct8( &dst[0],               dct[0] );
    add8x8_idct8( &dst[8],               dct[1] );
    add8x8_idct8( &dst[8*FDEC_STRIDE+0], dct[2] );
    add8x8_idct8( &dst[8*FDEC_STRIDE+8], dct[3] );
}

static inline void add4x4_idct_dc( pixel *p_dst, dctcoef dc )
{
    dc = (dc + 32) >> 6;
    for( int i = 0; i < 4; i++, p_dst += FDEC_STRIDE )
    {
        p_dst[0] = x264_clip_pixel( p_dst[0] + dc );
        p_dst[1] = x264_clip_pixel( p_dst[1] + dc );
        p_dst[2] = x264_clip_pixel( p_dst[2] + dc );
        p_dst[3] = x264_clip_pixel( p_dst[3] + dc );
    }
}

static void add8x8_idct_dc( pixel *p_dst, dctcoef dct[4] )
{
    add4x4_idct_dc( &p_dst[0],               dct[0] );
    add4x4_idct_dc( &p_dst[4],               dct[1] );
    add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+0], dct[2] );
    add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+4], dct[3] );
}

static void add16x16_idct_dc( pixel *p_dst, dctcoef dct[16] )
{
    for( int i = 0; i < 4; i++, dct += 4, p_dst += 4*FDEC_STRIDE )
    {
        add4x4_idct_dc( &p_dst[ 0], dct[0] );
        add4x4_idct_dc( &p_dst[ 4], dct[1] );
        add4x4_idct_dc( &p_dst[ 8], dct[2] );
        add4x4_idct_dc( &p_dst[12], dct[3] );
    }
}


/****************************************************************************
 * x264_dct_init:
 ****************************************************************************/
void x264_dct_init( int cpu, x264_dct_function_t *dctf )
{
    dctf->sub4x4_dct    = sub4x4_dct;
    dctf->add4x4_idct   = add4x4_idct;

    dctf->sub8x8_dct    = sub8x8_dct;
    dctf->sub8x8_dct_dc = sub8x8_dct_dc;
    dctf->add8x8_idct   = add8x8_idct;
    dctf->add8x8_idct_dc = add8x8_idct_dc;

    dctf->sub8x16_dct_dc = sub8x16_dct_dc;

    dctf->sub16x16_dct  = sub16x16_dct;
    dctf->add16x16_idct = add16x16_idct;
    dctf->add16x16_idct_dc = add16x16_idct_dc;

    dctf->sub8x8_dct8   = sub8x8_dct8;
    dctf->add8x8_idct8  = add8x8_idct8;

    dctf->sub16x16_dct8  = sub16x16_dct8;
    dctf->add16x16_idct8 = add16x16_idct8;

    dctf->dct4x4dc  = dct4x4dc;
    dctf->idct4x4dc = idct4x4dc;

    dctf->dct2x4dc = dct2x4dc;

#if HIGH_BIT_DEPTH
#if HAVE_MMX
    if( cpu&X264_CPU_MMX )
    {
        dctf->sub4x4_dct    = x264_sub4x4_dct_mmx;
        dctf->sub8x8_dct    = x264_sub8x8_dct_mmx;
        dctf->sub16x16_dct  = x264_sub16x16_dct_mmx;
    }
    if( cpu&X264_CPU_SSE2 )
    {
        dctf->add4x4_idct     = x264_add4x4_idct_sse2;
        dctf->dct4x4dc        = x264_dct4x4dc_sse2;
        dctf->idct4x4dc       = x264_idct4x4dc_sse2;
        dctf->add8x8_idct     = x264_add8x8_idct_sse2;
        dctf->add16x16_idct   = x264_add16x16_idct_sse2;
        dctf->add8x8_idct_dc  = x264_add8x8_idct_dc_sse2;
        dctf->add16x16_idct_dc= x264_add16x16_idct_dc_sse2;
    }
    if( cpu&X264_CPU_AVX )
    {
        dctf->add4x4_idct     = x264_add4x4_idct_avx;
        dctf->dct4x4dc        = x264_dct4x4dc_avx;
        dctf->idct4x4dc       = x264_idct4x4dc_avx;
        dctf->add8x8_idct     = x264_add8x8_idct_avx;
        dctf->add16x16_idct   = x264_add16x16_idct_avx;
        dctf->add8x8_idct_dc  = x264_add8x8_idct_dc_avx;
        dctf->add16x16_idct_dc= x264_add16x16_idct_dc_avx;
    }
#endif // HAVE_MMX
#else // !HIGH_BIT_DEPTH
#if HAVE_MMX
    if( cpu&X264_CPU_MMX )
    {
        dctf->sub4x4_dct    = x264_sub4x4_dct_mmx;
        dctf->add4x4_idct   = x264_add4x4_idct_mmx;
        dctf->add8x8_idct_dc = x264_add8x8_idct_dc_mmx;
        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_mmx;
        dctf->dct4x4dc      = x264_dct4x4dc_mmx;
        dctf->idct4x4dc     = x264_idct4x4dc_mmx;
        dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_mmx2;

#if !ARCH_X86_64
        dctf->sub8x8_dct    = x264_sub8x8_dct_mmx;
        dctf->sub16x16_dct  = x264_sub16x16_dct_mmx;
        dctf->add8x8_idct   = x264_add8x8_idct_mmx;
        dctf->add16x16_idct = x264_add16x16_idct_mmx;

        dctf->sub8x8_dct8   = x264_sub8x8_dct8_mmx;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_mmx;
        dctf->add8x8_idct8  = x264_add8x8_idct8_mmx;
        dctf->add16x16_idct8= x264_add16x16_idct8_mmx;
#endif
    }

    if( cpu&X264_CPU_SSE2 )
    {
        dctf->sub8x8_dct8   = x264_sub8x8_dct8_sse2;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2;
        dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_sse2;
        dctf->add8x8_idct8  = x264_add8x8_idct8_sse2;
        dctf->add16x16_idct8= x264_add16x16_idct8_sse2;

        dctf->sub8x8_dct    = x264_sub8x8_dct_sse2;
        dctf->sub16x16_dct  = x264_sub16x16_dct_sse2;
        dctf->add8x8_idct   = x264_add8x8_idct_sse2;
        dctf->add16x16_idct = x264_add16x16_idct_sse2;
        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2;
    }

    if( (cpu&X264_CPU_SSSE3) && !(cpu&X264_CPU_SLOW_ATOM) )
    {
        dctf->sub4x4_dct    = x264_sub4x4_dct_ssse3;
        dctf->sub8x8_dct    = x264_sub8x8_dct_ssse3;
        dctf->sub16x16_dct  = x264_sub16x16_dct_ssse3;
        dctf->sub8x8_dct8   = x264_sub8x8_dct8_ssse3;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_ssse3;
        dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3;
        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3;
    }

    if( cpu&X264_CPU_SSE4 )
        dctf->add4x4_idct   = x264_add4x4_idct_sse4;

    if( cpu&X264_CPU_AVX )
    {
        dctf->add4x4_idct      = x264_add4x4_idct_avx;
        dctf->add8x8_idct      = x264_add8x8_idct_avx;
        dctf->add16x16_idct    = x264_add16x16_idct_avx;
        dctf->add8x8_idct8     = x264_add8x8_idct8_avx;
        dctf->add16x16_idct8   = x264_add16x16_idct8_avx;
        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_avx;
        dctf->sub8x8_dct       = x264_sub8x8_dct_avx;
        dctf->sub16x16_dct     = x264_sub16x16_dct_avx;
        dctf->sub8x8_dct8      = x264_sub8x8_dct8_avx;
        dctf->sub16x16_dct8    = x264_sub16x16_dct8_avx;
    }
#endif //HAVE_MMX

#if HAVE_ALTIVEC
    if( cpu&X264_CPU_ALTIVEC )
    {
        dctf->sub4x4_dct    = x264_sub4x4_dct_altivec;
        dctf->sub8x8_dct    = x264_sub8x8_dct_altivec;
        dctf->sub16x16_dct  = x264_sub16x16_dct_altivec;

        dctf->add4x4_idct   = x264_add4x4_idct_altivec;
        dctf->add8x8_idct   = x264_add8x8_idct_altivec;
        dctf->add16x16_idct = x264_add16x16_idct_altivec;

        dctf->sub8x8_dct8   = x264_sub8x8_dct8_altivec;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_altivec;

        dctf->add8x8_idct8  = x264_add8x8_idct8_altivec;
        dctf->add16x16_idct8= x264_add16x16_idct8_altivec;
    }
#endif

#if HAVE_ARMV6
    if( cpu&X264_CPU_NEON )
    {
        dctf->sub4x4_dct    = x264_sub4x4_dct_neon;
        dctf->sub8x8_dct    = x264_sub8x8_dct_neon;
        dctf->sub16x16_dct  = x264_sub16x16_dct_neon;
        dctf->add8x8_idct_dc = x264_add8x8_idct_dc_neon;
        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_neon;
        dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_neon;
        dctf->dct4x4dc      = x264_dct4x4dc_neon;
        dctf->idct4x4dc     = x264_idct4x4dc_neon;

        dctf->add4x4_idct   = x264_add4x4_idct_neon;
        dctf->add8x8_idct   = x264_add8x8_idct_neon;
        dctf->add16x16_idct = x264_add16x16_idct_neon;

        dctf->sub8x8_dct8   = x264_sub8x8_dct8_neon;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_neon;

        dctf->add8x8_idct8  = x264_add8x8_idct8_neon;
        dctf->add16x16_idct8= x264_add16x16_idct8_neon;
    }
#endif
#endif // HIGH_BIT_DEPTH
}

void x264_dct_init_weights( void )
{
    for( int j = 0; j < 2; j++ )
    {
        for( int i = 0; i < 16; i++ )
            x264_dct4_weight2_zigzag[j][i] = x264_dct4_weight2_tab[ x264_zigzag_scan4[j][i] ];
        for( int i = 0; i < 64; i++ )
            x264_dct8_weight2_zigzag[j][i] = x264_dct8_weight2_tab[ x264_zigzag_scan8[j][i] ];
    }
}


#define ZIG(i,y,x) level[i] = dct[x*8+y];
#define ZIGZAG8_FRAME\
    ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
    ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
    ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,4,0) ZIG(11,3,1)\
    ZIG(12,2,2) ZIG(13,1,3) ZIG(14,0,4) ZIG(15,0,5)\
    ZIG(16,1,4) ZIG(17,2,3) ZIG(18,3,2) ZIG(19,4,1)\
    ZIG(20,5,0) ZIG(21,6,0) ZIG(22,5,1) ZIG(23,4,2)\
    ZIG(24,3,3) ZIG(25,2,4) ZIG(26,1,5) ZIG(27,0,6)\
    ZIG(28,0,7) ZIG(29,1,6) ZIG(30,2,5) ZIG(31,3,4)\
    ZIG(32,4,3) ZIG(33,5,2) ZIG(34,6,1) ZIG(35,7,0)\
    ZIG(36,7,1) ZIG(37,6,2) ZIG(38,5,3) ZIG(39,4,4)\
    ZIG(40,3,5) ZIG(41,2,6) ZIG(42,1,7) ZIG(43,2,7)\
    ZIG(44,3,6) ZIG(45,4,5) ZIG(46,5,4) ZIG(47,6,3)\
    ZIG(48,7,2) ZIG(49,7,3) ZIG(50,6,4) ZIG(51,5,5)\
    ZIG(52,4,6) ZIG(53,3,7) ZIG(54,4,7) ZIG(55,5,6)\
    ZIG(56,6,5) ZIG(57,7,4) ZIG(58,7,5) ZIG(59,6,6)\
    ZIG(60,5,7) ZIG(61,6,7) ZIG(62,7,6) ZIG(63,7,7)\

#define ZIGZAG8_FIELD\
    ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,2,0) ZIG( 3,0,1)\
    ZIG( 4,1,1) ZIG( 5,3,0) ZIG( 6,4,0) ZIG( 7,2,1)\
    ZIG( 8,0,2) ZIG( 9,3,1) ZIG(10,5,0) ZIG(11,6,0)\
    ZIG(12,7,0) ZIG(13,4,1) ZIG(14,1,2) ZIG(15,0,3)\
    ZIG(16,2,2) ZIG(17,5,1) ZIG(18,6,1) ZIG(19,7,1)\
    ZIG(20,3,2) ZIG(21,1,3) ZIG(22,0,4) ZIG(23,2,3)\
    ZIG(24,4,2) ZIG(25,5,2) ZIG(26,6,2) ZIG(27,7,2)\
    ZIG(28,3,3) ZIG(29,1,4) ZIG(30,0,5) ZIG(31,2,4)\
    ZIG(32,4,3) ZIG(33,5,3) ZIG(34,6,3) ZIG(35,7,3)\
    ZIG(36,3,4) ZIG(37,1,5) ZIG(38,0,6) ZIG(39,2,5)\
    ZIG(40,4,4) ZIG(41,5,4) ZIG(42,6,4) ZIG(43,7,4)\
    ZIG(44,3,5) ZIG(45,1,6) ZIG(46,2,6) ZIG(47,4,5)\
    ZIG(48,5,5) ZIG(49,6,5) ZIG(50,7,5) ZIG(51,3,6)\
    ZIG(52,0,7) ZIG(53,1,7) ZIG(54,4,6) ZIG(55,5,6)\
    ZIG(56,6,6) ZIG(57,7,6) ZIG(58,2,7) ZIG(59,3,7)\
    ZIG(60,4,7) ZIG(61,5,7) ZIG(62,6,7) ZIG(63,7,7)

#define ZIGZAG4_FRAME\
    ZIGDC( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
    ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
    ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2)\
    ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3)

#define ZIGZAG4_FIELD\
    ZIGDC( 0,0,0) ZIG( 1,1,0) ZIG( 2,0,1) ZIG( 3,2,0)\
    ZIG( 4,3,0) ZIG( 5,1,1) ZIG( 6,2,1) ZIG( 7,3,1)\
    ZIG( 8,0,2) ZIG( 9,1,2) ZIG(10,2,2) ZIG(11,3,2)\
    ZIG(12,0,3) ZIG(13,1,3) ZIG(14,2,3) ZIG(15,3,3)

static void zigzag_scan_8x8_frame( dctcoef level[64], dctcoef dct[64] )
{
    ZIGZAG8_FRAME
}

static void zigzag_scan_8x8_field( dctcoef level[64], dctcoef dct[64] )
{
    ZIGZAG8_FIELD
}

#undef ZIG
#define ZIG(i,y,x) level[i] = dct[x*4+y];
#define ZIGDC(i,y,x) ZIG(i,y,x)

static void zigzag_scan_4x4_frame( dctcoef level[16], dctcoef dct[16] )
{
    ZIGZAG4_FRAME
}

static void zigzag_scan_4x4_field( dctcoef level[16], dctcoef dct[16] )
{
    memcpy( level, dct, 2 * sizeof(dctcoef) );
    ZIG(2,0,1) ZIG(3,2,0) ZIG(4,3,0) ZIG(5,1,1)
    memcpy( level+6, dct+6, 10 * sizeof(dctcoef) );
}

#undef ZIG
#define ZIG(i,y,x) {\
    int oe = x+y*FENC_STRIDE;\
    int od = x+y*FDEC_STRIDE;\
    level[i] = p_src[oe] - p_dst[od];\
    nz |= level[i];\
}
#define COPY4x4\
    CPPIXEL_X4( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
    CPPIXEL_X4( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
    CPPIXEL_X4( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
    CPPIXEL_X4( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );
#define CPPIXEL_X8(dst,src) ( CPPIXEL_X4(dst,src), CPPIXEL_X4(dst+4,src+4) )
#define COPY8x8\
    CPPIXEL_X8( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+4*FDEC_STRIDE, p_src+4*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+5*FDEC_STRIDE, p_src+5*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+6*FDEC_STRIDE, p_src+6*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+7*FDEC_STRIDE, p_src+7*FENC_STRIDE );

static int zigzag_sub_4x4_frame( dctcoef level[16], const pixel *p_src, pixel *p_dst )
{
    int nz = 0;
    ZIGZAG4_FRAME
    COPY4x4
    return !!nz;
}

static int zigzag_sub_4x4_field( dctcoef level[16], const pixel *p_src, pixel *p_dst )
{
    int nz = 0;
    ZIGZAG4_FIELD
    COPY4x4
    return !!nz;
}

#undef ZIGDC
#define ZIGDC(i,y,x) {\
    int oe = x+y*FENC_STRIDE;\
    int od = x+y*FDEC_STRIDE;\
    *dc = p_src[oe] - p_dst[od];\
    level[0] = 0;\
}

static int zigzag_sub_4x4ac_frame( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc )
{
    int nz = 0;
    ZIGZAG4_FRAME
    COPY4x4
    return !!nz;
}

static int zigzag_sub_4x4ac_field( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc )
{
    int nz = 0;
    ZIGZAG4_FIELD
    COPY4x4
    return !!nz;
}

static int zigzag_sub_8x8_frame( dctcoef level[64], const pixel *p_src, pixel *p_dst )
{
    int nz = 0;
    ZIGZAG8_FRAME
    COPY8x8
    return !!nz;
}
static int zigzag_sub_8x8_field( dctcoef level[64], const pixel *p_src, pixel *p_dst )
{
    int nz = 0;
    ZIGZAG8_FIELD
    COPY8x8
    return !!nz;
}

#undef ZIG
#undef COPY4x4

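/* Added note: CAVLC has no 8x8 coefficient syntax, so an 8x8 block is coded
 * as four interleaved 4x4 scans. The helper below gathers every 4th
 * coefficient (src[i], src[i+4], ...) into one 16-coefficient run per i and
 * records a nonzero flag at that 4x4 block's position in the nnz map. */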
static void zigzag_interleave_8x8_cavlc( dctcoef *dst, dctcoef *src, uint8_t *nnz )
{
    for( int i = 0; i < 4; i++ )
    {
        int nz = 0;
        for( int j = 0; j < 16; j++ )
        {
            nz |= src[i+j*4];
            dst[i*16+j] = src[i+j*4];
        }
        nnz[(i&1) + (i>>1)*8] = !!nz;
    }
}

void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zigzag_function_t *pf_interlaced )
{
    pf_interlaced->scan_8x8   = zigzag_scan_8x8_field;
    pf_progressive->scan_8x8  = zigzag_scan_8x8_frame;
    pf_interlaced->scan_4x4   = zigzag_scan_4x4_field;
    pf_progressive->scan_4x4  = zigzag_scan_4x4_frame;
    pf_interlaced->sub_8x8    = zigzag_sub_8x8_field;
    pf_progressive->sub_8x8   = zigzag_sub_8x8_frame;
    pf_interlaced->sub_4x4    = zigzag_sub_4x4_field;
    pf_progressive->sub_4x4   = zigzag_sub_4x4_frame;
    pf_interlaced->sub_4x4ac  = zigzag_sub_4x4ac_field;
    pf_progressive->sub_4x4ac = zigzag_sub_4x4ac_frame;

#if HIGH_BIT_DEPTH
#if HAVE_MMX
    if( cpu&X264_CPU_SSE2 )
    {
        pf_interlaced->scan_4x4  = x264_zigzag_scan_4x4_field_sse2;
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_sse2;
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
    }
    if( cpu&X264_CPU_SSE4 )
        pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_sse4;
    if( cpu&X264_CPU_AVX )
        pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_avx;
#if ARCH_X86_64
    if( cpu&X264_CPU_AVX )
    {
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx;
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_avx;
    }
#endif // ARCH_X86_64
#endif // HAVE_MMX
#else
#if HAVE_MMX
    if( cpu&X264_CPU_MMX )
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_mmx;
    if( cpu&X264_CPU_MMX2 )
    {
        pf_interlaced->scan_4x4  = x264_zigzag_scan_4x4_field_mmx2;
        pf_interlaced->scan_8x8  = x264_zigzag_scan_8x8_field_mmx2;
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_mmx2;
    }
    if( cpu&X264_CPU_SSE2_IS_FAST )
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
    if( cpu&X264_CPU_SSSE3 )
    {
        pf_interlaced->sub_4x4   = x264_zigzag_sub_4x4_field_ssse3;
        pf_progressive->sub_4x4  = x264_zigzag_sub_4x4_frame_ssse3;
        pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_ssse3;
        pf_progressive->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_ssse3;
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_ssse3;
        if( cpu&X264_CPU_SHUFFLE_IS_FAST )
            pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3;
    }
    if( cpu&X264_CPU_AVX )
    {
        pf_interlaced->sub_4x4   = x264_zigzag_sub_4x4_field_avx;
        pf_progressive->sub_4x4  = x264_zigzag_sub_4x4_frame_avx;
#if ARCH_X86_64
        pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_avx;
        pf_progressive->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_avx;
#endif
        if( cpu&X264_CPU_SHUFFLE_IS_FAST )
            pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx;
    }
    if( cpu&X264_CPU_XOP )
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_xop;
#endif // HAVE_MMX
#if HAVE_ALTIVEC
    if( cpu&X264_CPU_ALTIVEC )
    {
        pf_interlaced->scan_4x4  = x264_zigzag_scan_4x4_field_altivec;
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_altivec;
    }
#endif
#if HAVE_ARMV6
    if( cpu&X264_CPU_NEON )
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_neon;
#endif
#endif // HIGH_BIT_DEPTH

    pf_interlaced->interleave_8x8_cavlc =
    pf_progressive->interleave_8x8_cavlc = zigzag_interleave_8x8_cavlc;
#if HAVE_MMX
#if HIGH_BIT_DEPTH
    if( cpu&X264_CPU_SSE2 )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
    }
    if( cpu&X264_CPU_AVX )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx;
    }
#else
    if( cpu&X264_CPU_MMX )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx;
    }
    if( cpu&X264_CPU_SHUFFLE_IS_FAST )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
    }

    if( cpu&X264_CPU_AVX )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx;
    }
#endif // HIGH_BIT_DEPTH
#endif
}
File: x264-snapshot-20120103-2245-stable/common/cpu.h
/*****************************************************************************
 * cpu.h: cpu detection
 *****************************************************************************
 * Copyright (C) 2004-2011 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#ifndef X264_CPU_H
#define X264_CPU_H

uint32_t x264_cpu_detect( void );
int      x264_cpu_num_processors( void );
void     x264_cpu_emms( void );
void     x264_cpu_sfence( void );
#if HAVE_MMX
/* There is no way to forbid the compiler from using float instructions
 * before the emms so miscompilation could theoretically occur in the
 * unlikely event that the compiler reorders emms and float instructions. */
#if HAVE_X86_INLINE_ASM
/* Clobbering memory makes the compiler less likely to reorder code. */
#define x264_emms() asm volatile( "emms":::"memory","st","st(1)","st(2)", \
                                  "st(3)","st(4)","st(5)","st(6)","st(7)" )
#else
#define x264_emms() x264_cpu_emms()
#endif
#else
#define x264_emms()
#endif
#define x264_sfence x264_cpu_sfence
void     x264_cpu_mask_misalign_sse( void );

/* kluge:
 * gcc can't give variables any greater alignment than the stack frame has.
 * We need 16 byte alignment for SSE2, so here we make sure that the stack is
 * aligned to 16 bytes.
 * gcc 4.2 introduced __attribute__((force_align_arg_pointer)) to fix this
 * problem, but I don't want to require such a new version.
 * This applies only to x86_32, since other architectures that need alignment
 * either have ABIs that ensure aligned stack, or don't support it at all. */
#if ARCH_X86 && HAVE_MMX
int x264_stack_align( void (*func)(), ... );
#define x264_stack_align(func,...) x264_stack_align((void (*)())func, __VA_ARGS__)
#else
#define x264_stack_align(func,...) func(__VA_ARGS__)
#endif
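
/* Illustrative use (added; the kernel name is hypothetical): a function that
 * keeps 16-byte-aligned SSE2 locals on the stack is entered through the
 * wrapper on x86_32 and called directly everywhere else:
 *
 *     static void my_sse2_kernel( dctcoef *buf, int len );
 *     ...
 *     x264_stack_align( my_sse2_kernel, buf, len );
 */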

typedef struct
{
    const char name[16];
    uint32_t flags;
} x264_cpu_name_t;
extern const x264_cpu_name_t x264_cpu_names[];

#endif
File: x264-snapshot-20120103-2245-stable/common/cpu.c
/*****************************************************************************
 * cpu.c: cpu detection
 *****************************************************************************
 * Copyright (C) 2003-2011 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *          Laurent Aimar <fenrir@via.ecp.fr>
 *          Jason Garrett-Glaser <darkshikari@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#define _GNU_SOURCE // for sched_getaffinity
#include "common.h"
#include "cpu.h"

#if HAVE_POSIXTHREAD && SYS_LINUX
#include <sched.h>
#endif
#if SYS_BEOS
#include <kernel/OS.h>
#endif
#if SYS_MACOSX || SYS_FREEBSD
#include <sys/types.h>
#include <sys/sysctl.h>
#endif
#if SYS_OPENBSD
#include <sys/param.h>
#include <sys/sysctl.h>
#include <machine/cpu.h>
#endif

const x264_cpu_name_t x264_cpu_names[] =
{
    {"Altivec",     X264_CPU_ALTIVEC},
//  {"MMX",         X264_CPU_MMX}, // we don't support asm on mmx1 cpus anymore
    {"MMX2",        X264_CPU_MMX|X264_CPU_MMX2},
    {"MMXEXT",      X264_CPU_MMX|X264_CPU_MMX2},
//  {"SSE",         X264_CPU_MMX|X264_CPU_MMX2|X264_CPU_SSE}, // there are no sse1 functions in x264
#define SSE2 X264_CPU_MMX|X264_CPU_MMX2|X264_CPU_SSE|X264_CPU_SSE2
    {"SSE2Slow",    SSE2|X264_CPU_SSE2_IS_SLOW},
    {"SSE2",        SSE2},
    {"SSE2Fast",    SSE2|X264_CPU_SSE2_IS_FAST},
    {"SSE3",        SSE2|X264_CPU_SSE3},
    {"SSSE3",       SSE2|X264_CPU_SSE3|X264_CPU_SSSE3},
    {"FastShuffle", SSE2|X264_CPU_SHUFFLE_IS_FAST},
    {"SSE4.1",      SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
    {"SSE4",        SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
    {"SSE4.2",      SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42},
    {"AVX",         SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42|X264_CPU_AVX},
    {"XOP",         SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42|X264_CPU_AVX|X264_CPU_XOP},
    {"FMA4",        SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42|X264_CPU_AVX|X264_CPU_FMA4},
#undef SSE2
    {"Cache32",         X264_CPU_CACHELINE_32},
    {"Cache64",         X264_CPU_CACHELINE_64},
    {"SSEMisalign",     X264_CPU_SSE_MISALIGN},
    {"LZCNT",           X264_CPU_LZCNT},
    {"Slow_mod4_stack", X264_CPU_STACK_MOD4},
    {"ARMv6",           X264_CPU_ARMV6},
    {"NEON",            X264_CPU_NEON},
    {"Fast_NEON_MRC",   X264_CPU_FAST_NEON_MRC},
    {"SlowCTZ",         X264_CPU_SLOW_CTZ},
    {"SlowAtom",        X264_CPU_SLOW_ATOM},
    {"", 0},
};
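
/* Hedged sketch (added; not in the original file): one way to turn a detected
 * flag mask back into readable names using the table above. x264's encoder
 * log does something similar, with extra logic to suppress subset entries. */
#if 0
#include <stdio.h>
static void x264_print_cpu_flags( uint32_t cpu )
{
    for( int i = 0; x264_cpu_names[i].flags; i++ )
        if( (cpu & x264_cpu_names[i].flags) == x264_cpu_names[i].flags )
            printf( "%s ", x264_cpu_names[i].name );
    printf( "\n" );
}
#endif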

#if (ARCH_PPC && SYS_LINUX) || (ARCH_ARM && !HAVE_NEON)
#include <signal.h>
#include <setjmp.h>
static sigjmp_buf jmpbuf;
static volatile sig_atomic_t canjump = 0;

static void sigill_handler( int sig )
{
    if( !canjump )
    {
        signal( sig, SIG_DFL );
        raise( sig );
    }

    canjump = 0;
    siglongjmp( jmpbuf, 1 );
}
#endif

#if HAVE_MMX
int x264_cpu_cpuid_test( void );
void x264_cpu_cpuid( uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx );
void x264_cpu_xgetbv( uint32_t op, uint32_t *eax, uint32_t *edx );

uint32_t x264_cpu_detect( void )
{
    uint32_t cpu = 0;
    uint32_t eax, ebx, ecx, edx;
    uint32_t vendor[4] = {0};
    uint32_t max_extended_cap;
    int cache;

#if !ARCH_X86_64
    if( !x264_cpu_cpuid_test() )
        return 0;
#endif

    x264_cpu_cpuid( 0, &eax, vendor+0, vendor+2, vendor+1 );
    if( eax == 0 )
        return 0;

    x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
    if( edx&0x00800000 )
        cpu |= X264_CPU_MMX;
    else
        return 0;
    if( edx&0x02000000 )
        cpu |= X264_CPU_MMX2|X264_CPU_SSE;
    if( edx&0x04000000 )
        cpu |= X264_CPU_SSE2;
    if( ecx&0x00000001 )
        cpu |= X264_CPU_SSE3;
    if( ecx&0x00000200 )
        cpu |= X264_CPU_SSSE3;
    if( ecx&0x00080000 )
        cpu |= X264_CPU_SSE4;
    if( ecx&0x00100000 )
        cpu |= X264_CPU_SSE42;
    /* Check OSXSAVE and AVX bits */
    if( (ecx&0x18000000) == 0x18000000 )
    {
        /* Check for OS support */
        x264_cpu_xgetbv( 0, &eax, &edx );
        if( (eax&0x6) == 0x6 )
            cpu |= X264_CPU_AVX;
    }

    if( cpu & X264_CPU_SSSE3 )
        cpu |= X264_CPU_SSE2_IS_FAST;
    if( cpu & X264_CPU_SSE4 )
        cpu |= X264_CPU_SHUFFLE_IS_FAST;

    x264_cpu_cpuid( 0x80000000, &eax, &ebx, &ecx, &edx );
    max_extended_cap = eax;

    if( !strcmp((char*)vendor, "AuthenticAMD") && max_extended_cap >= 0x80000001 )
    {
        cpu |= X264_CPU_SLOW_CTZ;
        x264_cpu_cpuid( 0x80000001, &eax, &ebx, &ecx, &edx );
        if( edx&0x00400000 )
            cpu |= X264_CPU_MMX2;
        if( cpu & X264_CPU_SSE2 )
        {
            if( ecx&0x00000040 ) /* SSE4a */
            {
                cpu |= X264_CPU_SSE2_IS_FAST;
                cpu |= X264_CPU_LZCNT;
                cpu |= X264_CPU_SHUFFLE_IS_FAST;
                cpu &= ~X264_CPU_SLOW_CTZ;
            }
            else
                cpu |= X264_CPU_SSE2_IS_SLOW;

            if( ecx&0x00000080 ) /* Misalign SSE */
            {
                cpu |= X264_CPU_SSE_MISALIGN;
                x264_cpu_mask_misalign_sse();
            }

            if( cpu & X264_CPU_AVX )
            {
                if( ecx&0x00000800 ) /* XOP */
                    cpu |= X264_CPU_XOP;
                if( ecx&0x00010000 ) /* FMA4 */
                    cpu |= X264_CPU_FMA4;
            }
        }
    }

    if( !strcmp((char*)vendor, "GenuineIntel") )
    {
        x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
        int family = ((eax>>8)&0xf) + ((eax>>20)&0xff);
        int model  = ((eax>>4)&0xf) + ((eax>>12)&0xf0);
        if( family == 6 )
        {
            /* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and 6/14 (core1 "yonah")
             * theoretically support sse2, but it's significantly slower than mmx for
             * almost all of x264's functions, so let's just pretend they don't. */
            if( model == 9 || model == 13 || model == 14 )
            {
                cpu &= ~(X264_CPU_SSE2|X264_CPU_SSE3);
                assert(!(cpu&(X264_CPU_SSSE3|X264_CPU_SSE4)));
            }
            /* Detect Atom CPU */
            else if( model == 28 )
            {
                cpu |= X264_CPU_SLOW_ATOM;
                cpu |= X264_CPU_SLOW_CTZ;
            }
            /* Some Penryns and Nehalems are pointlessly crippled (SSE4 disabled), so
             * detect them here. */
            else if( model >= 23 )
                cpu |= X264_CPU_SHUFFLE_IS_FAST;
        }
    }

    if( (!strcmp((char*)vendor, "GenuineIntel") || !strcmp((char*)vendor, "CyrixInstead")) && !(cpu&X264_CPU_SSE42))
    {
        /* cacheline size is specified in 3 places, any of which may be missing */
        x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
        cache = (ebx&0xff00)>>5; // clflush line size, in bytes (CPUID reports 8-byte units)
        if( !cache && max_extended_cap >= 0x80000006 )
        {
            x264_cpu_cpuid( 0x80000006, &eax, &ebx, &ecx, &edx );
            cache = ecx&0xff; // cacheline size
        }
        if( !cache )
        {
            // Cache and TLB Information
            static const char cache32_ids[] = { 0x0a, 0x0c, 0x41, 0x42, 0x43, 0x44, 0x45, 0x82, 0x83, 0x84, 0x85, 0 };
            static const char cache64_ids[] = { 0x22, 0x23, 0x25, 0x29, 0x2c, 0x46, 0x47, 0x49, 0x60, 0x66, 0x67,
                                                0x68, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7f, 0x86, 0x87, 0 };
            uint32_t buf[4];
            int max, i = 0;
            do {
                x264_cpu_cpuid( 2, buf+0, buf+1, buf+2, buf+3 );
                max = buf[0]&0xff;
                buf[0] &= ~0xff;
                for( int j = 0; j < 4; j++ )
                    if( !(buf[j]>>31) )
                        while( buf[j] )
                        {
                            if( strchr( cache32_ids, buf[j]&0xff ) )
                                cache = 32;
                            if( strchr( cache64_ids, buf[j]&0xff ) )
                                cache = 64;
                            buf[j] >>= 8;
                        }
            } while( ++i < max );
        }

        if( cache == 32 )
            cpu |= X264_CPU_CACHELINE_32;
        else if( cache == 64 )
            cpu |= X264_CPU_CACHELINE_64;
        else
            x264_log( NULL, X264_LOG_WARNING, "unable to determine cacheline size\n" );
    }

#if BROKEN_STACK_ALIGNMENT
    cpu |= X264_CPU_STACK_MOD4;
#endif

    return cpu;
}

#elif ARCH_PPC

#if SYS_MACOSX || SYS_OPENBSD
#include <sys/sysctl.h>
uint32_t x264_cpu_detect( void )
{
    /* Thank you VLC */
    uint32_t cpu = 0;
#if SYS_OPENBSD
    int      selectors[2] = { CTL_MACHDEP, CPU_ALTIVEC };
#else
    int      selectors[2] = { CTL_HW, HW_VECTORUNIT };
#endif
    int      has_altivec = 0;
    size_t   length = sizeof( has_altivec );
    int      error = sysctl( selectors, 2, &has_altivec, &length, NULL, 0 );

    if( error == 0 && has_altivec != 0 )
        cpu |= X264_CPU_ALTIVEC;

    return cpu;
}

#elif SYS_LINUX

uint32_t x264_cpu_detect( void )
{
    static void (*oldsig)( int );

    oldsig = signal( SIGILL, sigill_handler );
    if( sigsetjmp( jmpbuf, 1 ) )
    {
        signal( SIGILL, oldsig );
        return 0;
    }

    canjump = 1;
    asm volatile( "mtspr 256, %0\n\t"
                  "vand 0, 0, 0\n\t"
                  :
                  : "r"(-1) );
    canjump = 0;

    signal( SIGILL, oldsig );

    return X264_CPU_ALTIVEC;
}
#endif

#elif ARCH_ARM

void x264_cpu_neon_test( void );
int x264_cpu_fast_neon_mrc_test( void );

uint32_t x264_cpu_detect( void )
{
    int flags = 0;
#if HAVE_ARMV6
    flags |= X264_CPU_ARMV6;

    // don't do this hack if compiled with -mfpu=neon
#if !HAVE_NEON
    static void (* oldsig)( int );
    oldsig = signal( SIGILL, sigill_handler );
    if( sigsetjmp( jmpbuf, 1 ) )
    {
        signal( SIGILL, oldsig );
        return flags;
    }

    canjump = 1;
    x264_cpu_neon_test();
    canjump = 0;
    signal( SIGILL, oldsig );
#endif

    flags |= X264_CPU_NEON;

    // fast neon -> arm (Cortex-A9) detection relies on user access to the
    // cycle counter; this assumes ARMv7 performance counters.
    // NEON requires at least ARMv7, ARMv8 may require changes here, but
    // hopefully this hacky detection method will have been replaced by then.
    // Note that there is potential for a race condition if another program or
    // x264 instance disables or reinits the counters while x264 is using them,
    // which may result in incorrect detection and the counters stuck enabled.
    flags |= x264_cpu_fast_neon_mrc_test() ? X264_CPU_FAST_NEON_MRC : 0;
    // TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc)
#endif
    return flags;
}

#else

uint32_t x264_cpu_detect( void )
{
    return 0;
}

#endif

int x264_cpu_num_processors( void )
{
#if !HAVE_THREAD
    return 1;

#elif SYS_WINDOWS
    return x264_pthread_num_processors_np();

#elif SYS_CYGWIN
    return sysconf( _SC_NPROCESSORS_ONLN );

#elif SYS_LINUX
    cpu_set_t p_aff;
    memset( &p_aff, 0, sizeof(p_aff) );
    if( sched_getaffinity( 0, sizeof(p_aff), &p_aff ) )
        return 1;
#if HAVE_CPU_COUNT
    return CPU_COUNT(&p_aff);
#else
    int np = 0;
    for( unsigned int bit = 0; bit < 8 * sizeof(p_aff); bit++ )
        np += (((uint8_t *)&p_aff)[bit / 8] >> (bit % 8)) & 1;
    return np;
#endif

#elif SYS_BEOS
    system_info info;
    get_system_info( &info );
    return info.cpu_count;

#elif SYS_MACOSX || SYS_FREEBSD || SYS_OPENBSD
    int ncpu;
    size_t length = sizeof( ncpu );
#if SYS_OPENBSD
    int mib[2] = { CTL_HW, HW_NCPU };
    if( sysctl(mib, 2, &ncpu, &length, NULL, 0) )
#else
    if( sysctlbyname("hw.ncpu", &ncpu, &length, NULL, 0) )
#endif
    {
        ncpu = 1;
    }
    return ncpu;

#else
    return 1;
#endif
}
File: x264-snapshot-20120103-2245-stable/common/common.c
/*****************************************************************************
 * common.c: misc common functions
 *****************************************************************************
 * Copyright (C) 2003-2011 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *          Laurent Aimar <fenrir@via.ecp.fr>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "common.h"

#include <stdarg.h>
#include <ctype.h>

#if HAVE_MALLOC_H
#include <malloc.h>
#endif

const int x264_bit_depth = BIT_DEPTH;

const int x264_chroma_format = X264_CHROMA_FORMAT;

static void x264_log_default( void *, int, const char *, va_list );

/****************************************************************************
 * x264_param_default:
 ****************************************************************************/
void x264_param_default( x264_param_t *param )
{
    /* */
    memset( param, 0, sizeof( x264_param_t ) );

    /* CPU autodetect */
    param->cpu = x264_cpu_detect();
    param->i_threads = X264_THREADS_AUTO;
    param->b_deterministic = 1;
    param->i_sync_lookahead = X264_SYNC_LOOKAHEAD_AUTO;

    /* Video properties */
    param->i_csp           = X264_CHROMA_FORMAT ? X264_CHROMA_FORMAT : X264_CSP_I420;
    param->i_width         = 0;
    param->i_height        = 0;
    param->vui.i_sar_width = 0;
    param->vui.i_sar_height= 0;
    param->vui.i_overscan  = 0;  /* undef */
    param->vui.i_vidformat = 5;  /* undef */
    param->vui.b_fullrange = -1; /* default depends on input */
    param->vui.i_colorprim = 2;  /* undef */
    param->vui.i_transfer  = 2;  /* undef */
    param->vui.i_colmatrix = -1; /* default depends on input */
    param->vui.i_chroma_loc= 0;  /* left center */
    param->i_fps_num       = 25;
    param->i_fps_den       = 1;
    param->i_level_idc     = -1;
    param->i_slice_max_size = 0;
    param->i_slice_max_mbs = 0;
    param->i_slice_count = 0;

    /* Encoder parameters */
    param->i_frame_reference = 3;
    param->i_keyint_max = 250;
    param->i_keyint_min = X264_KEYINT_MIN_AUTO;
    param->i_bframe = 3;
    param->i_scenecut_threshold = 40;
    param->i_bframe_adaptive = X264_B_ADAPT_FAST;
    param->i_bframe_bias = 0;
    param->i_bframe_pyramid = X264_B_PYRAMID_NORMAL;
    param->b_interlaced = 0;
    param->b_constrained_intra = 0;

    param->b_deblocking_filter = 1;
    param->i_deblocking_filter_alphac0 = 0;
    param->i_deblocking_filter_beta = 0;

    param->b_cabac = 1;
    param->i_cabac_init_idc = 0;

    param->rc.i_rc_method = X264_RC_CRF;
    param->rc.i_bitrate = 0;
    param->rc.f_rate_tolerance = 1.0;
    param->rc.i_vbv_max_bitrate = 0;
    param->rc.i_vbv_buffer_size = 0;
    param->rc.f_vbv_buffer_init = 0.9;
    param->rc.i_qp_constant = 23 + QP_BD_OFFSET;
    param->rc.f_rf_constant = 23;
    param->rc.i_qp_min = 0;
    param->rc.i_qp_max = QP_MAX;
    param->rc.i_qp_step = 4;
    param->rc.f_ip_factor = 1.4;
    param->rc.f_pb_factor = 1.3;
    param->rc.i_aq_mode = X264_AQ_VARIANCE;
    param->rc.f_aq_strength = 1.0;
    param->rc.i_lookahead = 40;

    param->rc.b_stat_write = 0;
    param->rc.psz_stat_out = "x264_2pass.log";
    param->rc.b_stat_read = 0;
    param->rc.psz_stat_in = "x264_2pass.log";
    param->rc.f_qcompress = 0.6;
    param->rc.f_qblur = 0.5;
    param->rc.f_complexity_blur = 20;
    param->rc.i_zones = 0;
    param->rc.b_mb_tree = 1;

    /* Log */
    param->pf_log = x264_log_default;
    param->p_log_private = NULL;
    param->i_log_level = X264_LOG_INFO;

    /* */
    param->analyse.intra = X264_ANALYSE_I4x4 | X264_ANALYSE_I8x8;
    param->analyse.inter = X264_ANALYSE_I4x4 | X264_ANALYSE_I8x8
                         | X264_ANALYSE_PSUB16x16 | X264_ANALYSE_BSUB16x16;
    param->analyse.i_direct_mv_pred = X264_DIRECT_PRED_SPATIAL;
    param->analyse.i_me_method = X264_ME_HEX;
    param->analyse.f_psy_rd = 1.0;
    param->analyse.b_psy = 1;
    param->analyse.f_psy_trellis = 0;
    param->analyse.i_me_range = 16;
    param->analyse.i_subpel_refine = 7;
    param->analyse.b_mixed_references = 1;
    param->analyse.b_chroma_me = 1;
    param->analyse.i_mv_range_thread = -1;
    param->analyse.i_mv_range = -1; // set from level_idc
    param->analyse.i_chroma_qp_offset = 0;
    param->analyse.b_fast_pskip = 1;
    param->analyse.b_weighted_bipred = 1;
    param->analyse.i_weighted_pred = X264_WEIGHTP_SMART;
    param->analyse.b_dct_decimate = 1;
    param->analyse.b_transform_8x8 = 1;
    param->analyse.i_trellis = 1;
    param->analyse.i_luma_deadzone[0] = 21;
    param->analyse.i_luma_deadzone[1] = 11;
    param->analyse.b_psnr = 0;
    param->analyse.b_ssim = 0;

    param->i_cqm_preset = X264_CQM_FLAT;
    memset( param->cqm_4iy, 16, sizeof( param->cqm_4iy ) );
    memset( param->cqm_4py, 16, sizeof( param->cqm_4py ) );
    memset( param->cqm_4ic, 16, sizeof( param->cqm_4ic ) );
    memset( param->cqm_4pc, 16, sizeof( param->cqm_4pc ) );
    memset( param->cqm_8iy, 16, sizeof( param->cqm_8iy ) );
    memset( param->cqm_8py, 16, sizeof( param->cqm_8py ) );
    memset( param->cqm_8ic, 16, sizeof( param->cqm_8ic ) );
    memset( param->cqm_8pc, 16, sizeof( param->cqm_8pc ) );

    param->b_repeat_headers = 1;
    param->b_annexb = 1;
    param->b_aud = 0;
    param->b_vfr_input = 1;
    param->i_nal_hrd = X264_NAL_HRD_NONE;
    param->b_tff = 1;
    param->b_pic_struct = 0;
    param->b_fake_interlaced = 0;
    param->i_frame_packing = -1;
}
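
/* Hedged sketch (added; not part of the original file): typical caller-side
 * use of the defaults above. The field names are the public API; the
 * specific values are illustrative only. */
#if 0
static void example_setup( x264_param_t *param )
{
    x264_param_default( param );  /* start from the defaults set above */
    param->i_width  = 1280;
    param->i_height = 720;
    param->i_fps_num = 30;
    param->i_fps_den = 1;
    param->rc.f_rf_constant = 20; /* CRF: lower = higher quality */
}
#endif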

static int x264_param_apply_preset( x264_param_t *param, const char *preset )
{
    char *end;
    int i = strtol( preset, &end, 10 );
    if( *end == 0 && i >= 0 && i < sizeof(x264_preset_names)/sizeof(*x264_preset_names)-1 )
        preset = x264_preset_names[i];

    if( !strcasecmp( preset, "ultrafast" ) )
    {
        param->i_frame_reference = 1;
        param->i_scenecut_threshold = 0;
        param->b_deblocking_filter = 0;
        param->b_cabac = 0;
        param->i_bframe = 0;
        param->analyse.intra = 0;
        param->analyse.inter = 0;
        param->analyse.b_transform_8x8 = 0;
        param->analyse.i_me_method = X264_ME_DIA;
        param->analyse.i_subpel_refine = 0;
        param->rc.i_aq_mode = 0;
        param->analyse.b_mixed_references = 0;
        param->analyse.i_trellis = 0;
        param->i_bframe_adaptive = X264_B_ADAPT_NONE;
        param->rc.b_mb_tree = 0;
        param->analyse.i_weighted_pred = X264_WEIGHTP_NONE;
        param->analyse.b_weighted_bipred = 0;
        param->rc.i_lookahead = 0;
    }
    else if( !strcasecmp( preset, "superfast" ) )
    {
        param->analyse.inter = X264_ANALYSE_I8x8|X264_ANALYSE_I4x4;
        param->analyse.i_me_method = X264_ME_DIA;
        param->analyse.i_subpel_refine = 1;
        param->i_frame_reference = 1;
        param->analyse.b_mixed_references = 0;
        param->analyse.i_trellis = 0;
        param->rc.b_mb_tree = 0;
        param->analyse.i_weighted_pred = X264_WEIGHTP_SIMPLE;
        param->rc.i_lookahead = 0;
    }
    else if( !strcasecmp( preset, "veryfast" ) )
    {
        param->analyse.i_me_method = X264_ME_HEX;
        param->analyse.i_subpel_refine = 2;
        param->i_frame_reference = 1;
        param->analyse.b_mixed_references = 0;
        param->analyse.i_trellis = 0;
        param->analyse.i_weighted_pred = X264_WEIGHTP_SIMPLE;
        param->rc.i_lookahead = 10;
    }
    else if( !strcasecmp( preset, "faster" ) )
    {
        param->analyse.b_mixed_references = 0;
        param->i_frame_reference = 2;
        param->analyse.i_subpel_refine = 4;
        param->analyse.i_weighted_pred = X264_WEIGHTP_SIMPLE;
        param->rc.i_lookahead = 20;
    }
    else if( !strcasecmp( preset, "fast" ) )
    {
        param->i_frame_reference = 2;
        param->analyse.i_subpel_refine = 6;
        param->analyse.i_weighted_pred = X264_WEIGHTP_SIMPLE;
        param->rc.i_lookahead = 30;
    }
    else if( !strcasecmp( preset, "medium" ) )
    {
        /* Default is medium */
    }
    else if( !strcasecmp( preset, "slow" ) )
    {
        param->analyse.i_me_method = X264_ME_UMH;
        param->analyse.i_subpel_refine = 8;
        param->i_frame_reference = 5;
        param->i_bframe_adaptive = X264_B_ADAPT_TRELLIS;
        param->analyse.i_direct_mv_pred = X264_DIRECT_PRED_AUTO;
        param->rc.i_lookahead = 50;
    }
    else if( !strcasecmp( preset, "slower" ) )
    {
        param->analyse.i_me_method = X264_ME_UMH;
        param->analyse.i_subpel_refine = 9;
        param->i_frame_reference = 8;
        param->i_bframe_adaptive = X264_B_ADAPT_TRELLIS;
        param->analyse.i_direct_mv_pred = X264_DIRECT_PRED_AUTO;
        param->analyse.inter |= X264_ANALYSE_PSUB8x8;
        param->analyse.i_trellis = 2;
        param->rc.i_lookahead = 60;
    }
    else if( !strcasecmp( preset, "veryslow" ) )
    {
        param->analyse.i_me_method = X264_ME_UMH;
        param->analyse.i_subpel_refine = 10;
        param->analyse.i_me_range = 24;
        param->i_frame_reference = 16;
        param->i_bframe_adaptive = X264_B_ADAPT_TRELLIS;
        param->analyse.i_direct_mv_pred = X264_DIRECT_PRED_AUTO;
        param->analyse.inter |= X264_ANALYSE_PSUB8x8;
        param->analyse.i_trellis = 2;
        param->i_bframe = 8;
        param->rc.i_lookahead = 60;
    }
    else if( !strcasecmp( preset, "placebo" ) )
    {
        param->analyse.i_me_method = X264_ME_TESA;
        param->analyse.i_subpel_refine = 11;
        param->analyse.i_me_range = 24;
        param->i_frame_reference = 16;
        param->i_bframe_adaptive = X264_B_ADAPT_TRELLIS;
        param->analyse.i_direct_mv_pred = X264_DIRECT_PRED_AUTO;
        param->analyse.inter |= X264_ANALYSE_PSUB8x8;
        param->analyse.b_fast_pskip = 0;
        param->analyse.i_trellis = 2;
        param->i_bframe = 16;
        param->rc.i_lookahead = 60;
    }
    else
    {
        x264_log( NULL, X264_LOG_ERROR, "invalid preset '%s'\n", preset );
        return -1;
    }
    return 0;
}

static int x264_param_apply_tune( x264_param_t *param, const char *tune )
{
    char *tmp = x264_malloc( strlen( tune ) + 1 );
    if( !tmp )
        return -1;
    tmp = strcpy( tmp, tune );
    char *s = strtok( tmp, ",./-+" );
    int psy_tuning_used = 0;
    while( s )
    {
        if( !strncasecmp( s, "film", 4 ) )
        {
            if( psy_tuning_used++ ) goto psy_failure;
            param->i_deblocking_filter_alphac0 = -1;
            param->i_deblocking_filter_beta = -1;
            param->analyse.f_psy_trellis = 0.15;
        }
        else if( !strncasecmp( s, "animation", 9 ) )
        {
            if( psy_tuning_used++ ) goto psy_failure;
            param->i_frame_reference = param->i_frame_reference > 1 ? param->i_frame_reference*2 : 1;
            param->i_deblocking_filter_alphac0 = 1;
            param->i_deblocking_filter_beta = 1;
            param->analyse.f_psy_rd = 0.4;
            param->rc.f_aq_strength = 0.6;
            param->i_bframe += 2;
        }
        else if( !strncasecmp( s, "grain", 5 ) )
        {
            if( psy_tuning_used++ ) goto psy_failure;
            param->i_deblocking_filter_alphac0 = -2;
            param->i_deblocking_filter_beta = -2;
            param->analyse.f_psy_trellis = 0.25;
            param->analyse.b_dct_decimate = 0;
            param->rc.f_pb_factor = 1.1;
            param->rc.f_ip_factor = 1.1;
            param->rc.f_aq_strength = 0.5;
            param->analyse.i_luma_deadzone[0] = 6;
            param->analyse.i_luma_deadzone[1] = 6;
            param->rc.f_qcompress = 0.8;
        }
        else if( !strncasecmp( s, "stillimage", 5 ) )
        {
            if( psy_tuning_used++ ) goto psy_failure;
            param->i_deblocking_filter_alphac0 = -3;
            param->i_deblocking_filter_beta = -3;
            param->analyse.f_psy_rd = 2.0;
            param->analyse.f_psy_trellis = 0.7;
            param->rc.f_aq_strength = 1.2;
        }
        else if( !strncasecmp( s, "psnr", 4 ) )
        {
            if( psy_tuning_used++ ) goto psy_failure;
            param->rc.i_aq_mode = X264_AQ_NONE;
            param->analyse.b_psy = 0;
        }
        else if( !strncasecmp( s, "ssim", 4 ) )
        {
            if( psy_tuning_used++ ) goto psy_failure;
            param->rc.i_aq_mode = X264_AQ_AUTOVARIANCE;
            param->analyse.b_psy = 0;
        }
        else if( !strncasecmp( s, "fastdecode", 10 ) )
        {
            param->b_deblocking_filter = 0;
            param->b_cabac = 0;
            param->analyse.b_weighted_bipred = 0;
            param->analyse.i_weighted_pred = X264_WEIGHTP_NONE;
        }
        else if( !strncasecmp( s, "zerolatency", 11 ) )
        {
            param->rc.i_lookahead = 0;
            param->i_sync_lookahead = 0;
            param->i_bframe = 0;
            param->b_sliced_threads = 1;
            param->b_vfr_input = 0;
            param->rc.b_mb_tree = 0;
        }
        else if( !strncasecmp( s, "touhou", 6 ) )
        {
            if( psy_tuning_used++ ) goto psy_failure;
            param->i_frame_reference = param->i_frame_reference > 1 ? param->i_frame_reference*2 : 1;
            param->i_deblocking_filter_alphac0 = -1;
            param->i_deblocking_filter_beta = -1;
            param->analyse.f_psy_trellis = 0.2;
            param->rc.f_aq_strength = 1.3;
            if( param->analyse.inter & X264_ANALYSE_PSUB16x16 )
                param->analyse.inter |= X264_ANALYSE_PSUB8x8;
        }
        else
        {
            x264_log( NULL, X264_LOG_ERROR, "invalid tune '%s'\n", s );
            x264_free( tmp );
            return -1;
        }
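        /* Only reachable via the gotos above: warn about the extra psy
         * tuning, then fall through to fetch the next token. */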
        if( 0 )
        {
    psy_failure:
            x264_log( NULL, X264_LOG_WARNING, "only 1 psy tuning can be used: ignoring tune %s\n", s );
        }
        s = strtok( NULL, ",./-+" );
    }
    x264_free( tmp );
    return 0;
}
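
/* Illustrative sketch, not part of the original source: tune names are
 * comma-separated (the strtok delimiter set is ",./-+"), at most one psy
 * tuning is honored per call, and psy and non-psy tunes combine freely. */
#if 0
static void example_tunes( x264_param_t *param )
{
    x264_param_apply_tune( param, "film,fastdecode" ); /* both applied */
    x264_param_apply_tune( param, "film,grain" );      /* "grain" warned about and ignored */
}
#endif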

int x264_param_default_preset( x264_param_t *param, const char *preset, const char *tune )
{
    x264_param_default( param );

    if( preset && x264_param_apply_preset( param, preset ) < 0 )
        return -1;
    if( tune && x264_param_apply_tune( param, tune ) < 0 )
        return -1;
    return 0;
}
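
/* Illustrative sketch, not part of the original source: the intended call
 * order is preset/tune first, then individual overrides via
 * x264_param_parse(), then x264_param_apply_profile() last so the profile
 * can clamp whatever the earlier steps enabled. example_param_setup is a
 * hypothetical helper. */
#if 0
static int example_param_setup( x264_param_t *param )
{
    if( x264_param_default_preset( param, "veryfast", "zerolatency" ) < 0 )
        return -1;
    if( x264_param_parse( param, "keyint", "60" ) < 0 )      /* user overrides */
        return -1;
    if( x264_param_apply_profile( param, "baseline" ) < 0 )  /* profile last */
        return -1;
    return 0;
}
#endif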

void x264_param_apply_fastfirstpass( x264_param_t *param )
{
    /* Set faster options in case of turbo firstpass. */
    if( param->rc.b_stat_write && !param->rc.b_stat_read )
    {
        param->i_frame_reference = 1;
        param->analyse.b_transform_8x8 = 0;
        param->analyse.inter = 0;
        param->analyse.i_me_method = X264_ME_DIA;
        param->analyse.i_subpel_refine = X264_MIN( 2, param->analyse.i_subpel_refine );
        param->analyse.i_trellis = 0;
        param->analyse.b_fast_pskip = 1;
    }
}
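
/* Sketch, not in the original source: this only fires on a write-only stats
 * pass, i.e. pass 1 of a multi-pass encode ("pass" sets the two flags). */
#if 0
static void example_turbo_first_pass( x264_param_t *param )
{
    x264_param_parse( param, "pass", "1" );  /* b_stat_write=1, b_stat_read=0 */
    x264_param_apply_fastfirstpass( param ); /* downgrades analysis for speed */
}
#endif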

static int profile_string_to_int( const char *str )
{
    if( !strcasecmp( str, "baseline" ) )
        return PROFILE_BASELINE;
    if( !strcasecmp( str, "main" ) )
        return PROFILE_MAIN;
    if( !strcasecmp( str, "high" ) )
        return PROFILE_HIGH;
    if( !strcasecmp( str, "high10" ) )
        return PROFILE_HIGH10;
    if( !strcasecmp( str, "high422" ) )
        return PROFILE_HIGH422;
    if( !strcasecmp( str, "high444" ) )
        return PROFILE_HIGH444_PREDICTIVE;
    return -1;
}

int x264_param_apply_profile( x264_param_t *param, const char *profile )
{
    if( !profile )
        return 0;

    int p = profile_string_to_int( profile );
    if( p < 0 )
    {
        x264_log( NULL, X264_LOG_ERROR, "invalid profile: %s\n", profile );
        return -1;
    }
    if( p < PROFILE_HIGH444_PREDICTIVE && ((param->rc.i_rc_method == X264_RC_CQP && param->rc.i_qp_constant <= 0) ||
        (param->rc.i_rc_method == X264_RC_CRF && (int)(param->rc.f_rf_constant + QP_BD_OFFSET) <= 0)) )
    {
        x264_log( NULL, X264_LOG_ERROR, "%s profile doesn't support lossless\n", profile );
        return -1;
    }
    if( p < PROFILE_HIGH444_PREDICTIVE && (param->i_csp & X264_CSP_MASK) >= X264_CSP_I444 )
    {
        x264_log( NULL, X264_LOG_ERROR, "%s profile doesn't support 4:4:4\n", profile );
        return -1;
    }
    if( p < PROFILE_HIGH422 && (param->i_csp & X264_CSP_MASK) >= X264_CSP_I422 )
    {
        x264_log( NULL, X264_LOG_ERROR, "%s profile doesn't support 4:2:2\n", profile );
        return -1;
    }
    if( p < PROFILE_HIGH10 && BIT_DEPTH > 8 )
    {
        x264_log( NULL, X264_LOG_ERROR, "%s profile doesn't support a bit depth of %d\n", profile, BIT_DEPTH );
        return -1;
    }

    if( p == PROFILE_BASELINE )
    {
        param->analyse.b_transform_8x8 = 0;
        param->b_cabac = 0;
        param->i_cqm_preset = X264_CQM_FLAT;
        param->psz_cqm_file = NULL;
        param->i_bframe = 0;
        param->analyse.i_weighted_pred = X264_WEIGHTP_NONE;
        if( param->b_interlaced )
        {
            x264_log( NULL, X264_LOG_ERROR, "baseline profile doesn't support interlacing\n" );
            return -1;
        }
        if( param->b_fake_interlaced )
        {
            x264_log( NULL, X264_LOG_ERROR, "baseline profile doesn't support fake interlacing\n" );
            return -1;
        }
    }
    else if( p == PROFILE_MAIN )
    {
        param->analyse.b_transform_8x8 = 0;
        param->i_cqm_preset = X264_CQM_FLAT;
        param->psz_cqm_file = NULL;
    }
    return 0;
}

static int parse_enum( const char *arg, const char * const *names, int *dst )
{
    for( int i = 0; names[i]; i++ )
        if( !strcmp( arg, names[i] ) )
        {
            *dst = i;
            return 0;
        }
    return -1;
}

static int parse_cqm( const char *str, uint8_t *cqm, int length )
{
    int i = 0;
    do {
        int coef;
        if( !sscanf( str, "%d", &coef ) || coef < 1 || coef > 255 )
            return -1;
        cqm[i++] = coef;
    } while( i < length && (str = strchr( str, ',' )) && str++ );
    return (i == length) ? 0 : -1;
}
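
/* Sketch, not in the original source: parse_cqm() expects exactly `length`
 * comma-separated coefficients, each in the range 1..255. */
#if 0
static int example_cqm( void )
{
    uint8_t cqm[16];
    const char *flat = "16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16";
    return parse_cqm( flat, cqm, 16 ); /* 0 on success, -1 on bad input */
}
#endif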

static int x264_atobool( const char *str, int *b_error )
{
    if( !strcmp(str, "1") ||
        !strcmp(str, "true") ||
        !strcmp(str, "yes") )
        return 1;
    if( !strcmp(str, "0") ||
        !strcmp(str, "false") ||
        !strcmp(str, "no") )
        return 0;
    *b_error = 1;
    return 0;
}

static int x264_atoi( const char *str, int *b_error )
{
    char *end;
    int v = strtol( str, &end, 0 );
    if( end == str || *end != '\0' )
        *b_error = 1;
    return v;
}

static double x264_atof( const char *str, int *b_error )
{
    char *end;
    double v = strtod( str, &end );
    if( end == str || *end != '\0' )
        *b_error = 1;
    return v;
}
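
/* Sketch, not in the original source: the *b_error out-parameter is sticky,
 * so a caller can run a batch of conversions and test the flag once. */
#if 0
static int example_conversions( void )
{
    int b_error = 0;
    int qp     = x264_atoi( "26", &b_error );    /* ok */
    double crf = x264_atof( "23.5x", &b_error ); /* trailing junk sets b_error */
    (void)qp; (void)crf;
    return b_error ? -1 : 0;
}
#endif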

#define atobool(str) ( name_was_bool = 1, x264_atobool( str, &b_error ) )
#define atoi(str) x264_atoi( str, &b_error )
#define atof(str) x264_atof( str, &b_error )

int x264_param_parse( x264_param_t *p, const char *name, const char *value )
{
    char *name_buf = NULL;
    int b_error = 0;
    int name_was_bool;
    int value_was_null = !value;
    int i;

    if( !name )
        return X264_PARAM_BAD_NAME;
    if( !value )
        value = "true";

    if( value[0] == '=' )
        value++;

    if( strchr( name, '_' ) ) // s/_/-/g
    {
        char *c;
        name_buf = strdup(name);
        while( (c = strchr( name_buf, '_' )) )
            *c = '-';
        name = name_buf;
    }

    if( (!strncmp( name, "no-", 3 ) && (i = 3)) ||
        (!strncmp( name, "no", 2 ) && (i = 2)) )
    {
        name += i;
        value = atobool(value) ? "false" : "true";
    }
    name_was_bool = 0;

#define OPT(STR) else if( !strcmp( name, STR ) )
#define OPT2(STR0, STR1) else if( !strcmp( name, STR0 ) || !strcmp( name, STR1 ) )
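    /* Dispatch: the leading if(0) lets every OPT()/OPT2() case begin with
     * "else if", so the chain below is pure macro-generated string matching. */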
    if(0);
    OPT("asm")
    {
        p->cpu = isdigit(value[0]) ? atoi(value) :
                 !strcmp(value, "auto") || atobool(value) ? x264_cpu_detect() : 0;
        if( b_error )
        {
            char *buf = strdup(value);
            char *tok, UNUSED *saveptr=NULL, *init;
            b_error = 0;
            p->cpu = 0;
            for( init=buf; (tok=strtok_r(init, ",", &saveptr)); init=NULL )
            {
                for( i=0; x264_cpu_names[i].flags && strcasecmp(tok, x264_cpu_names[i].name); i++ );
                p->cpu |= x264_cpu_names[i].flags;
                if( !x264_cpu_names[i].flags )
                    b_error = 1;
            }
            free( buf );
            if( p->cpu & X264_CPU_SSSE3 )
                p->cpu |= X264_CPU_SSE2_IS_FAST;
            if( p->cpu & X264_CPU_SSE4 )
                p->cpu |= X264_CPU_SHUFFLE_IS_FAST;
        }
    }
    OPT("threads")
    {
        if( !strcmp(value, "auto") )
            p->i_threads = X264_THREADS_AUTO;
        else
            p->i_threads = atoi(value);
    }
    OPT("sliced-threads")
        p->b_sliced_threads = atobool(value);
    OPT("sync-lookahead")
    {
        if( !strcmp(value, "auto") )
            p->i_sync_lookahead = X264_SYNC_LOOKAHEAD_AUTO;
        else
            p->i_sync_lookahead = atoi(value);
    }
    OPT2("deterministic", "n-deterministic")
        p->b_deterministic = atobool(value);
    OPT("cpu-independent")
        p->b_cpu_independent = atobool(value);
    OPT2("level", "level-idc")
    {
        if( !strcmp(value, "1b") )
            p->i_level_idc = 9;
        else if( atof(value) < 6 )
            p->i_level_idc = (int)(10*atof(value)+.5);
        else
            p->i_level_idc = atoi(value);
    }
    OPT("bluray-compat")
        p->b_bluray_compat = atobool(value);
    OPT("sar")
    {
        b_error = ( 2 != sscanf( value, "%d:%d", &p->vui.i_sar_width, &p->vui.i_sar_height ) &&
                    2 != sscanf( value, "%d/%d", &p->vui.i_sar_width, &p->vui.i_sar_height ) );
    }
    OPT("overscan")
        b_error |= parse_enum( value, x264_overscan_names, &p->vui.i_overscan );
    OPT("videoformat")
        b_error |= parse_enum( value, x264_vidformat_names, &p->vui.i_vidformat );
    OPT("fullrange")
        b_error |= parse_enum( value, x264_fullrange_names, &p->vui.b_fullrange );
    OPT("colorprim")
        b_error |= parse_enum( value, x264_colorprim_names, &p->vui.i_colorprim );
    OPT("transfer")
        b_error |= parse_enum( value, x264_transfer_names, &p->vui.i_transfer );
    OPT("colormatrix")
        b_error |= parse_enum( value, x264_colmatrix_names, &p->vui.i_colmatrix );
    OPT("chromaloc")
    {
        p->vui.i_chroma_loc = atoi(value);
        b_error = ( p->vui.i_chroma_loc < 0 || p->vui.i_chroma_loc > 5 );
    }
    OPT("fps")
    {
        if( sscanf( value, "%u/%u", &p->i_fps_num, &p->i_fps_den ) == 2 )
            ;
        else
        {
            float fps = atof(value);
            p->i_fps_num = (int)(fps * 1000 + .5);
            p->i_fps_den = 1000;
        }
    }
    OPT2("ref", "frameref")
        p->i_frame_reference = atoi(value);
    OPT("dpb-size")
        p->i_dpb_size = atoi(value);
    OPT("keyint")
    {
        if( strstr( value, "infinite" ) )
            p->i_keyint_max = X264_KEYINT_MAX_INFINITE;
        else
            p->i_keyint_max = atoi(value);
    }
    OPT2("min-keyint", "keyint-min")
    {
        p->i_keyint_min = atoi(value);
        if( p->i_keyint_max < p->i_keyint_min )
            p->i_keyint_max = p->i_keyint_min;
    }
    OPT("scenecut")
    {
        p->i_scenecut_threshold = atobool(value);
        if( b_error || p->i_scenecut_threshold )
        {
            b_error = 0;
            p->i_scenecut_threshold = atoi(value);
        }
    }
    OPT("intra-refresh")
        p->b_intra_refresh = atobool(value);
    OPT("bframes")
        p->i_bframe = atoi(value);
    OPT("b-adapt")
    {
        p->i_bframe_adaptive = atobool(value);
        if( b_error )
        {
            b_error = 0;
            p->i_bframe_adaptive = atoi(value);
        }
    }
    OPT("b-bias")
        p->i_bframe_bias = atoi(value);
    OPT("b-pyramid")
    {
        b_error |= parse_enum( value, x264_b_pyramid_names, &p->i_bframe_pyramid );
        if( b_error )
        {
            b_error = 0;
            p->i_bframe_pyramid = atoi(value);
        }
    }
    OPT("open-gop")
        p->b_open_gop = atobool(value);
    OPT("nf")
        p->b_deblocking_filter = !atobool(value);
    OPT2("filter", "deblock")
    {
        if( 2 == sscanf( value, "%d:%d", &p->i_deblocking_filter_alphac0, &p->i_deblocking_filter_beta ) ||
            2 == sscanf( value, "%d,%d", &p->i_deblocking_filter_alphac0, &p->i_deblocking_filter_beta ) )
        {
            p->b_deblocking_filter = 1;
        }
        else if( sscanf( value, "%d", &p->i_deblocking_filter_alphac0 ) )
        {
            p->b_deblocking_filter = 1;
            p->i_deblocking_filter_beta = p->i_deblocking_filter_alphac0;
        }
        else
            p->b_deblocking_filter = atobool(value);
    }
    OPT("slice-max-size")
        p->i_slice_max_size = atoi(value);
    OPT("slice-max-mbs")
        p->i_slice_max_mbs = atoi(value);
    OPT("slices")
        p->i_slice_count = atoi(value);
    OPT("cabac")
        p->b_cabac = atobool(value);
    OPT("cabac-idc")
        p->i_cabac_init_idc = atoi(value);
    OPT("interlaced")
        p->b_interlaced = atobool(value);
    OPT("tff")
        p->b_interlaced = p->b_tff = atobool(value);
    OPT("bff")
    {
        p->b_interlaced = atobool(value);
        p->b_tff = !p->b_interlaced;
    }
    OPT("constrained-intra")
        p->b_constrained_intra = atobool(value);
    OPT("cqm")
    {
        if( strstr( value, "flat" ) )
            p->i_cqm_preset = X264_CQM_FLAT;
        else if( strstr( value, "jvt" ) )
            p->i_cqm_preset = X264_CQM_JVT;
        else
            p->psz_cqm_file = strdup(value);
    }
    OPT("cqmfile")
        p->psz_cqm_file = strdup(value);
    OPT("cqm4")
    {
        p->i_cqm_preset = X264_CQM_CUSTOM;
        b_error |= parse_cqm( value, p->cqm_4iy, 16 );
        b_error |= parse_cqm( value, p->cqm_4py, 16 );
        b_error |= parse_cqm( value, p->cqm_4ic, 16 );
        b_error |= parse_cqm( value, p->cqm_4pc, 16 );
    }
    OPT("cqm8")
    {
        p->i_cqm_preset = X264_CQM_CUSTOM;
        b_error |= parse_cqm( value, p->cqm_8iy, 64 );
        b_error |= parse_cqm( value, p->cqm_8py, 64 );
        b_error |= parse_cqm( value, p->cqm_8ic, 64 );
        b_error |= parse_cqm( value, p->cqm_8pc, 64 );
    }
    OPT("cqm4i")
    {
        p->i_cqm_preset = X264_CQM_CUSTOM;
        b_error |= parse_cqm( value, p->cqm_4iy, 16 );
        b_error |= parse_cqm( value, p->cqm_4ic, 16 );
    }
    OPT("cqm4p")
    {
        p->i_cqm_preset = X264_CQM_CUSTOM;
        b_error |= parse_cqm( value, p->cqm_4py, 16 );
        b_error |= parse_cqm( value, p->cqm_4pc, 16 );
    }
    OPT("cqm4iy")
    {
        p->i_cqm_preset = X264_CQM_CUSTOM;
        b_error |= parse_cqm( value, p->cqm_4iy, 16 );
    }
    OPT("cqm4ic")
    {
        p->i_cqm_preset = X264_CQM_CUSTOM;
        b_error |= parse_cqm( value, p->cqm_4ic, 16 );
    }
    OPT("cqm4py")
    {
        p->i_cqm_preset = X264_CQM_CUSTOM;
        b_error |= parse_cqm( value, p->cqm_4py, 16 );
    }
    OPT("cqm4pc")
    {
        p->i_cqm_preset = X264_CQM_CUSTOM;
        b_error |= parse_cqm( value, p->cqm_4pc, 16 );
    }
    OPT("cqm8i")
    {
        p->i_cqm_preset = X264_CQM_CUSTOM;
        b_error |= parse_cqm( value, p->cqm_8iy, 64 );
        b_error |= parse_cqm( value, p->cqm_8ic, 64 );
    }
    OPT("cqm8p")
    {
        p->i_cqm_preset = X264_CQM_CUSTOM;
        b_error |= parse_cqm( value, p->cqm_8py, 64 );
        b_error |= parse_cqm( value, p->cqm_8pc, 64 );
    }
    OPT("log")
        p->i_log_level = atoi(value);
#if HAVE_VISUALIZE
    OPT("visualize")
        p->b_visualize = atobool(value);
#endif
    OPT("dump-yuv")
        p->psz_dump_yuv = strdup(value);
    OPT2("analyse", "partitions")
    {
        p->analyse.inter = 0;
        if( strstr( value, "none" ) )  p->analyse.inter =  0;
        if( strstr( value, "all" ) )   p->analyse.inter = ~0;

        if( strstr( value, "i4x4" ) )  p->analyse.inter |= X264_ANALYSE_I4x4;
        if( strstr( value, "i8x8" ) )  p->analyse.inter |= X264_ANALYSE_I8x8;
        if( strstr( value, "p8x8" ) )  p->analyse.inter |= X264_ANALYSE_PSUB16x16;
        if( strstr( value, "p4x4" ) )  p->analyse.inter |= X264_ANALYSE_PSUB8x8;
        if( strstr( value, "b8x8" ) )  p->analyse.inter |= X264_ANALYSE_BSUB16x16;
    }
    OPT("8x8dct")
        p->analyse.b_transform_8x8 = atobool(value);
    OPT2("weightb", "weight-b")
        p->analyse.b_weighted_bipred = atobool(value);
    OPT("weightp")
        p->analyse.i_weighted_pred = atoi(value);
    OPT2("direct", "direct-pred")
        b_error |= parse_enum( value, x264_direct_pred_names, &p->analyse.i_direct_mv_pred );
    OPT("chroma-qp-offset")
        p->analyse.i_chroma_qp_offset = atoi(value);
    OPT("me")
        b_error |= parse_enum( value, x264_motion_est_names, &p->analyse.i_me_method );
    OPT2("merange", "me-range")
        p->analyse.i_me_range = atoi(value);
    OPT2("mvrange", "mv-range")
        p->analyse.i_mv_range = atoi(value);
    OPT2("mvrange-thread", "mv-range-thread")
        p->analyse.i_mv_range_thread = atoi(value);
    OPT2("subme", "subq")
        p->analyse.i_subpel_refine = atoi(value);
    OPT("psy-rd")
    {
        if( 2 == sscanf( value, "%f:%f", &p->analyse.f_psy_rd, &p->analyse.f_psy_trellis ) ||
            2 == sscanf( value, "%f,%f", &p->analyse.f_psy_rd, &p->analyse.f_psy_trellis ) ||
            2 == sscanf( value, "%f|%f", &p->analyse.f_psy_rd, &p->analyse.f_psy_trellis ))
        { }
        else if( sscanf( value, "%f", &p->analyse.f_psy_rd ) )
        {
            p->analyse.f_psy_trellis = 0;
        }
        else
        {
            p->analyse.f_psy_rd = 0;
            p->analyse.f_psy_trellis = 0;
        }
    }
    OPT("psy")
        p->analyse.b_psy = atobool(value);
    OPT("chroma-me")
        p->analyse.b_chroma_me = atobool(value);
    OPT("mixed-refs")
        p->analyse.b_mixed_references = atobool(value);
    OPT("trellis")
        p->analyse.i_trellis = atoi(value);
    OPT("fast-pskip")
        p->analyse.b_fast_pskip = atobool(value);
    OPT("dct-decimate")
        p->analyse.b_dct_decimate = atobool(value);
    OPT("deadzone-inter")
        p->analyse.i_luma_deadzone[0] = atoi(value);
    OPT("deadzone-intra")
        p->analyse.i_luma_deadzone[1] = atoi(value);
    OPT("nr")
        p->analyse.i_noise_reduction = atoi(value);
    OPT("bitrate")
    {
        p->rc.i_bitrate = atoi(value);
        p->rc.i_rc_method = X264_RC_ABR;
    }
    OPT2("qp", "qp_constant")
    {
        p->rc.i_qp_constant = atoi(value);
        p->rc.i_rc_method = X264_RC_CQP;
    }
    OPT("crf")
    {
        p->rc.f_rf_constant = atof(value);
        p->rc.i_rc_method = X264_RC_CRF;
    }
    OPT("crf-max")
        p->rc.f_rf_constant_max = atof(value);
    OPT("rc-lookahead")
        p->rc.i_lookahead = atoi(value);
    OPT2("qpmin", "qp-min")
        p->rc.i_qp_min = atoi(value);
    OPT2("qpmax", "qp-max")
        p->rc.i_qp_max = atoi(value);
    OPT2("qpstep", "qp-step")
        p->rc.i_qp_step = atoi(value);
    OPT("ratetol")
        p->rc.f_rate_tolerance = !strncmp("inf", value, 3) ? 1e9 : atof(value);
    OPT("vbv-maxrate")
        p->rc.i_vbv_max_bitrate = atoi(value);
    OPT("vbv-bufsize")
        p->rc.i_vbv_buffer_size = atoi(value);
    OPT("vbv-init")
        p->rc.f_vbv_buffer_init = atof(value);
    OPT2("ipratio", "ip-factor")
        p->rc.f_ip_factor = atof(value);
    OPT2("pbratio", "pb-factor")
        p->rc.f_pb_factor = atof(value);
    OPT("aq-mode")
        p->rc.i_aq_mode = atoi(value);
    OPT("aq-strength")
        p->rc.f_aq_strength = atof(value);
    OPT("pass")
    {
        int pass = x264_clip3( atoi(value), 0, 3 );
        p->rc.b_stat_write = pass & 1;
        p->rc.b_stat_read = pass & 2;
    }
    OPT("stats")
    {
        p->rc.psz_stat_in = strdup(value);
        p->rc.psz_stat_out = strdup(value);
    }
    OPT("qcomp")
        p->rc.f_qcompress = atof(value);
    OPT("mbtree")
        p->rc.b_mb_tree = atobool(value);
    OPT("qblur")
        p->rc.f_qblur = atof(value);
    OPT2("cplxblur", "cplx-blur")
        p->rc.f_complexity_blur = atof(value);
    OPT("zones")
        p->rc.psz_zones = strdup(value);
    OPT("crop-rect")
        b_error |= sscanf( value, "%u,%u,%u,%u", &p->crop_rect.i_left, &p->crop_rect.i_top,
                                                 &p->crop_rect.i_right, &p->crop_rect.i_bottom ) != 4;
    OPT("psnr")
        p->analyse.b_psnr = atobool(value);
    OPT("ssim")
        p->analyse.b_ssim = atobool(value);
    OPT("aud")
        p->b_aud = atobool(value);
    OPT("sps-id")
        p->i_sps_id = atoi(value);
    OPT("global-header")
        p->b_repeat_headers = !atobool(value);
    OPT("repeat-headers")
        p->b_repeat_headers = atobool(value);
    OPT("annexb")
        p->b_annexb = atobool(value);
    OPT("force-cfr")
        p->b_vfr_input = !atobool(value);
    OPT("nal-hrd")
        b_error |= parse_enum( value, x264_nal_hrd_names, &p->i_nal_hrd );
    OPT("pic-struct")
        p->b_pic_struct = atobool(value);
    OPT("fake-interlaced")
        p->b_fake_interlaced = atobool(value);
    OPT("frame-packing")
        p->i_frame_packing = atoi(value);
    else
        return X264_PARAM_BAD_NAME;
#undef OPT
#undef OPT2
#undef atobool
#undef atoi
#undef atof

    if( name_buf )
        free( name_buf );

    b_error |= value_was_null && !name_was_bool;
    return b_error ? X264_PARAM_BAD_VALUE : 0;
}
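
/* Sketch, not in the original source: underscores and dashes in option names
 * are interchangeable, a "no" prefix negates boolean options, and a NULL
 * value is treated as "true" (and rejected for non-boolean options). */
#if 0
static void example_param_parse( x264_param_t *param )
{
    x264_param_parse( param, "no-cabac", NULL ); /* b_cabac = 0 */
    x264_param_parse( param, "me_range", "24" ); /* same as "merange=24" */
    x264_param_parse( param, "bframes", NULL );  /* X264_PARAM_BAD_VALUE */
}
#endif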

/****************************************************************************
 * x264_log:
 ****************************************************************************/
void x264_log( x264_t *h, int i_level, const char *psz_fmt, ... )
{
    if( !h || i_level <= h->param.i_log_level )
    {
        va_list arg;
        va_start( arg, psz_fmt );
        if( !h )
            x264_log_default( NULL, i_level, psz_fmt, arg );
        else
            h->param.pf_log( h->param.p_log_private, i_level, psz_fmt, arg );
        va_end( arg );
    }
}

static void x264_log_default( void *p_unused, int i_level, const char *psz_fmt, va_list arg )
{
    char *psz_prefix;
    switch( i_level )
    {
        case X264_LOG_ERROR:
            psz_prefix = "error";
            break;
        case X264_LOG_WARNING:
            psz_prefix = "warning";
            break;
        case X264_LOG_INFO:
            psz_prefix = "info";
            break;
        case X264_LOG_DEBUG:
            psz_prefix = "debug";
            break;
        default:
            psz_prefix = "unknown";
            break;
    }
    fprintf( stderr, "x264 [%s]: ", psz_prefix );
    vfprintf( stderr, psz_fmt, arg );
}

/****************************************************************************
 * x264_picture_init:
 ****************************************************************************/
void x264_picture_init( x264_picture_t *pic )
{
    memset( pic, 0, sizeof( x264_picture_t ) );
    pic->i_type = X264_TYPE_AUTO;
    pic->i_qpplus1 = X264_QP_AUTO;
    pic->i_pic_struct = PIC_STRUCT_AUTO;
}

/****************************************************************************
 * x264_picture_alloc:
 ****************************************************************************/
int x264_picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_height )
{
    typedef struct
    {
        int planes;
        int width_fix8[3];
        int height_fix8[3];
    } x264_csp_tab_t;

    static const x264_csp_tab_t x264_csp_tab[] =
    {
        [X264_CSP_I420] = { 3, { 256*1, 256/2, 256/2 }, { 256*1, 256/2, 256/2 } },
        [X264_CSP_YV12] = { 3, { 256*1, 256/2, 256/2 }, { 256*1, 256/2, 256/2 } },
        [X264_CSP_NV12] = { 2, { 256*1, 256*1 },        { 256*1, 256/2 },       },
        [X264_CSP_I422] = { 3, { 256*1, 256/2, 256/2 }, { 256*1, 256*1, 256*1 } },
        [X264_CSP_YV16] = { 3, { 256*1, 256/2, 256/2 }, { 256*1, 256*1, 256*1 } },
        [X264_CSP_NV16] = { 2, { 256*1, 256*1 },        { 256*1, 256*1 },       },
        [X264_CSP_I444] = { 3, { 256*1, 256*1, 256*1 }, { 256*1, 256*1, 256*1 } },
        [X264_CSP_YV24] = { 3, { 256*1, 256*1, 256*1 }, { 256*1, 256*1, 256*1 } },
        [X264_CSP_BGR]  = { 1, { 256*3 },               { 256*1 },              },
        [X264_CSP_BGRA] = { 1, { 256*4 },               { 256*1 },              },
        [X264_CSP_RGB]  = { 1, { 256*3 },               { 256*1 },              },
    };

    int csp = i_csp & X264_CSP_MASK;
    if( csp <= X264_CSP_NONE || csp >= X264_CSP_MAX )
        return -1;
    x264_picture_init( pic );
    pic->img.i_csp = i_csp;
    pic->img.i_plane = x264_csp_tab[csp].planes;
    int depth_factor = i_csp & X264_CSP_HIGH_DEPTH ? 2 : 1;
    int plane_offset[3] = {0};
    int frame_size = 0;
    for( int i = 0; i < pic->img.i_plane; i++ )
    {
        int stride = (((int64_t)i_width * x264_csp_tab[csp].width_fix8[i]) >> 8) * depth_factor;
        int plane_size = (((int64_t)i_height * x264_csp_tab[csp].height_fix8[i]) >> 8) * stride;
        pic->img.i_stride[i] = stride;
        plane_offset[i] = frame_size;
        frame_size += plane_size;
    }
    pic->img.plane[0] = x264_malloc( frame_size );
    if( !pic->img.plane[0] )
        return -1;
    for( int i = 1; i < pic->img.i_plane; i++ )
        pic->img.plane[i] = pic->img.plane[0] + plane_offset[i];
    return 0;
}
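
/* Sketch, not in the original source: the typical allocate/fill/free cycle
 * for a caller-owned input picture. */
#if 0
static int example_picture( void )
{
    x264_picture_t pic;
    if( x264_picture_alloc( &pic, X264_CSP_I420, 1280, 720 ) < 0 )
        return -1;
    /* fill pic.img.plane[0..2] (Y, U, V) using pic.img.i_stride[i] ... */
    x264_picture_clean( &pic );
    return 0;
}
#endif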

/****************************************************************************
 * x264_picture_clean:
 ****************************************************************************/
void x264_picture_clean( x264_picture_t *pic )
{
    x264_free( pic->img.plane[0] );

    /* just to be safe */
    memset( pic, 0, sizeof( x264_picture_t ) );
}

/****************************************************************************
 * x264_malloc:
 ****************************************************************************/
void *x264_malloc( int i_size )
{
    uint8_t *align_buf = NULL;
#if SYS_MACOSX || (SYS_WINDOWS && ARCH_X86_64)
    /* Mac OS X and Win x64 always return 16-byte-aligned memory */
    align_buf = malloc( i_size );
#elif HAVE_MALLOC_H
    align_buf = memalign( 16, i_size );
#else
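    /* Fallback: over-allocate by 15 bytes plus one pointer, round the user
     * pointer up to the next 16-byte boundary, and stash the original
     * malloc() pointer just below it so x264_free() can recover it. */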
    uint8_t *buf = malloc( i_size + 15 + sizeof(void **) );
    if( buf )
    {
        align_buf = buf + 15 + sizeof(void **);
        align_buf -= (intptr_t) align_buf & 15;
        *( (void **) ( align_buf - sizeof(void **) ) ) = buf;
    }
#endif
    if( !align_buf )
        x264_log( NULL, X264_LOG_ERROR, "malloc of size %d failed\n", i_size );
    return align_buf;
}

/****************************************************************************
 * x264_free:
 ****************************************************************************/
void x264_free( void *p )
{
    if( p )
    {
#if HAVE_MALLOC_H || SYS_MACOSX || (SYS_WINDOWS && ARCH_X86_64)
        free( p );
#else
        free( *( ( ( void **) p ) - 1 ) );
#endif
    }
}

/****************************************************************************
 * x264_reduce_fraction:
 ****************************************************************************/
#define REDUCE_FRACTION( name, type )\
void name( type *n, type *d )\
{                   \
    type a = *n;    \
    type b = *d;    \
    type c;         \
    if( !a || !b )  \
        return;     \
    c = a % b;      \
    while( c )      \
    {               \
        a = b;      \
        b = c;      \
        c = a % b;  \
    }               \
    *n /= b;        \
    *d /= b;        \
}

REDUCE_FRACTION( x264_reduce_fraction  , uint32_t )
REDUCE_FRACTION( x264_reduce_fraction64, uint64_t )
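
/* Sketch, not in the original source: the reduction is an in-place Euclid's
 * algorithm; e.g. a 1000/40 timebase reduces to 25/1. */
#if 0
static void example_reduce( void )
{
    uint32_t num = 1000, den = 40;
    x264_reduce_fraction( &num, &den ); /* num == 25, den == 1 */
}
#endif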

/****************************************************************************
 * x264_slurp_file:
 ****************************************************************************/
char *x264_slurp_file( const char *filename )
{
    int b_error = 0;
    size_t i_size;
    char *buf;
    FILE *fh = fopen( filename, "rb" );
    if( !fh )
        return NULL;
    b_error |= fseek( fh, 0, SEEK_END ) < 0;
    b_error |= ( i_size = ftell( fh ) ) <= 0;
    b_error |= fseek( fh, 0, SEEK_SET ) < 0;
    if( b_error )
        goto error;
    buf = x264_malloc( i_size+2 );
    if( !buf )
        goto error;
    b_error |= fread( buf, 1, i_size, fh ) != i_size;
    if( buf[i_size-1] != '\n' )
        buf[i_size++] = '\n';
    buf[i_size] = 0;
    fclose( fh );
    if( b_error )
    {
        x264_free( buf );
        return NULL;
    }
    return buf;
error:
    fclose( fh );
    return NULL;
}

/****************************************************************************
 * x264_param2string:
 ****************************************************************************/
char *x264_param2string( x264_param_t *p, int b_res )
{
    int len = 1000;
    char *buf, *s;
    if( p->rc.psz_zones )
        len += strlen(p->rc.psz_zones);
    buf = s = x264_malloc( len );
    if( !buf )
        return NULL;

    if( b_res )
    {
        s += sprintf( s, "%dx%d ", p->i_width, p->i_height );
        s += sprintf( s, "fps=%u/%u ", p->i_fps_num, p->i_fps_den );
        s += sprintf( s, "timebase=%u/%u ", p->i_timebase_num, p->i_timebase_den );
        s += sprintf( s, "bitdepth=%d ", BIT_DEPTH );
    }

    s += sprintf( s, "cabac=%d", p->b_cabac );
    s += sprintf( s, " ref=%d", p->i_frame_reference );
    s += sprintf( s, " deblock=%d:%d:%d", p->b_deblocking_filter,
                  p->i_deblocking_filter_alphac0, p->i_deblocking_filter_beta );
    s += sprintf( s, " analyse=%#x:%#x", p->analyse.intra, p->analyse.inter );
    s += sprintf( s, " me=%s", x264_motion_est_names[ p->analyse.i_me_method ] );
    s += sprintf( s, " subme=%d", p->analyse.i_subpel_refine );
    s += sprintf( s, " psy=%d", p->analyse.b_psy );
    if( p->analyse.b_psy )
        s += sprintf( s, " psy_rd=%.2f:%.2f", p->analyse.f_psy_rd, p->analyse.f_psy_trellis );
    s += sprintf( s, " mixed_ref=%d", p->analyse.b_mixed_references );
    s += sprintf( s, " me_range=%d", p->analyse.i_me_range );
    s += sprintf( s, " chroma_me=%d", p->analyse.b_chroma_me );
    s += sprintf( s, " trellis=%d", p->analyse.i_trellis );
    s += sprintf( s, " 8x8dct=%d", p->analyse.b_transform_8x8 );
    s += sprintf( s, " cqm=%d", p->i_cqm_preset );
    s += sprintf( s, " deadzone=%d,%d", p->analyse.i_luma_deadzone[0], p->analyse.i_luma_deadzone[1] );
    s += sprintf( s, " fast_pskip=%d", p->analyse.b_fast_pskip );
    s += sprintf( s, " chroma_qp_offset=%d", p->analyse.i_chroma_qp_offset );
    s += sprintf( s, " threads=%d", p->i_threads );
    s += sprintf( s, " sliced_threads=%d", p->b_sliced_threads );
    if( p->i_slice_count )
        s += sprintf( s, " slices=%d", p->i_slice_count );
    if( p->i_slice_max_size )
        s += sprintf( s, " slice_max_size=%d", p->i_slice_max_size );
    if( p->i_slice_max_mbs )
        s += sprintf( s, " slice_max_mbs=%d", p->i_slice_max_mbs );
    s += sprintf( s, " nr=%d", p->analyse.i_noise_reduction );
    s += sprintf( s, " decimate=%d", p->analyse.b_dct_decimate );
    s += sprintf( s, " interlaced=%s", p->b_interlaced ? p->b_tff ? "tff" : "bff" : p->b_fake_interlaced ? "fake" : "0" );
    s += sprintf( s, " bluray_compat=%d", p->b_bluray_compat );

    s += sprintf( s, " constrained_intra=%d", p->b_constrained_intra );

    s += sprintf( s, " bframes=%d", p->i_bframe );
    if( p->i_bframe )
    {
        s += sprintf( s, " b_pyramid=%d b_adapt=%d b_bias=%d direct=%d weightb=%d open_gop=%d",
                      p->i_bframe_pyramid, p->i_bframe_adaptive, p->i_bframe_bias,
                      p->analyse.i_direct_mv_pred, p->analyse.b_weighted_bipred, p->b_open_gop );
    }
    s += sprintf( s, " weightp=%d", p->analyse.i_weighted_pred > 0 ? p->analyse.i_weighted_pred : 0 );

    if( p->i_keyint_max == X264_KEYINT_MAX_INFINITE )
        s += sprintf( s, " keyint=infinite" );
    else
        s += sprintf( s, " keyint=%d", p->i_keyint_max );
    s += sprintf( s, " keyint_min=%d scenecut=%d intra_refresh=%d",
                  p->i_keyint_min, p->i_scenecut_threshold, p->b_intra_refresh );

    if( p->rc.b_mb_tree || p->rc.i_vbv_buffer_size )
        s += sprintf( s, " rc_lookahead=%d", p->rc.i_lookahead );

    s += sprintf( s, " rc=%s mbtree=%d", p->rc.i_rc_method == X264_RC_ABR ?
                               ( p->rc.b_stat_read ? "2pass" : p->rc.i_vbv_max_bitrate == p->rc.i_bitrate ? "cbr" : "abr" )
                               : p->rc.i_rc_method == X264_RC_CRF ? "crf" : "cqp", p->rc.b_mb_tree );
    if( p->rc.i_rc_method == X264_RC_ABR || p->rc.i_rc_method == X264_RC_CRF )
    {
        if( p->rc.i_rc_method == X264_RC_CRF )
            s += sprintf( s, " crf=%.1f", p->rc.f_rf_constant );
        else
            s += sprintf( s, " bitrate=%d ratetol=%.1f",
                          p->rc.i_bitrate, p->rc.f_rate_tolerance );
        s += sprintf( s, " qcomp=%.2f qpmin=%d qpmax=%d qpstep=%d",
                      p->rc.f_qcompress, p->rc.i_qp_min, p->rc.i_qp_max, p->rc.i_qp_step );
        if( p->rc.b_stat_read )
            s += sprintf( s, " cplxblur=%.1f qblur=%.1f",
                          p->rc.f_complexity_blur, p->rc.f_qblur );
        if( p->rc.i_vbv_buffer_size )
        {
            s += sprintf( s, " vbv_maxrate=%d vbv_bufsize=%d",
                          p->rc.i_vbv_max_bitrate, p->rc.i_vbv_buffer_size );
            if( p->rc.i_rc_method == X264_RC_CRF )
                s += sprintf( s, " crf_max=%.1f", p->rc.f_rf_constant_max );
        }
    }
    else if( p->rc.i_rc_method == X264_RC_CQP )
        s += sprintf( s, " qp=%d", p->rc.i_qp_constant );

    if( p->rc.i_vbv_buffer_size )
        s += sprintf( s, " nal_hrd=%s", x264_nal_hrd_names[p->i_nal_hrd] );
    if( p->crop_rect.i_left | p->crop_rect.i_top | p->crop_rect.i_right | p->crop_rect.i_bottom )
        s += sprintf( s, " crop_rect=%u,%u,%u,%u", p->crop_rect.i_left, p->crop_rect.i_top,
                                                   p->crop_rect.i_right, p->crop_rect.i_bottom );
    if( p->i_frame_packing >= 0 )
        s += sprintf( s, " frame-packing=%d", p->i_frame_packing );

    if( !(p->rc.i_rc_method == X264_RC_CQP && p->rc.i_qp_constant == 0) )
    {
        s += sprintf( s, " ip_ratio=%.2f", p->rc.f_ip_factor );
        if( p->i_bframe && !p->rc.b_mb_tree )
            s += sprintf( s, " pb_ratio=%.2f", p->rc.f_pb_factor );
        s += sprintf( s, " aq=%d", p->rc.i_aq_mode );
        if( p->rc.i_aq_mode )
            s += sprintf( s, ":%.2f", p->rc.f_aq_strength );
        if( p->rc.psz_zones )
            s += sprintf( s, " zones=%s", p->rc.psz_zones );
        else if( p->rc.i_zones )
            s += sprintf( s, " zones" );
    }

    return buf;
}

x264-snapshot-20120103-2245-stable/common/cabac.h
/*****************************************************************************
 * cabac.h: arithmetic coder
 *****************************************************************************
 * Copyright (C) 2003-2011 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *          Laurent Aimar <fenrir@via.ecp.fr>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#ifndef X264_CABAC_H
#define X264_CABAC_H

typedef struct
{
    /* state */
    int i_low;
    int i_range;

    /* bit stream */
    int i_queue; //stored with an offset of -8 for faster asm
    int i_bytes_outstanding;

    uint8_t *p_start;
    uint8_t *p;
    uint8_t *p_end;

    /* aligned for memcpy_aligned starting here */
    ALIGNED_16( int f8_bits_encoded ); // only if using x264_cabac_size_decision()

    /* context */
    uint8_t state[1024];

    /* for 16-byte alignment */
    uint8_t padding[12];
} x264_cabac_t;

extern const uint8_t x264_cabac_transition[128][2];
extern const uint16_t x264_cabac_entropy[128];

/* init the contexts given i_slice_type, the quantizer and the model */
void x264_cabac_context_init( x264_t *h, x264_cabac_t *cb, int i_slice_type, int i_qp, int i_model );

void x264_cabac_encode_init_core( x264_cabac_t *cb );
void x264_cabac_encode_init ( x264_cabac_t *cb, uint8_t *p_data, uint8_t *p_end );
void x264_cabac_encode_decision_c( x264_cabac_t *cb, int i_ctx, int b );
void x264_cabac_encode_decision_asm( x264_cabac_t *cb, int i_ctx, int b );
void x264_cabac_encode_bypass_c( x264_cabac_t *cb, int b );
void x264_cabac_encode_bypass_asm( x264_cabac_t *cb, int b );
void x264_cabac_encode_terminal_c( x264_cabac_t *cb );
void x264_cabac_encode_terminal_asm( x264_cabac_t *cb );
void x264_cabac_encode_ue_bypass( x264_cabac_t *cb, int exp_bits, int val );
void x264_cabac_encode_flush( x264_t *h, x264_cabac_t *cb );

#if HAVE_MMX
#define x264_cabac_encode_decision x264_cabac_encode_decision_asm
#define x264_cabac_encode_bypass x264_cabac_encode_bypass_asm
#define x264_cabac_encode_terminal x264_cabac_encode_terminal_asm
#else
#define x264_cabac_encode_decision x264_cabac_encode_decision_c
#define x264_cabac_encode_bypass x264_cabac_encode_bypass_c
#define x264_cabac_encode_terminal x264_cabac_encode_terminal_c
#endif
#define x264_cabac_encode_decision_noup x264_cabac_encode_decision

static ALWAYS_INLINE int x264_cabac_pos( x264_cabac_t *cb )
{
    return (cb->p - cb->p_start + cb->i_bytes_outstanding) * 8 + cb->i_queue;
}

/* internal only: these don't write the bitstream, they just calculate bit cost */

static ALWAYS_INLINE void x264_cabac_size_decision( x264_cabac_t *cb, long i_ctx, long b )
{
    int i_state = cb->state[i_ctx];
    cb->state[i_ctx] = x264_cabac_transition[i_state][b];
    cb->f8_bits_encoded += x264_cabac_entropy[i_state^b];
}

static ALWAYS_INLINE int x264_cabac_size_decision2( uint8_t *state, long b )
{
    int i_state = *state;
    *state = x264_cabac_transition[i_state][b];
    return x264_cabac_entropy[i_state^b];
}

static ALWAYS_INLINE void x264_cabac_size_decision_noup( x264_cabac_t *cb, long i_ctx, long b )
{
    int i_state = cb->state[i_ctx];
    cb->f8_bits_encoded += x264_cabac_entropy[i_state^b];
}

static ALWAYS_INLINE int x264_cabac_size_decision_noup2( uint8_t *state, long b )
{
    return x264_cabac_entropy[*state^b];
}
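
/* Sketch, not in the original source: the size functions accumulate costs in
 * 1/256th-bit fixed point (hence the "f8" naming), so rate-distortion code
 * can compare fractional bit costs with integer arithmetic. */
#if 0
static double example_bit_cost( uint8_t *state )
{
    int cost_f8 = x264_cabac_size_decision2( state, 1 ); /* updates *state */
    return cost_f8 / 256.0;                              /* cost in bits */
}
#endif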

#endif
x264-snapshot-20120103-2245-stable/common/cabac.c
/*****************************************************************************
 * cabac.c: arithmetic coder
 *****************************************************************************
 * Copyright (C) 2003-2011 x264 project
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
 *          Jason Garrett-Glaser <darkshikari@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "common.h"


static const int8_t x264_cabac_context_init_I[1024][2] =
{
    /* 0 - 10 */
    { 20, -15 }, {  2, 54 },  {  3,  74 }, { 20, -15 },
    {  2,  54 }, {  3, 74 },  { -28,127 }, { -23, 104 },
    { -6,  53 }, { -1, 54 },  {  7,  51 },

    /* 11 - 23 unused for I */
    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
    { 0, 0 },

    /* 24 - 39 */
    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },

    /* 40 - 53 */
    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
    { 0, 0 },    { 0, 0 },

    /* 54 - 59 */
    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
    { 0, 0 },    { 0, 0 },

    /* 60 - 69 */
    { 0, 41 },   { 0, 63 },   { 0, 63 },     { 0, 63 },
    { -9, 83 },  { 4, 86 },   { 0, 97 },     { -7, 72 },
    { 13, 41 },  { 3, 62 },

    /* 70 -> 87 */
    { 0, 11 },   { 1, 55 },   { 0, 69 },     { -17, 127 },
    { -13, 102 },{ 0, 82 },   { -7, 74 },    { -21, 107 },
    { -27, 127 },{ -31, 127 },{ -24, 127 },  { -18, 95 },
    { -27, 127 },{ -21, 114 },{ -30, 127 },  { -17, 123 },
    { -12, 115 },{ -16, 122 },

    /* 88 -> 104 */
    { -11, 115 },{ -12, 63 }, { -2, 68 },    { -15, 84 },
    { -13, 104 },{ -3, 70 },  { -8, 93 },    { -10, 90 },
    { -30, 127 },{ -1, 74 },  { -6, 97 },    { -7, 91 },
    { -20, 127 },{ -4, 56 },  { -5, 82 },    { -7, 76 },
    { -22, 125 },

    /* 105 -> 135 */
    { -7, 93 },  { -11, 87 }, { -3, 77 },    { -5, 71 },
    { -4, 63 },  { -4, 68 },  { -12, 84 },   { -7, 62 },
    { -7, 65 },  { 8, 61 },   { 5, 56 },     { -2, 66 },
    { 1, 64 },   { 0, 61 },   { -2, 78 },    { 1, 50 },
    { 7, 52 },   { 10, 35 },  { 0, 44 },     { 11, 38 },
    { 1, 45 },   { 0, 46 },   { 5, 44 },     { 31, 17 },
    { 1, 51 },   { 7, 50 },   { 28, 19 },    { 16, 33 },
    { 14, 62 },  { -13, 108 },{ -15, 100 },

    /* 136 -> 165 */
    { -13, 101 },{ -13, 91 }, { -12, 94 },   { -10, 88 },
    { -16, 84 }, { -10, 86 }, { -7, 83 },    { -13, 87 },
    { -19, 94 }, { 1, 70 },   { 0, 72 },     { -5, 74 },
    { 18, 59 },  { -8, 102 }, { -15, 100 },  { 0, 95 },
    { -4, 75 },  { 2, 72 },   { -11, 75 },   { -3, 71 },
    { 15, 46 },  { -13, 69 }, { 0, 62 },     { 0, 65 },
    { 21, 37 },  { -15, 72 }, { 9, 57 },     { 16, 54 },
    { 0, 62 },   { 12, 72 },

    /* 166 -> 196 */
    { 24, 0 },   { 15, 9 },   { 8, 25 },     { 13, 18 },
    { 15, 9 },   { 13, 19 },  { 10, 37 },    { 12, 18 },
    { 6, 29 },   { 20, 33 },  { 15, 30 },    { 4, 45 },
    { 1, 58 },   { 0, 62 },   { 7, 61 },     { 12, 38 },
    { 11, 45 },  { 15, 39 },  { 11, 42 },    { 13, 44 },
    { 16, 45 },  { 12, 41 },  { 10, 49 },    { 30, 34 },
    { 18, 42 },  { 10, 55 },  { 17, 51 },    { 17, 46 },
    { 0, 89 },   { 26, -19 }, { 22, -17 },

    /* 197 -> 226 */
    { 26, -17 }, { 30, -25 }, { 28, -20 },   { 33, -23 },
    { 37, -27 }, { 33, -23 }, { 40, -28 },   { 38, -17 },
    { 33, -11 }, { 40, -15 }, { 41, -6 },    { 38, 1 },
    { 41, 17 },  { 30, -6 },  { 27, 3 },     { 26, 22 },
    { 37, -16 }, { 35, -4 },  { 38, -8 },    { 38, -3 },
    { 37, 3 },   { 38, 5 },   { 42, 0 },     { 35, 16 },
    { 39, 22 },  { 14, 48 },  { 27, 37 },    { 21, 60 },
    { 12, 68 },  { 2, 97 },

    /* 227 -> 251 */
    { -3, 71 },  { -6, 42 },  { -5, 50 },    { -3, 54 },
    { -2, 62 },  { 0, 58 },   { 1, 63 },     { -2, 72 },
    { -1, 74 },  { -9, 91 },  { -5, 67 },    { -5, 27 },
    { -3, 39 },  { -2, 44 },  { 0, 46 },     { -16, 64 },
    { -8, 68 },  { -10, 78 }, { -6, 77 },    { -10, 86 },
    { -12, 92 }, { -15, 55 }, { -10, 60 },   { -6, 62 },
    { -4, 65 },

    /* 252 -> 275 */
    { -12, 73 }, { -8, 76 },  { -7, 80 },    { -9, 88 },
    { -17, 110 },{ -11, 97 }, { -20, 84 },   { -11, 79 },
    { -6, 73 },  { -4, 74 },  { -13, 86 },   { -13, 96 },
    { -11, 97 }, { -19, 117 },{ -8, 78 },    { -5, 33 },
    { -4, 48 },  { -2, 53 },  { -3, 62 },    { -13, 71 },
    { -10, 79 }, { -12, 86 }, { -13, 90 },   { -14, 97 },

    /* 276 a bit special (not used, x264_cabac_encode_bypass is used instead) */
    { 0, 0 },

    /* 277 -> 307 */
    { -6, 93 },  { -6, 84 },  { -8, 79 },    { 0, 66 },
    { -1, 71 },  { 0, 62 },   { -2, 60 },    { -2, 59 },
    { -5, 75 },  { -3, 62 },  { -4, 58 },    { -9, 66 },
    { -1, 79 },  { 0, 71 },   { 3, 68 },     { 10, 44 },
    { -7, 62 },  { 15, 36 },  { 14, 40 },    { 16, 27 },
    { 12, 29 },  { 1, 44 },   { 20, 36 },    { 18, 32 },
    { 5, 42 },   { 1, 48 },   { 10, 62 },    { 17, 46 },
    { 9, 64 },   { -12, 104 },{ -11, 97 },

    /* 308 -> 337 */
    { -16, 96 }, { -7, 88 },  { -8, 85 },    { -7, 85 },
    { -9, 85 },  { -13, 88 }, { 4, 66 },     { -3, 77 },
    { -3, 76 },  { -6, 76 },  { 10, 58 },    { -1, 76 },
    { -1, 83 },  { -7, 99 },  { -14, 95 },   { 2, 95 },
    { 0, 76 },   { -5, 74 },  { 0, 70 },     { -11, 75 },
    { 1, 68 },   { 0, 65 },   { -14, 73 },   { 3, 62 },
    { 4, 62 },   { -1, 68 },  { -13, 75 },   { 11, 55 },
    { 5, 64 },   { 12, 70 },

    /* 338 -> 368 */
    { 15, 6 },   { 6, 19 },   { 7, 16 },     { 12, 14 },
    { 18, 13 },  { 13, 11 },  { 13, 15 },    { 15, 16 },
    { 12, 23 },  { 13, 23 },  { 15, 20 },    { 14, 26 },
    { 14, 44 },  { 17, 40 },  { 17, 47 },    { 24, 17 },
    { 21, 21 },  { 25, 22 },  { 31, 27 },    { 22, 29 },
    { 19, 35 },  { 14, 50 },  { 10, 57 },    { 7, 63 },
    { -2, 77 },  { -4, 82 },  { -3, 94 },    { 9, 69 },
    { -12, 109 },{ 36, -35 }, { 36, -34 },

    /* 369 -> 398 */
    { 32, -26 }, { 37, -30 }, { 44, -32 },   { 34, -18 },
    { 34, -15 }, { 40, -15 }, { 33, -7 },    { 35, -5 },
    { 33, 0 },   { 38, 2 },   { 33, 13 },    { 23, 35 },
    { 13, 58 },  { 29, -3 },  { 26, 0 },     { 22, 30 },
    { 31, -7 },  { 35, -15 }, { 34, -3 },    { 34, 3 },
    { 36, -1 },  { 34, 5 },   { 32, 11 },    { 35, 5 },
    { 34, 12 },  { 39, 11 },  { 30, 29 },    { 34, 26 },
    { 29, 39 },  { 19, 66 },

    /* 399 -> 435 */
    {  31,  21 }, {  31,  31 }, {  25,  50 },
    { -17, 120 }, { -20, 112 }, { -18, 114 }, { -11,  85 },
    { -15,  92 }, { -14,  89 }, { -26,  71 }, { -15,  81 },
    { -14,  80 }, {   0,  68 }, { -14,  70 }, { -24,  56 },
    { -23,  68 }, { -24,  50 }, { -11,  74 }, {  23, -13 },
    {  26, -13 }, {  40, -15 }, {  49, -14 }, {  44,   3 },
    {  45,   6 }, {  44,  34 }, {  33,  54 }, {  19,  82 },
    {  -3,  75 }, {  -1,  23 }, {   1,  34 }, {   1,  43 },
    {   0,  54 }, {  -2,  55 }, {   0,  61 }, {   1,  64 },
    {   0,  68 }, {  -9,  92 },

    /* 436 -> 459 */
    { -14, 106 }, { -13,  97 }, { -15,  90 }, { -12,  90 },
    { -18,  88 }, { -10,  73 }, {  -9,  79 }, { -14,  86 },
    { -10,  73 }, { -10,  70 }, { -10,  69 }, {  -5,  66 },
    {  -9,  64 }, {  -5,  58 }, {   2,  59 }, {  21, -10 },
    {  24, -11 }, {  28,  -8 }, {  28,  -1 }, {  29,   3 },
    {  29,   9 }, {  35,  20 }, {  29,  36 }, {  14,  67 },

    /* 460 -> 1024 */
    { -17, 123 }, { -12, 115 }, { -16, 122 }, { -11, 115 },
    { -12,  63 }, {  -2,  68 }, { -15,  84 }, { -13, 104 },
    {  -3,  70 }, {  -8,  93 }, { -10,  90 }, { -30, 127 },
    { -17, 123 }, { -12, 115 }, { -16, 122 }, { -11, 115 },
    { -12,  63 }, {  -2,  68 }, { -15,  84 }, { -13, 104 },
    {  -3,  70 }, {  -8,  93 }, { -10,  90 }, { -30, 127 },
    {  -7,  93 }, { -11,  87 }, {  -3,  77 }, {  -5,  71 },
    {  -4,  63 }, {  -4,  68 }, { -12,  84 }, {  -7,  62 },
    {  -7,  65 }, {   8,  61 }, {   5,  56 }, {  -2,  66 },
    {   1,  64 }, {   0,  61 }, {  -2,  78 }, {   1,  50 },
    {   7,  52 }, {  10,  35 }, {   0,  44 }, {  11,  38 },
    {   1,  45 }, {   0,  46 }, {   5,  44 }, {  31,  17 },
    {   1,  51 }, {   7,  50 }, {  28,  19 }, {  16,  33 },
    {  14,  62 }, { -13, 108 }, { -15, 100 }, { -13, 101 },
    { -13,  91 }, { -12,  94 }, { -10,  88 }, { -16,  84 },
    { -10,  86 }, {  -7,  83 }, { -13,  87 }, { -19,  94 },
    {   1,  70 }, {   0,  72 }, {  -5,  74 }, {  18,  59 },
    {  -7,  93 }, { -11,  87 }, {  -3,  77 }, {  -5,  71 },
    {  -4,  63 }, {  -4,  68 }, { -12,  84 }, {  -7,  62 },
    {  -7,  65 }, {   8,  61 }, {   5,  56 }, {  -2,  66 },
    {   1,  64 }, {   0,  61 }, {  -2,  78 }, {   1,  50 },
    {   7,  52 }, {  10,  35 }, {   0,  44 }, {  11,  38 },
    {   1,  45 }, {   0,  46 }, {   5,  44 }, {  31,  17 },
    {   1,  51 }, {   7,  50 }, {  28,  19 }, {  16,  33 },
    {  14,  62 }, { -13, 108 }, { -15, 100 }, { -13, 101 },
    { -13,  91 }, { -12,  94 }, { -10,  88 }, { -16,  84 },
    { -10,  86 }, {  -7,  83 }, { -13,  87 }, { -19,  94 },
    {   1,  70 }, {   0,  72 }, {  -5,  74 }, {  18,  59 },
    {  24,   0 }, {  15,   9 }, {   8,  25 }, {  13,  18 },
    {  15,   9 }, {  13,  19 }, {  10,  37 }, {  12,  18 },
    {   6,  29 }, {  20,  33 }, {  15,  30 }, {   4,  45 },
    {   1,  58 }, {   0,  62 }, {   7,  61 }, {  12,  38 },
    {  11,  45 }, {  15,  39 }, {  11,  42 }, {  13,  44 },
    {  16,  45 }, {  12,  41 }, {  10,  49 }, {  30,  34 },
    {  18,  42 }, {  10,  55 }, {  17,  51 }, {  17,  46 },
    {   0,  89 }, {  26, -19 }, {  22, -17 }, {  26, -17 },
    {  30, -25 }, {  28, -20 }, {  33, -23 }, {  37, -27 },
    {  33, -23 }, {  40, -28 }, {  38, -17 }, {  33, -11 },
    {  40, -15 }, {  41,  -6 }, {  38,   1 }, {  41,  17 },
    {  24,   0 }, {  15,   9 }, {   8,  25 }, {  13,  18 },
    {  15,   9 }, {  13,  19 }, {  10,  37 }, {  12,  18 },
    {   6,  29 }, {  20,  33 }, {  15,  30 }, {   4,  45 },
    {   1,  58 }, {   0,  62 }, {   7,  61 }, {  12,  38 },
    {  11,  45 }, {  15,  39 }, {  11,  42 }, {  13,  44 },
    {  16,  45 }, {  12,  41 }, {  10,  49 }, {  30,  34 },
    {  18,  42 }, {  10,  55 }, {  17,  51 }, {  17,  46 },
    {   0,  89 }, {  26, -19 }, {  22, -17 }, {  26, -17 },
    {  30, -25 }, {  28, -20 }, {  33, -23 }, {  37, -27 },
    {  33, -23 }, {  40, -28 }, {  38, -17 }, {  33, -11 },
    {  40, -15 }, {  41,  -6 }, {  38,   1 }, {  41,  17 },
    { -17, 120 }, { -20, 112 }, { -18, 114 }, { -11,  85 },
    { -15,  92 }, { -14,  89 }, { -26,  71 }, { -15,  81 },
    { -14,  80 }, {   0,  68 }, { -14,  70 }, { -24,  56 },
    { -23,  68 }, { -24,  50 }, { -11,  74 }, { -14, 106 },
    { -13,  97 }, { -15,  90 }, { -12,  90 }, { -18,  88 },
    { -10,  73 }, {  -9,  79 }, { -14,  86 }, { -10,  73 },
    { -10,  70 }, { -10,  69 }, {  -5,  66 }, {  -9,  64 },
    {  -5,  58 }, {   2,  59 }, {  23, -13 }, {  26, -13 },
    {  40, -15 }, {  49, -14 }, {  44,   3 }, {  45,   6 },
    {  44,  34 }, {  33,  54 }, {  19,  82 }, {  21, -10 },
    {  24, -11 }, {  28,  -8 }, {  28,  -1 }, {  29,   3 },
    {  29,   9 }, {  35,  20 }, {  29,  36 }, {  14,  67 },
    {  -3,  75 }, {  -1,  23 }, {   1,  34 }, {   1,  43 },
    {   0,  54 }, {  -2,  55 }, {   0,  61 }, {   1,  64 },
    {   0,  68 }, {  -9,  92 }, { -17, 120 }, { -20, 112 },
    { -18, 114 }, { -11,  85 }, { -15,  92 }, { -14,  89 },
    { -26,  71 }, { -15,  81 }, { -14,  80 }, {   0,  68 },
    { -14,  70 }, { -24,  56 }, { -23,  68 }, { -24,  50 },
    { -11,  74 }, { -14, 106 }, { -13,  97 }, { -15,  90 },
    { -12,  90 }, { -18,  88 }, { -10,  73 }, {  -9,  79 },
    { -14,  86 }, { -10,  73 }, { -10,  70 }, { -10,  69 },
    {  -5,  66 }, {  -9,  64 }, {  -5,  58 }, {   2,  59 },
    {  23, -13 }, {  26, -13 }, {  40, -15 }, {  49, -14 },
    {  44,   3 }, {  45,   6 }, {  44,  34 }, {  33,  54 },
    {  19,  82 }, {  21, -10 }, {  24, -11 }, {  28,  -8 },
    {  28,  -1 }, {  29,   3 }, {  29,   9 }, {  35,  20 },
    {  29,  36 }, {  14,  67 }, {  -3,  75 }, {  -1,  23 },
    {   1,  34 }, {   1,  43 }, {   0,  54 }, {  -2,  55 },
    {   0,  61 }, {   1,  64 }, {   0,  68 }, {  -9,  92 },
    {  -6,  93 }, {  -6,  84 }, {  -8,  79 }, {   0,  66 },
    {  -1,  71 }, {   0,  62 }, {  -2,  60 }, {  -2,  59 },
    {  -5,  75 }, {  -3,  62 }, {  -4,  58 }, {  -9,  66 },
    {  -1,  79 }, {   0,  71 }, {   3,  68 }, {  10,  44 },
    {  -7,  62 }, {  15,  36 }, {  14,  40 }, {  16,  27 },
    {  12,  29 }, {   1,  44 }, {  20,  36 }, {  18,  32 },
    {   5,  42 }, {   1,  48 }, {  10,  62 }, {  17,  46 },
    {   9,  64 }, { -12, 104 }, { -11,  97 }, { -16,  96 },
    {  -7,  88 }, {  -8,  85 }, {  -7,  85 }, {  -9,  85 },
    { -13,  88 }, {   4,  66 }, {  -3,  77 }, {  -3,  76 },
    {  -6,  76 }, {  10,  58 }, {  -1,  76 }, {  -1,  83 },
    {  -6,  93 }, {  -6,  84 }, {  -8,  79 }, {   0,  66 },
    {  -1,  71 }, {   0,  62 }, {  -2,  60 }, {  -2,  59 },
    {  -5,  75 }, {  -3,  62 }, {  -4,  58 }, {  -9,  66 },
    {  -1,  79 }, {   0,  71 }, {   3,  68 }, {  10,  44 },
    {  -7,  62 }, {  15,  36 }, {  14,  40 }, {  16,  27 },
    {  12,  29 }, {   1,  44 }, {  20,  36 }, {  18,  32 },
    {   5,  42 }, {   1,  48 }, {  10,  62 }, {  17,  46 },
    {   9,  64 }, { -12, 104 }, { -11,  97 }, { -16,  96 },
    {  -7,  88 }, {  -8,  85 }, {  -7,  85 }, {  -9,  85 },
    { -13,  88 }, {   4,  66 }, {  -3,  77 }, {  -3,  76 },
    {  -6,  76 }, {  10,  58 }, {  -1,  76 }, {  -1,  83 },
    {  15,   6 }, {   6,  19 }, {   7,  16 }, {  12,  14 },
    {  18,  13 }, {  13,  11 }, {  13,  15 }, {  15,  16 },
    {  12,  23 }, {  13,  23 }, {  15,  20 }, {  14,  26 },
    {  14,  44 }, {  17,  40 }, {  17,  47 }, {  24,  17 },
    {  21,  21 }, {  25,  22 }, {  31,  27 }, {  22,  29 },
    {  19,  35 }, {  14,  50 }, {  10,  57 }, {   7,  63 },
    {  -2,  77 }, {  -4,  82 }, {  -3,  94 }, {   9,  69 },
    { -12, 109 }, {  36, -35 }, {  36, -34 }, {  32, -26 },
    {  37, -30 }, {  44, -32 }, {  34, -18 }, {  34, -15 },
    {  40, -15 }, {  33,  -7 }, {  35,  -5 }, {  33,   0 },
    {  38,   2 }, {  33,  13 }, {  23,  35 }, {  13,  58 },
    {  15,   6 }, {   6,  19 }, {   7,  16 }, {  12,  14 },
    {  18,  13 }, {  13,  11 }, {  13,  15 }, {  15,  16 },
    {  12,  23 }, {  13,  23 }, {  15,  20 }, {  14,  26 },
    {  14,  44 }, {  17,  40 }, {  17,  47 }, {  24,  17 },
    {  21,  21 }, {  25,  22 }, {  31,  27 }, {  22,  29 },
    {  19,  35 }, {  14,  50 }, {  10,  57 }, {   7,  63 },
    {  -2,  77 }, {  -4,  82 }, {  -3,  94 }, {   9,  69 },
    { -12, 109 }, {  36, -35 }, {  36, -34 }, {  32, -26 },
    {  37, -30 }, {  44, -32 }, {  34, -18 }, {  34, -15 },
    {  40, -15 }, {  33,  -7 }, {  35,  -5 }, {  33,   0 },
    {  38,   2 }, {  33,  13 }, {  23,  35 }, {  13,  58 },
    {  -3,  71 }, {  -6,  42 }, {  -5,  50 }, {  -3,  54 },
    {  -2,  62 }, {   0,  58 }, {   1,  63 }, {  -2,  72 },
    {  -1,  74 }, {  -9,  91 }, {  -5,  67 }, {  -5,  27 },
    {  -3,  39 }, {  -2,  44 }, {   0,  46 }, { -16,  64 },
    {  -8,  68 }, { -10,  78 }, {  -6,  77 }, { -10,  86 },
    { -12,  92 }, { -15,  55 }, { -10,  60 }, {  -6,  62 },
    {  -4,  65 }, { -12,  73 }, {  -8,  76 }, {  -7,  80 },
    {  -9,  88 }, { -17, 110 }, {  -3,  71 }, {  -6,  42 },
    {  -5,  50 }, {  -3,  54 }, {  -2,  62 }, {   0,  58 },
    {   1,  63 }, {  -2,  72 }, {  -1,  74 }, {  -9,  91 },
    {  -5,  67 }, {  -5,  27 }, {  -3,  39 }, {  -2,  44 },
    {   0,  46 }, { -16,  64 }, {  -8,  68 }, { -10,  78 },
    {  -6,  77 }, { -10,  86 }, { -12,  92 }, { -15,  55 },
    { -10,  60 }, {  -6,  62 }, {  -4,  65 }, { -12,  73 },
    {  -8,  76 }, {  -7,  80 }, {  -9,  88 }, { -17, 110 },
    {  -3,  70 }, {  -8,  93 }, { -10,  90 }, { -30, 127 },
    {  -3,  70 }, {  -8,  93 }, { -10,  90 }, { -30, 127 },
    {  -3,  70 }, {  -8,  93 }, { -10,  90 }, { -30, 127 }
};

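/* Each entry below, like those in the table above, is an { m, n } pair from
 * H.264 spec 9.3.1.1: a linear model mapping the slice QP to an initial
 * context state, preCtxState = Clip3( 1, 126, ( ( m * Clip3( 0, 51, QP ) ) >> 4 ) + n ).
 * The three sub-tables correspond to cabac_init_idc 0/1/2, selected per P/B
 * slice in the slice header; see the illustrative sketch after this table. */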
static const int8_t x264_cabac_context_init_PB[3][1024][2] =
{
    /* i_cabac_init_idc == 0 */
    {
        /* 0 - 10 */
        {  20, -15 }, {   2,  54 }, {   3,  74 }, {  20, -15 },
        {   2,  54 }, {   3,  74 }, { -28, 127 }, { -23, 104 },
        {  -6,  53 }, {  -1,  54 }, {   7,  51 },

        /* 11 - 23 */
        {  23,  33 }, {  23,   2 }, {  21,   0 }, {   1,   9 },
        {   0,  49 }, { -37, 118 }, {   5,  57 }, { -13,  78 },
        { -11,  65 }, {   1,  62 }, {  12,  49 }, {  -4,  73 },
        {  17,  50 },

        /* 24 - 39 */
        {  18,  64 }, {   9,  43 }, {  29,   0 }, {  26,  67 },
        {  16,  90 }, {   9, 104 }, { -46, 127 }, { -20, 104 },
        {   1,  67 }, { -13,  78 }, { -11,  65 }, {   1,  62 },
        {  -6,  86 }, { -17,  95 }, {  -6,  61 }, {   9,  45 },

        /* 40 - 53 */
        {  -3,  69 }, {  -6,  81 }, { -11,  96 }, {   6,  55 },
        {   7,  67 }, {  -5,  86 }, {   2,  88 }, {   0,  58 },
        {  -3,  76 }, { -10,  94 }, {   5,  54 }, {   4,  69 },
        {  -3,  81 }, {   0,  88 },

        /* 54 - 59 */
        {  -7,  67 }, {  -5,  74 }, {  -4,  74 }, {  -5,  80 },
        {  -7,  72 }, {   1,  58 },

        /* 60 - 69 */
        {   0,  41 }, {   0,  63 }, {   0,  63 }, {   0,  63 },
        {  -9,  83 }, {   4,  86 }, {   0,  97 }, {  -7,  72 },
        {  13,  41 }, {   3,  62 },

        /* 70 - 104 */
        {   0,  45 }, {  -4,  78 }, {  -3,  96 }, { -27, 126 },
        { -28,  98 }, { -25, 101 }, { -23,  67 }, { -28,  82 },
        { -20,  94 }, { -16,  83 }, { -22, 110 }, { -21,  91 },
        { -18, 102 }, { -13,  93 }, { -29, 127 }, {  -7,  92 },
        {  -5,  89 }, {  -7,  96 }, { -13, 108 }, {  -3,  46 },
        {  -1,  65 }, {  -1,  57 }, {  -9,  93 }, {  -3,  74 },
        {  -9,  92 }, {  -8,  87 }, { -23, 126 }, {   5,  54 },
        {   6,  60 }, {   6,  59 }, {   6,  69 }, {  -1,  48 },
        {   0,  68 }, {  -4,  69 }, {  -8,  88 },

        /* 105 -> 165 */
        {  -2,  85 }, {  -6,  78 }, {  -1,  75 }, {  -7,  77 },
        {   2,  54 }, {   5,  50 }, {  -3,  68 }, {   1,  50 },
        {   6,  42 }, {  -4,  81 }, {   1,  63 }, {  -4,  70 },
        {   0,  67 }, {   2,  57 }, {  -2,  76 }, {  11,  35 },
        {   4,  64 }, {   1,  61 }, {  11,  35 }, {  18,  25 },
        {  12,  24 }, {  13,  29 }, {  13,  36 }, { -10,  93 },
        {  -7,  73 }, {  -2,  73 }, {  13,  46 }, {   9,  49 },
        {  -7, 100 }, {   9,  53 }, {   2,  53 }, {   5,  53 },
        {  -2,  61 }, {   0,  56 }, {   0,  56 }, { -13,  63 },
        {  -5,  60 }, {  -1,  62 }, {   4,  57 }, {  -6,  69 },
        {   4,  57 }, {  14,  39 }, {   4,  51 }, {  13,  68 },
        {   3,  64 }, {   1,  61 }, {   9,  63 }, {   7,  50 },
        {  16,  39 }, {   5,  44 }, {   4,  52 }, {  11,  48 },
        {  -5,  60 }, {  -1,  59 }, {   0,  59 }, {  22,  33 },
        {   5,  44 }, {  14,  43 }, {  -1,  78 }, {   0,  60 },
        {   9,  69 },

        /* 166 - 226 */
        {  11,  28 }, {   2,  40 }, {   3,  44 }, {   0,  49 },
        {   0,  46 }, {   2,  44 }, {   2,  51 }, {   0,  47 },
        {   4,  39 }, {   2,  62 }, {   6,  46 }, {   0,  54 },
        {   3,  54 }, {   2,  58 }, {   4,  63 }, {   6,  51 },
        {   6,  57 }, {   7,  53 }, {   6,  52 }, {   6,  55 },
        {  11,  45 }, {  14,  36 }, {   8,  53 }, {  -1,  82 },
        {   7,  55 }, {  -3,  78 }, {  15,  46 }, {  22,  31 },
        {  -1,  84 }, {  25,   7 }, {  30,  -7 }, {  28,   3 },
        {  28,   4 }, {  32,   0 }, {  34,  -1 }, {  30,   6 },
        {  30,   6 }, {  32,   9 }, {  31,  19 }, {  26,  27 },
        {  26,  30 }, {  37,  20 }, {  28,  34 }, {  17,  70 },
        {   1,  67 }, {   5,  59 }, {   9,  67 }, {  16,  30 },
        {  18,  32 }, {  18,  35 }, {  22,  29 }, {  24,  31 },
        {  23,  38 }, {  18,  43 }, {  20,  41 }, {  11,  63 },
        {   9,  59 }, {   9,  64 }, {  -1,  94 }, {  -2,  89 },
        {  -9, 108 },

        /* 227 - 275 */
        {  -6,  76 }, {  -2,  44 }, {   0,  45 }, {   0,  52 },
        {  -3,  64 }, {  -2,  59 }, {  -4,  70 }, {  -4,  75 },
        {  -8,  82 }, { -17, 102 }, {  -9,  77 }, {   3,  24 },
        {   0,  42 }, {   0,  48 }, {   0,  55 }, {  -6,  59 },
        {  -7,  71 }, { -12,  83 }, { -11,  87 }, { -30, 119 },
        {   1,  58 }, {  -3,  29 }, {  -1,  36 }, {   1,  38 },
        {   2,  43 }, {  -6,  55 }, {   0,  58 }, {   0,  64 },
        {  -3,  74 }, { -10,  90 }, {   0,  70 }, {  -4,  29 },
        {   5,  31 }, {   7,  42 }, {   1,  59 }, {  -2,  58 },
        {  -3,  72 }, {  -3,  81 }, { -11,  97 }, {   0,  58 },
        {   8,   5 }, {  10,  14 }, {  14,  18 }, {  13,  27 },
        {   2,  40 }, {   0,  58 }, {  -3,  70 }, {  -6,  79 },
        {  -8,  85 },

        /* 276 a bit special (not used, x264_cabac_encode_bypass is used instead) */
        { 0, 0 },

        /* 277 - 337 */
        { -13, 106 }, { -16, 106 }, { -10,  87 }, { -21, 114 },
        { -18, 110 }, { -14,  98 }, { -22, 110 }, { -21, 106 },
        { -18, 103 }, { -21, 107 }, { -23, 108 }, { -26, 112 },
        { -10,  96 }, { -12,  95 }, {  -5,  91 }, {  -9,  93 },
        { -22,  94 }, {  -5,  86 }, {   9,  67 }, {  -4,  80 },
        { -10,  85 }, {  -1,  70 }, {   7,  60 }, {   9,  58 },
        {   5,  61 }, {  12,  50 }, {  15,  50 }, {  18,  49 },
        {  17,  54 }, {  10,  41 }, {   7,  46 }, {  -1,  51 },
        {   7,  49 }, {   8,  52 }, {   9,  41 }, {   6,  47 },
        {   2,  55 }, {  13,  41 }, {  10,  44 }, {   6,  50 },
        {   5,  53 }, {  13,  49 }, {   4,  63 }, {   6,  64 },
        {  -2,  69 }, {  -2,  59 }, {   6,  70 }, {  10,  44 },
        {   9,  31 }, {  12,  43 }, {   3,  53 }, {  14,  34 },
        {  10,  38 }, {  -3,  52 }, {  13,  40 }, {  17,  32 },
        {   7,  44 }, {   7,  38 }, {  13,  50 }, {  10,  57 },
        {  26,  43 },

        /* 338 - 398 */
        {  14,  11 }, {  11,  14 }, {   9,  11 }, {  18,  11 },
        {  21,   9 }, {  23,  -2 }, {  32, -15 }, {  32, -15 },
        {  34, -21 }, {  39, -23 }, {  42, -33 }, {  41, -31 },
        {  46, -28 }, {  38, -12 }, {  21,  29 }, {  45, -24 },
        {  53, -45 }, {  48, -26 }, {  65, -43 }, {  43, -19 },
        {  39, -10 }, {  30,   9 }, {  18,  26 }, {  20,  27 },
        {   0,  57 }, { -14,  82 }, {  -5,  75 }, { -19,  97 },
        { -35, 125 }, {  27,   0 }, {  28,   0 }, {  31,  -4 },
        {  27,   6 }, {  34,   8 }, {  30,  10 }, {  24,  22 },
        {  33,  19 }, {  22,  32 }, {  26,  31 }, {  21,  41 },
        {  26,  44 }, {  23,  47 }, {  16,  65 }, {  14,  71 },
        {   8,  60 }, {   6,  63 }, {  17,  65 }, {  21,  24 },
        {  23,  20 }, {  26,  23 }, {  27,  32 }, {  28,  23 },
        {  28,  24 }, {  23,  40 }, {  24,  32 }, {  28,  29 },
        {  23,  42 }, {  19,  57 }, {  22,  53 }, {  22,  61 },
        {  11,  86 },

        /* 399 -> 435 */
        {  12,  40 }, {  11,  51 }, {  14,  59 },
        {  -4,  79 }, {  -7,  71 }, {  -5,  69 }, {  -9,  70 },
        {  -8,  66 }, { -10,  68 }, { -19,  73 }, { -12,  69 },
        { -16,  70 }, { -15,  67 }, { -20,  62 }, { -19,  70 },
        { -16,  66 }, { -22,  65 }, { -20,  63 }, {   9,  -2 },
        {  26,  -9 }, {  33,  -9 }, {  39,  -7 }, {  41,  -2 },
        {  45,   3 }, {  49,   9 }, {  45,  27 }, {  36,  59 },
        {  -6,  66 }, {  -7,  35 }, {  -7,  42 }, {  -8,  45 },
        {  -5,  48 }, { -12,  56 }, {  -6,  60 }, {  -5,  62 },
        {  -8,  66 }, {  -8,  76 },

        /* 436 -> 459 */
        {  -5,  85 }, {  -6,  81 }, { -10,  77 }, {  -7,  81 },
        { -17,  80 }, { -18,  73 }, {  -4,  74 }, { -10,  83 },
        {  -9,  71 }, {  -9,  67 }, {  -1,  61 }, {  -8,  66 },
        { -14,  66 }, {   0,  59 }, {   2,  59 }, {  21, -13 },
        {  33, -14 }, {  39,  -7 }, {  46,  -2 }, {  51,   2 },
        {  60,   6 }, {  61,  17 }, {  55,  34 }, {  42,  62 },

        /* 460 - 1024 */
        {  -7,  92 }, {  -5,  89 }, {  -7,  96 }, { -13, 108 },
        {  -3,  46 }, {  -1,  65 }, {  -1,  57 }, {  -9,  93 },
        {  -3,  74 }, {  -9,  92 }, {  -8,  87 }, { -23, 126 },
        {  -7,  92 }, {  -5,  89 }, {  -7,  96 }, { -13, 108 },
        {  -3,  46 }, {  -1,  65 }, {  -1,  57 }, {  -9,  93 },
        {  -3,  74 }, {  -9,  92 }, {  -8,  87 }, { -23, 126 },
        {  -2,  85 }, {  -6,  78 }, {  -1,  75 }, {  -7,  77 },
        {   2,  54 }, {   5,  50 }, {  -3,  68 }, {   1,  50 },
        {   6,  42 }, {  -4,  81 }, {   1,  63 }, {  -4,  70 },
        {   0,  67 }, {   2,  57 }, {  -2,  76 }, {  11,  35 },
        {   4,  64 }, {   1,  61 }, {  11,  35 }, {  18,  25 },
        {  12,  24 }, {  13,  29 }, {  13,  36 }, { -10,  93 },
        {  -7,  73 }, {  -2,  73 }, {  13,  46 }, {   9,  49 },
        {  -7, 100 }, {   9,  53 }, {   2,  53 }, {   5,  53 },
        {  -2,  61 }, {   0,  56 }, {   0,  56 }, { -13,  63 },
        {  -5,  60 }, {  -1,  62 }, {   4,  57 }, {  -6,  69 },
        {   4,  57 }, {  14,  39 }, {   4,  51 }, {  13,  68 },
        {  -2,  85 }, {  -6,  78 }, {  -1,  75 }, {  -7,  77 },
        {   2,  54 }, {   5,  50 }, {  -3,  68 }, {   1,  50 },
        {   6,  42 }, {  -4,  81 }, {   1,  63 }, {  -4,  70 },
        {   0,  67 }, {   2,  57 }, {  -2,  76 }, {  11,  35 },
        {   4,  64 }, {   1,  61 }, {  11,  35 }, {  18,  25 },
        {  12,  24 }, {  13,  29 }, {  13,  36 }, { -10,  93 },
        {  -7,  73 }, {  -2,  73 }, {  13,  46 }, {   9,  49 },
        {  -7, 100 }, {   9,  53 }, {   2,  53 }, {   5,  53 },
        {  -2,  61 }, {   0,  56 }, {   0,  56 }, { -13,  63 },
        {  -5,  60 }, {  -1,  62 }, {   4,  57 }, {  -6,  69 },
        {   4,  57 }, {  14,  39 }, {   4,  51 }, {  13,  68 },
        {  11,  28 }, {   2,  40 }, {   3,  44 }, {   0,  49 },
        {   0,  46 }, {   2,  44 }, {   2,  51 }, {   0,  47 },
        {   4,  39 }, {   2,  62 }, {   6,  46 }, {   0,  54 },
        {   3,  54 }, {   2,  58 }, {   4,  63 }, {   6,  51 },
        {   6,  57 }, {   7,  53 }, {   6,  52 }, {   6,  55 },
        {  11,  45 }, {  14,  36 }, {   8,  53 }, {  -1,  82 },
        {   7,  55 }, {  -3,  78 }, {  15,  46 }, {  22,  31 },
        {  -1,  84 }, {  25,   7 }, {  30,  -7 }, {  28,   3 },
        {  28,   4 }, {  32,   0 }, {  34,  -1 }, {  30,   6 },
        {  30,   6 }, {  32,   9 }, {  31,  19 }, {  26,  27 },
        {  26,  30 }, {  37,  20 }, {  28,  34 }, {  17,  70 },
        {  11,  28 }, {   2,  40 }, {   3,  44 }, {   0,  49 },
        {   0,  46 }, {   2,  44 }, {   2,  51 }, {   0,  47 },
        {   4,  39 }, {   2,  62 }, {   6,  46 }, {   0,  54 },
        {   3,  54 }, {   2,  58 }, {   4,  63 }, {   6,  51 },
        {   6,  57 }, {   7,  53 }, {   6,  52 }, {   6,  55 },
        {  11,  45 }, {  14,  36 }, {   8,  53 }, {  -1,  82 },
        {   7,  55 }, {  -3,  78 }, {  15,  46 }, {  22,  31 },
        {  -1,  84 }, {  25,   7 }, {  30,  -7 }, {  28,   3 },
        {  28,   4 }, {  32,   0 }, {  34,  -1 }, {  30,   6 },
        {  30,   6 }, {  32,   9 }, {  31,  19 }, {  26,  27 },
        {  26,  30 }, {  37,  20 }, {  28,  34 }, {  17,  70 },
        {  -4,  79 }, {  -7,  71 }, {  -5,  69 }, {  -9,  70 },
        {  -8,  66 }, { -10,  68 }, { -19,  73 }, { -12,  69 },
        { -16,  70 }, { -15,  67 }, { -20,  62 }, { -19,  70 },
        { -16,  66 }, { -22,  65 }, { -20,  63 }, {  -5,  85 },
        {  -6,  81 }, { -10,  77 }, {  -7,  81 }, { -17,  80 },
        { -18,  73 }, {  -4,  74 }, { -10,  83 }, {  -9,  71 },
        {  -9,  67 }, {  -1,  61 }, {  -8,  66 }, { -14,  66 },
        {   0,  59 }, {   2,  59 }, {   9,  -2 }, {  26,  -9 },
        {  33,  -9 }, {  39,  -7 }, {  41,  -2 }, {  45,   3 },
        {  49,   9 }, {  45,  27 }, {  36,  59 }, {  21, -13 },
        {  33, -14 }, {  39,  -7 }, {  46,  -2 }, {  51,   2 },
        {  60,   6 }, {  61,  17 }, {  55,  34 }, {  42,  62 },
        {  -6,  66 }, {  -7,  35 }, {  -7,  42 }, {  -8,  45 },
        {  -5,  48 }, { -12,  56 }, {  -6,  60 }, {  -5,  62 },
        {  -8,  66 }, {  -8,  76 }, {  -4,  79 }, {  -7,  71 },
        {  -5,  69 }, {  -9,  70 }, {  -8,  66 }, { -10,  68 },
        { -19,  73 }, { -12,  69 }, { -16,  70 }, { -15,  67 },
        { -20,  62 }, { -19,  70 }, { -16,  66 }, { -22,  65 },
        { -20,  63 }, {  -5,  85 }, {  -6,  81 }, { -10,  77 },
        {  -7,  81 }, { -17,  80 }, { -18,  73 }, {  -4,  74 },
        { -10,  83 }, {  -9,  71 }, {  -9,  67 }, {  -1,  61 },
        {  -8,  66 }, { -14,  66 }, {   0,  59 }, {   2,  59 },
        {   9,  -2 }, {  26,  -9 }, {  33,  -9 }, {  39,  -7 },
        {  41,  -2 }, {  45,   3 }, {  49,   9 }, {  45,  27 },
        {  36,  59 }, {  21, -13 }, {  33, -14 }, {  39,  -7 },
        {  46,  -2 }, {  51,   2 }, {  60,   6 }, {  61,  17 },
        {  55,  34 }, {  42,  62 }, {  -6,  66 }, {  -7,  35 },
        {  -7,  42 }, {  -8,  45 }, {  -5,  48 }, { -12,  56 },
        {  -6,  60 }, {  -5,  62 }, {  -8,  66 }, {  -8,  76 },
        { -13, 106 }, { -16, 106 }, { -10,  87 }, { -21, 114 },
        { -18, 110 }, { -14,  98 }, { -22, 110 }, { -21, 106 },
        { -18, 103 }, { -21, 107 }, { -23, 108 }, { -26, 112 },
        { -10,  96 }, { -12,  95 }, {  -5,  91 }, {  -9,  93 },
        { -22,  94 }, {  -5,  86 }, {   9,  67 }, {  -4,  80 },
        { -10,  85 }, {  -1,  70 }, {   7,  60 }, {   9,  58 },
        {   5,  61 }, {  12,  50 }, {  15,  50 }, {  18,  49 },
        {  17,  54 }, {  10,  41 }, {   7,  46 }, {  -1,  51 },
        {   7,  49 }, {   8,  52 }, {   9,  41 }, {   6,  47 },
        {   2,  55 }, {  13,  41 }, {  10,  44 }, {   6,  50 },
        {   5,  53 }, {  13,  49 }, {   4,  63 }, {   6,  64 },
        { -13, 106 }, { -16, 106 }, { -10,  87 }, { -21, 114 },
        { -18, 110 }, { -14,  98 }, { -22, 110 }, { -21, 106 },
        { -18, 103 }, { -21, 107 }, { -23, 108 }, { -26, 112 },
        { -10,  96 }, { -12,  95 }, {  -5,  91 }, {  -9,  93 },
        { -22,  94 }, {  -5,  86 }, {   9,  67 }, {  -4,  80 },
        { -10,  85 }, {  -1,  70 }, {   7,  60 }, {   9,  58 },
        {   5,  61 }, {  12,  50 }, {  15,  50 }, {  18,  49 },
        {  17,  54 }, {  10,  41 }, {   7,  46 }, {  -1,  51 },
        {   7,  49 }, {   8,  52 }, {   9,  41 }, {   6,  47 },
        {   2,  55 }, {  13,  41 }, {  10,  44 }, {   6,  50 },
        {   5,  53 }, {  13,  49 }, {   4,  63 }, {   6,  64 },
        {  14,  11 }, {  11,  14 }, {   9,  11 }, {  18,  11 },
        {  21,   9 }, {  23,  -2 }, {  32, -15 }, {  32, -15 },
        {  34, -21 }, {  39, -23 }, {  42, -33 }, {  41, -31 },
        {  46, -28 }, {  38, -12 }, {  21,  29 }, {  45, -24 },
        {  53, -45 }, {  48, -26 }, {  65, -43 }, {  43, -19 },
        {  39, -10 }, {  30,   9 }, {  18,  26 }, {  20,  27 },
        {   0,  57 }, { -14,  82 }, {  -5,  75 }, { -19,  97 },
        { -35, 125 }, {  27,   0 }, {  28,   0 }, {  31,  -4 },
        {  27,   6 }, {  34,   8 }, {  30,  10 }, {  24,  22 },
        {  33,  19 }, {  22,  32 }, {  26,  31 }, {  21,  41 },
        {  26,  44 }, {  23,  47 }, {  16,  65 }, {  14,  71 },
        {  14,  11 }, {  11,  14 }, {   9,  11 }, {  18,  11 },
        {  21,   9 }, {  23,  -2 }, {  32, -15 }, {  32, -15 },
        {  34, -21 }, {  39, -23 }, {  42, -33 }, {  41, -31 },
        {  46, -28 }, {  38, -12 }, {  21,  29 }, {  45, -24 },
        {  53, -45 }, {  48, -26 }, {  65, -43 }, {  43, -19 },
        {  39, -10 }, {  30,   9 }, {  18,  26 }, {  20,  27 },
        {   0,  57 }, { -14,  82 }, {  -5,  75 }, { -19,  97 },
        { -35, 125 }, {  27,   0 }, {  28,   0 }, {  31,  -4 },
        {  27,   6 }, {  34,   8 }, {  30,  10 }, {  24,  22 },
        {  33,  19 }, {  22,  32 }, {  26,  31 }, {  21,  41 },
        {  26,  44 }, {  23,  47 }, {  16,  65 }, {  14,  71 },
        {  -6,  76 }, {  -2,  44 }, {   0,  45 }, {   0,  52 },
        {  -3,  64 }, {  -2,  59 }, {  -4,  70 }, {  -4,  75 },
        {  -8,  82 }, { -17, 102 }, {  -9,  77 }, {   3,  24 },
        {   0,  42 }, {   0,  48 }, {   0,  55 }, {  -6,  59 },
        {  -7,  71 }, { -12,  83 }, { -11,  87 }, { -30, 119 },
        {   1,  58 }, {  -3,  29 }, {  -1,  36 }, {   1,  38 },
        {   2,  43 }, {  -6,  55 }, {   0,  58 }, {   0,  64 },
        {  -3,  74 }, { -10,  90 }, {  -6,  76 }, {  -2,  44 },
        {   0,  45 }, {   0,  52 }, {  -3,  64 }, {  -2,  59 },
        {  -4,  70 }, {  -4,  75 }, {  -8,  82 }, { -17, 102 },
        {  -9,  77 }, {   3,  24 }, {   0,  42 }, {   0,  48 },
        {   0,  55 }, {  -6,  59 }, {  -7,  71 }, { -12,  83 },
        { -11,  87 }, { -30, 119 }, {   1,  58 }, {  -3,  29 },
        {  -1,  36 }, {   1,  38 }, {   2,  43 }, {  -6,  55 },
        {   0,  58 }, {   0,  64 }, {  -3,  74 }, { -10,  90 },
        {  -3,  74 }, {  -9,  92 }, {  -8,  87 }, { -23, 126 },
        {  -3,  74 }, {  -9,  92 }, {  -8,  87 }, { -23, 126 },
        {  -3,  74 }, {  -9,  92 }, {  -8,  87 }, { -23, 126 }
    },

    /* i_cabac_init_idc == 1 */
    {
        /* 0 - 10 */
        {  20, -15 }, {   2,  54 }, {   3,  74 }, {  20, -15 },
        {   2,  54 }, {   3,  74 }, { -28, 127 }, { -23, 104 },
        {  -6,  53 }, {  -1,  54 }, {   7,  51 },

        /* 11 - 23 */
        {  22,  25 }, {  34,   0 }, {  16,   0 }, {  -2,   9 },
        {   4,  41 }, { -29, 118 }, {   2,  65 }, {  -6,  71 },
        { -13,  79 }, {   5,  52 }, {   9,  50 }, {  -3,  70 },
        {  10,  54 },

        /* 24 - 39 */
        {  26,  34 }, {  19,  22 }, {  40,   0 }, {  57,   2 },
        {  41,  36 }, {  26,  69 }, { -45, 127 }, { -15, 101 },
        {  -4,  76 }, {  -6,  71 }, { -13,  79 }, {   5,  52 },
        {   6,  69 }, { -13,  90 }, {   0,  52 }, {   8,  43 },

        /* 40 - 53 */
        {  -2,  69 }, {  -5,  82 }, { -10,  96 }, {   2,  59 },
        {   2,  75 }, {  -3,  87 }, {  -3, 100 }, {   1,  56 },
        {  -3,  74 }, {  -6,  85 }, {   0,  59 }, {  -3,  81 },
        {  -7,  86 }, {  -5,  95 },

        /* 54 - 59 */
        {  -1,  66 }, {  -1,  77 }, {   1,  70 }, {  -2,  86 },
        {  -5,  72 }, {   0,  61 },

        /* 60 - 69 */
        {   0,  41 }, {   0,  63 }, {   0,  63 }, {   0,  63 },
        {  -9,  83 }, {   4,  86 }, {   0,  97 }, {  -7,  72 },
        {  13,  41 }, {   3,  62 },

        /* 70 - 104 */
        {  13,  15 }, {   7,  51 }, {   2,  80 }, { -39, 127 },
        { -18,  91 }, { -17,  96 }, { -26,  81 }, { -35,  98 },
        { -24, 102 }, { -23,  97 }, { -27, 119 }, { -24,  99 },
        { -21, 110 }, { -18, 102 }, { -36, 127 }, {   0,  80 },
        {  -5,  89 }, {  -7,  94 }, {  -4,  92 }, {   0,  39 },
        {   0,  65 }, { -15,  84 }, { -35, 127 }, {  -2,  73 },
        { -12, 104 }, {  -9,  91 }, { -31, 127 }, {   3,  55 },
        {   7,  56 }, {   7,  55 }, {   8,  61 }, {  -3,  53 },
        {   0,  68 }, {  -7,  74 }, {  -9,  88 },

        /* 105 -> 165 */
        { -13, 103 }, { -13,  91 }, {  -9,  89 }, { -14,  92 },
        {  -8,  76 }, { -12,  87 }, { -23, 110 }, { -24, 105 },
        { -10,  78 }, { -20, 112 }, { -17,  99 }, { -78, 127 },
        { -70, 127 }, { -50, 127 }, { -46, 127 }, {  -4,  66 },
        {  -5,  78 }, {  -4,  71 }, {  -8,  72 }, {   2,  59 },
        {  -1,  55 }, {  -7,  70 }, {  -6,  75 }, {  -8,  89 },
        { -34, 119 }, {  -3,  75 }, {  32,  20 }, {  30,  22 },
        { -44, 127 }, {   0,  54 }, {  -5,  61 }, {   0,  58 },
        {  -1,  60 }, {  -3,  61 }, {  -8,  67 }, { -25,  84 },
        { -14,  74 }, {  -5,  65 }, {   5,  52 }, {   2,  57 },
        {   0,  61 }, {  -9,  69 }, { -11,  70 }, {  18,  55 },
        {  -4,  71 }, {   0,  58 }, {   7,  61 }, {   9,  41 },
        {  18,  25 }, {   9,  32 }, {   5,  43 }, {   9,  47 },
        {   0,  44 }, {   0,  51 }, {   2,  46 }, {  19,  38 },
        {  -4,  66 }, {  15,  38 }, {  12,  42 }, {   9,  34 },
        {   0,  89 },

        /* 166 - 226 */
        {   4,  45 }, {  10,  28 }, {  10,  31 }, {  33, -11 },
        {  52, -43 }, {  18,  15 }, {  28,   0 }, {  35, -22 },
        {  38, -25 }, {  34,   0 }, {  39, -18 }, {  32, -12 },
        { 102, -94 }, {   0,   0 }, {  56, -15 }, {  33,  -4 },
        {  29,  10 }, {  37,  -5 }, {  51, -29 }, {  39,  -9 },
        {  52, -34 }, {  69, -58 }, {  67, -63 }, {  44,  -5 },
        {  32,   7 }, {  55, -29 }, {  32,   1 }, {   0,   0 },
        {  27,  36 }, {  33, -25 }, {  34, -30 }, {  36, -28 },
        {  38, -28 }, {  38, -27 }, {  34, -18 }, {  35, -16 },
        {  34, -14 }, {  32,  -8 }, {  37,  -6 }, {  35,   0 },
        {  30,  10 }, {  28,  18 }, {  26,  25 }, {  29,  41 },
        {   0,  75 }, {   2,  72 }, {   8,  77 }, {  14,  35 },
        {  18,  31 }, {  17,  35 }, {  21,  30 }, {  17,  45 },
        {  20,  42 }, {  18,  45 }, {  27,  26 }, {  16,  54 },
        {   7,  66 }, {  16,  56 }, {  11,  73 }, {  10,  67 },
        { -10, 116 },

        /* 227 - 275 */
        { -23, 112 }, { -15,  71 }, {  -7,  61 }, {   0,  53 },
        {  -5,  66 }, { -11,  77 }, {  -9,  80 }, {  -9,  84 },
        { -10,  87 }, { -34, 127 }, { -21, 101 }, {  -3,  39 },
        {  -5,  53 }, {  -7,  61 }, { -11,  75 }, { -15,  77 },
        { -17,  91 }, { -25, 107 }, { -25, 111 }, { -28, 122 },
        { -11,  76 }, { -10,  44 }, { -10,  52 }, { -10,  57 },
        {  -9,  58 }, { -16,  72 }, {  -7,  69 }, {  -4,  69 },
        {  -5,  74 }, {  -9,  86 }, {   2,  66 }, {  -9,  34 },
        {   1,  32 }, {  11,  31 }, {   5,  52 }, {  -2,  55 },
        {  -2,  67 }, {   0,  73 }, {  -8,  89 }, {   3,  52 },
        {   7,   4 }, {  10,   8 }, {  17,   8 }, {  16,  19 },
        {   3,  37 }, {  -1,  61 }, {  -5,  73 }, {  -1,  70 },
        {  -4,  78 },

        /* 276 a bit special (not used, x264_cabac_encode_bypass is used instead) */
        { 0, 0 },

        /* 277 - 337 */
        { -21, 126 }, { -23, 124 }, { -20, 110 }, { -26, 126 },
        { -25, 124 }, { -17, 105 }, { -27, 121 }, { -27, 117 },
        { -17, 102 }, { -26, 117 }, { -27, 116 }, { -33, 122 },
        { -10,  95 }, { -14, 100 }, {  -8,  95 }, { -17, 111 },
        { -28, 114 }, {  -6,  89 }, {  -2,  80 }, {  -4,  82 },
        {  -9,  85 }, {  -8,  81 }, {  -1,  72 }, {   5,  64 },
        {   1,  67 }, {   9,  56 }, {   0,  69 }, {   1,  69 },
        {   7,  69 }, {  -7,  69 }, {  -6,  67 }, { -16,  77 },
        {  -2,  64 }, {   2,  61 }, {  -6,  67 }, {  -3,  64 },
        {   2,  57 }, {  -3,  65 }, {  -3,  66 }, {   0,  62 },
        {   9,  51 }, {  -1,  66 }, {  -2,  71 }, {  -2,  75 },
        {  -1,  70 }, {  -9,  72 }, {  14,  60 }, {  16,  37 },
        {   0,  47 }, {  18,  35 }, {  11,  37 }, {  12,  41 },
        {  10,  41 }, {   2,  48 }, {  12,  41 }, {  13,  41 },
        {   0,  59 }, {   3,  50 }, {  19,  40 }, {   3,  66 },
        {  18,  50 },

        /* 338 - 398 */
        {  19,  -6 }, {  18,  -6 }, {  14,   0 }, {  26, -12 },
        {  31, -16 }, {  33, -25 }, {  33, -22 }, {  37, -28 },
        {  39, -30 }, {  42, -30 }, {  47, -42 }, {  45, -36 },
        {  49, -34 }, {  41, -17 }, {  32,   9 }, {  69, -71 },
        {  63, -63 }, {  66, -64 }, {  77, -74 }, {  54, -39 },
        {  52, -35 }, {  41, -10 }, {  36,   0 }, {  40,  -1 },
        {  30,  14 }, {  28,  26 }, {  23,  37 }, {  12,  55 },
        {  11,  65 }, {  37, -33 }, {  39, -36 }, {  40, -37 },
        {  38, -30 }, {  46, -33 }, {  42, -30 }, {  40, -24 },
        {  49, -29 }, {  38, -12 }, {  40, -10 }, {  38,  -3 },
        {  46,  -5 }, {  31,  20 }, {  29,  30 }, {  25,  44 },
        {  12,  48 }, {  11,  49 }, {  26,  45 }, {  22,  22 },
        {  23,  22 }, {  27,  21 }, {  33,  20 }, {  26,  28 },
        {  30,  24 }, {  27,  34 }, {  18,  42 }, {  25,  39 },
        {  18,  50 }, {  12,  70 }, {  21,  54 }, {  14,  71 },
        {  11,  83 },

        /* 399 -> 435 */
        {  25,  32 }, {  21,  49 }, {  21,  54 },
        {  -5,  85 }, {  -6,  81 }, { -10,  77 }, {  -7,  81 },
        { -17,  80 }, { -18,  73 }, {  -4,  74 }, { -10,  83 },
        {  -9,  71 }, {  -9,  67 }, {  -1,  61 }, {  -8,  66 },
        { -14,  66 }, {   0,  59 }, {   2,  59 }, {  17, -10 },
        {  32, -13 }, {  42,  -9 }, {  49,  -5 }, {  53,   0 },
        {  64,   3 }, {  68,  10 }, {  66,  27 }, {  47,  57 },
        {  -5,  71 }, {   0,  24 }, {  -1,  36 }, {  -2,  42 },
        {  -2,  52 }, {  -9,  57 }, {  -6,  63 }, {  -4,  65 },
        {  -4,  67 }, {  -7,  82 },

        /* 436 -> 459 */
        {  -3,  81 }, {  -3,  76 }, {  -7,  72 }, {  -6,  78 },
        { -12,  72 }, { -14,  68 }, {  -3,  70 }, {  -6,  76 },
        {  -5,  66 }, {  -5,  62 }, {   0,  57 }, {  -4,  61 },
        {  -9,  60 }, {   1,  54 }, {   2,  58 }, {  17, -10 },
        {  32, -13 }, {  42,  -9 }, {  49,  -5 }, {  53,   0 },
        {  64,   3 }, {  68,  10 }, {  66,  27 }, {  47,  57 },

        /* 460 - 1024 */
        {   0,  80 }, {  -5,  89 }, {  -7,  94 }, {  -4,  92 },
        {   0,  39 }, {   0,  65 }, { -15,  84 }, { -35, 127 },
        {  -2,  73 }, { -12, 104 }, {  -9,  91 }, { -31, 127 },
        {   0,  80 }, {  -5,  89 }, {  -7,  94 }, {  -4,  92 },
        {   0,  39 }, {   0,  65 }, { -15,  84 }, { -35, 127 },
        {  -2,  73 }, { -12, 104 }, {  -9,  91 }, { -31, 127 },
        { -13, 103 }, { -13,  91 }, {  -9,  89 }, { -14,  92 },
        {  -8,  76 }, { -12,  87 }, { -23, 110 }, { -24, 105 },
        { -10,  78 }, { -20, 112 }, { -17,  99 }, { -78, 127 },
        { -70, 127 }, { -50, 127 }, { -46, 127 }, {  -4,  66 },
        {  -5,  78 }, {  -4,  71 }, {  -8,  72 }, {   2,  59 },
        {  -1,  55 }, {  -7,  70 }, {  -6,  75 }, {  -8,  89 },
        { -34, 119 }, {  -3,  75 }, {  32,  20 }, {  30,  22 },
        { -44, 127 }, {   0,  54 }, {  -5,  61 }, {   0,  58 },
        {  -1,  60 }, {  -3,  61 }, {  -8,  67 }, { -25,  84 },
        { -14,  74 }, {  -5,  65 }, {   5,  52 }, {   2,  57 },
        {   0,  61 }, {  -9,  69 }, { -11,  70 }, {  18,  55 },
        { -13, 103 }, { -13,  91 }, {  -9,  89 }, { -14,  92 },
        {  -8,  76 }, { -12,  87 }, { -23, 110 }, { -24, 105 },
        { -10,  78 }, { -20, 112 }, { -17,  99 }, { -78, 127 },
        { -70, 127 }, { -50, 127 }, { -46, 127 }, {  -4,  66 },
        {  -5,  78 }, {  -4,  71 }, {  -8,  72 }, {   2,  59 },
        {  -1,  55 }, {  -7,  70 }, {  -6,  75 }, {  -8,  89 },
        { -34, 119 }, {  -3,  75 }, {  32,  20 }, {  30,  22 },
        { -44, 127 }, {   0,  54 }, {  -5,  61 }, {   0,  58 },
        {  -1,  60 }, {  -3,  61 }, {  -8,  67 }, { -25,  84 },
        { -14,  74 }, {  -5,  65 }, {   5,  52 }, {   2,  57 },
        {   0,  61 }, {  -9,  69 }, { -11,  70 }, {  18,  55 },
        {   4,  45 }, {  10,  28 }, {  10,  31 }, {  33, -11 },
        {  52, -43 }, {  18,  15 }, {  28,   0 }, {  35, -22 },
        {  38, -25 }, {  34,   0 }, {  39, -18 }, {  32, -12 },
        { 102, -94 }, {   0,   0 }, {  56, -15 }, {  33,  -4 },
        {  29,  10 }, {  37,  -5 }, {  51, -29 }, {  39,  -9 },
        {  52, -34 }, {  69, -58 }, {  67, -63 }, {  44,  -5 },
        {  32,   7 }, {  55, -29 }, {  32,   1 }, {   0,   0 },
        {  27,  36 }, {  33, -25 }, {  34, -30 }, {  36, -28 },
        {  38, -28 }, {  38, -27 }, {  34, -18 }, {  35, -16 },
        {  34, -14 }, {  32,  -8 }, {  37,  -6 }, {  35,   0 },
        {  30,  10 }, {  28,  18 }, {  26,  25 }, {  29,  41 },
        {   4,  45 }, {  10,  28 }, {  10,  31 }, {  33, -11 },
        {  52, -43 }, {  18,  15 }, {  28,   0 }, {  35, -22 },
        {  38, -25 }, {  34,   0 }, {  39, -18 }, {  32, -12 },
        { 102, -94 }, {   0,   0 }, {  56, -15 }, {  33,  -4 },
        {  29,  10 }, {  37,  -5 }, {  51, -29 }, {  39,  -9 },
        {  52, -34 }, {  69, -58 }, {  67, -63 }, {  44,  -5 },
        {  32,   7 }, {  55, -29 }, {  32,   1 }, {   0,   0 },
        {  27,  36 }, {  33, -25 }, {  34, -30 }, {  36, -28 },
        {  38, -28 }, {  38, -27 }, {  34, -18 }, {  35, -16 },
        {  34, -14 }, {  32,  -8 }, {  37,  -6 }, {  35,   0 },
        {  30,  10 }, {  28,  18 }, {  26,  25 }, {  29,  41 },
        {  -5,  85 }, {  -6,  81 }, { -10,  77 }, {  -7,  81 },
        { -17,  80 }, { -18,  73 }, {  -4,  74 }, { -10,  83 },
        {  -9,  71 }, {  -9,  67 }, {  -1,  61 }, {  -8,  66 },
        { -14,  66 }, {   0,  59 }, {   2,  59 }, {  -3,  81 },
        {  -3,  76 }, {  -7,  72 }, {  -6,  78 }, { -12,  72 },
        { -14,  68 }, {  -3,  70 }, {  -6,  76 }, {  -5,  66 },
        {  -5,  62 }, {   0,  57 }, {  -4,  61 }, {  -9,  60 },
        {   1,  54 }, {   2,  58 }, {  17, -10 }, {  32, -13 },
        {  42,  -9 }, {  49,  -5 }, {  53,   0 }, {  64,   3 },
        {  68,  10 }, {  66,  27 }, {  47,  57 }, {  17, -10 },
        {  32, -13 }, {  42,  -9 }, {  49,  -5 }, {  53,   0 },
        {  64,   3 }, {  68,  10 }, {  66,  27 }, {  47,  57 },
        {  -5,  71 }, {   0,  24 }, {  -1,  36 }, {  -2,  42 },
        {  -2,  52 }, {  -9,  57 }, {  -6,  63 }, {  -4,  65 },
        {  -4,  67 }, {  -7,  82 }, {  -5,  85 }, {  -6,  81 },
        { -10,  77 }, {  -7,  81 }, { -17,  80 }, { -18,  73 },
        {  -4,  74 }, { -10,  83 }, {  -9,  71 }, {  -9,  67 },
        {  -1,  61 }, {  -8,  66 }, { -14,  66 }, {   0,  59 },
        {   2,  59 }, {  -3,  81 }, {  -3,  76 }, {  -7,  72 },
        {  -6,  78 }, { -12,  72 }, { -14,  68 }, {  -3,  70 },
        {  -6,  76 }, {  -5,  66 }, {  -5,  62 }, {   0,  57 },
        {  -4,  61 }, {  -9,  60 }, {   1,  54 }, {   2,  58 },
        {  17, -10 }, {  32, -13 }, {  42,  -9 }, {  49,  -5 },
        {  53,   0 }, {  64,   3 }, {  68,  10 }, {  66,  27 },
        {  47,  57 }, {  17, -10 }, {  32, -13 }, {  42,  -9 },
        {  49,  -5 }, {  53,   0 }, {  64,   3 }, {  68,  10 },
        {  66,  27 }, {  47,  57 }, {  -5,  71 }, {   0,  24 },
        {  -1,  36 }, {  -2,  42 }, {  -2,  52 }, {  -9,  57 },
        {  -6,  63 }, {  -4,  65 }, {  -4,  67 }, {  -7,  82 },
        { -21, 126 }, { -23, 124 }, { -20, 110 }, { -26, 126 },
        { -25, 124 }, { -17, 105 }, { -27, 121 }, { -27, 117 },
        { -17, 102 }, { -26, 117 }, { -27, 116 }, { -33, 122 },
        { -10,  95 }, { -14, 100 }, {  -8,  95 }, { -17, 111 },
        { -28, 114 }, {  -6,  89 }, {  -2,  80 }, {  -4,  82 },
        {  -9,  85 }, {  -8,  81 }, {  -1,  72 }, {   5,  64 },
        {   1,  67 }, {   9,  56 }, {   0,  69 }, {   1,  69 },
        {   7,  69 }, {  -7,  69 }, {  -6,  67 }, { -16,  77 },
        {  -2,  64 }, {   2,  61 }, {  -6,  67 }, {  -3,  64 },
        {   2,  57 }, {  -3,  65 }, {  -3,  66 }, {   0,  62 },
        {   9,  51 }, {  -1,  66 }, {  -2,  71 }, {  -2,  75 },
        { -21, 126 }, { -23, 124 }, { -20, 110 }, { -26, 126 },
        { -25, 124 }, { -17, 105 }, { -27, 121 }, { -27, 117 },
        { -17, 102 }, { -26, 117 }, { -27, 116 }, { -33, 122 },
        { -10,  95 }, { -14, 100 }, {  -8,  95 }, { -17, 111 },
        { -28, 114 }, {  -6,  89 }, {  -2,  80 }, {  -4,  82 },
        {  -9,  85 }, {  -8,  81 }, {  -1,  72 }, {   5,  64 },
        {   1,  67 }, {   9,  56 }, {   0,  69 }, {   1,  69 },
        {   7,  69 }, {  -7,  69 }, {  -6,  67 }, { -16,  77 },
        {  -2,  64 }, {   2,  61 }, {  -6,  67 }, {  -3,  64 },
        {   2,  57 }, {  -3,  65 }, {  -3,  66 }, {   0,  62 },
        {   9,  51 }, {  -1,  66 }, {  -2,  71 }, {  -2,  75 },
        {  19,  -6 }, {  18,  -6 }, {  14,   0 }, {  26, -12 },
        {  31, -16 }, {  33, -25 }, {  33, -22 }, {  37, -28 },
        {  39, -30 }, {  42, -30 }, {  47, -42 }, {  45, -36 },
        {  49, -34 }, {  41, -17 }, {  32,   9 }, {  69, -71 },
        {  63, -63 }, {  66, -64 }, {  77, -74 }, {  54, -39 },
        {  52, -35 }, {  41, -10 }, {  36,   0 }, {  40,  -1 },
        {  30,  14 }, {  28,  26 }, {  23,  37 }, {  12,  55 },
        {  11,  65 }, {  37, -33 }, {  39, -36 }, {  40, -37 },
        {  38, -30 }, {  46, -33 }, {  42, -30 }, {  40, -24 },
        {  49, -29 }, {  38, -12 }, {  40, -10 }, {  38,  -3 },
        {  46,  -5 }, {  31,  20 }, {  29,  30 }, {  25,  44 },
        {  19,  -6 }, {  18,  -6 }, {  14,   0 }, {  26, -12 },
        {  31, -16 }, {  33, -25 }, {  33, -22 }, {  37, -28 },
        {  39, -30 }, {  42, -30 }, {  47, -42 }, {  45, -36 },
        {  49, -34 }, {  41, -17 }, {  32,   9 }, {  69, -71 },
        {  63, -63 }, {  66, -64 }, {  77, -74 }, {  54, -39 },
        {  52, -35 }, {  41, -10 }, {  36,   0 }, {  40,  -1 },
        {  30,  14 }, {  28,  26 }, {  23,  37 }, {  12,  55 },
        {  11,  65 }, {  37, -33 }, {  39, -36 }, {  40, -37 },
        {  38, -30 }, {  46, -33 }, {  42, -30 }, {  40, -24 },
        {  49, -29 }, {  38, -12 }, {  40, -10 }, {  38,  -3 },
        {  46,  -5 }, {  31,  20 }, {  29,  30 }, {  25,  44 },
        { -23, 112 }, { -15,  71 }, {  -7,  61 }, {   0,  53 },
        {  -5,  66 }, { -11,  77 }, {  -9,  80 }, {  -9,  84 },
        { -10,  87 }, { -34, 127 }, { -21, 101 }, {  -3,  39 },
        {  -5,  53 }, {  -7,  61 }, { -11,  75 }, { -15,  77 },
        { -17,  91 }, { -25, 107 }, { -25, 111 }, { -28, 122 },
        { -11,  76 }, { -10,  44 }, { -10,  52 }, { -10,  57 },
        {  -9,  58 }, { -16,  72 }, {  -7,  69 }, {  -4,  69 },
        {  -5,  74 }, {  -9,  86 }, { -23, 112 }, { -15,  71 },
        {  -7,  61 }, {   0,  53 }, {  -5,  66 }, { -11,  77 },
        {  -9,  80 }, {  -9,  84 }, { -10,  87 }, { -34, 127 },
        { -21, 101 }, {  -3,  39 }, {  -5,  53 }, {  -7,  61 },
        { -11,  75 }, { -15,  77 }, { -17,  91 }, { -25, 107 },
        { -25, 111 }, { -28, 122 }, { -11,  76 }, { -10,  44 },
        { -10,  52 }, { -10,  57 }, {  -9,  58 }, { -16,  72 },
        {  -7,  69 }, {  -4,  69 }, {  -5,  74 }, {  -9,  86 },
        {  -2,  73 }, { -12, 104 }, {  -9,  91 }, { -31, 127 },
        {  -2,  73 }, { -12, 104 }, {  -9,  91 }, { -31, 127 },
        {  -2,  73 }, { -12, 104 }, {  -9,  91 }, { -31, 127 }
    },

    /* i_cabac_init_idc == 2 */
    {
        /* 0 - 10 */
        {  20, -15 }, {   2,  54 }, {   3,  74 }, {  20, -15 },
        {   2,  54 }, {   3,  74 }, { -28, 127 }, { -23, 104 },
        {  -6,  53 }, {  -1,  54 }, {   7,  51 },

        /* 11 - 23 */
        {  29,  16 }, {  25,   0 }, {  14,   0 }, { -10,  51 },
        {  -3,  62 }, { -27,  99 }, {  26,  16 }, {  -4,  85 },
        { -24, 102 }, {   5,  57 }, {   6,  57 }, { -17,  73 },
        {  14,  57 },

        /* 24 - 39 */
        {  20,  40 }, {  20,  10 }, {  29,   0 }, {  54,   0 },
        {  37,  42 }, {  12,  97 }, { -32, 127 }, { -22, 117 },
        {  -2,  74 }, {  -4,  85 }, { -24, 102 }, {   5,  57 },
        {  -6,  93 }, { -14,  88 }, {  -6,  44 }, {   4,  55 },

        /* 40 - 53 */
        { -11,  89 }, { -15, 103 }, { -21, 116 }, {  19,  57 },
        {  20,  58 }, {   4,  84 }, {   6,  96 }, {   1,  63 },
        {  -5,  85 }, { -13, 106 }, {   5,  63 }, {   6,  75 },
        {  -3,  90 }, {  -1, 101 },

        /* 54 - 59 */
        {   3,  55 }, {  -4,  79 }, {  -2,  75 }, { -12,  97 },
        {  -7,  50 }, {   1,  60 },

        /* 60 - 69 */
        {   0,  41 }, {   0,  63 }, {   0,  63 }, {   0,  63 },
        {  -9,  83 }, {   4,  86 }, {   0,  97 }, {  -7,  72 },
        {  13,  41 }, {   3,  62 },

        /* 70 - 104 */
        {   7,  34 }, {  -9,  88 }, { -20, 127 }, { -36, 127 },
        { -17,  91 }, { -14,  95 }, { -25,  84 }, { -25,  86 },
        { -12,  89 }, { -17,  91 }, { -31, 127 }, { -14,  76 },
        { -18, 103 }, { -13,  90 }, { -37, 127 }, {  11,  80 },
        {   5,  76 }, {   2,  84 }, {   5,  78 }, {  -6,  55 },
        {   4,  61 }, { -14,  83 }, { -37, 127 }, {  -5,  79 },
        { -11, 104 }, { -11,  91 }, { -30, 127 }, {   0,  65 },
        {  -2,  79 }, {   0,  72 }, {  -4,  92 }, {  -6,  56 },
        {   3,  68 }, {  -8,  71 }, { -13,  98 },

        /* 105 -> 165 */
        {  -4,  86 }, { -12,  88 }, {  -5,  82 }, {  -3,  72 },
        {  -4,  67 }, {  -8,  72 }, { -16,  89 }, {  -9,  69 },
        {  -1,  59 }, {   5,  66 }, {   4,  57 }, {  -4,  71 },
        {  -2,  71 }, {   2,  58 }, {  -1,  74 }, {  -4,  44 },
        {  -1,  69 }, {   0,  62 }, {  -7,  51 }, {  -4,  47 },
        {  -6,  42 }, {  -3,  41 }, {  -6,  53 }, {   8,  76 },
        {  -9,  78 }, { -11,  83 }, {   9,  52 }, {   0,  67 },
        {  -5,  90 }, {   1,  67 }, { -15,  72 }, {  -5,  75 },
        {  -8,  80 }, { -21,  83 }, { -21,  64 }, { -13,  31 },
        { -25,  64 }, { -29,  94 }, {   9,  75 }, {  17,  63 },
        {  -8,  74 }, {  -5,  35 }, {  -2,  27 }, {  13,  91 },
        {   3,  65 }, {  -7,  69 }, {   8,  77 }, { -10,  66 },
        {   3,  62 }, {  -3,  68 }, { -20,  81 }, {   0,  30 },
        {   1,   7 }, {  -3,  23 }, { -21,  74 }, {  16,  66 },
        { -23, 124 }, {  17,  37 }, {  44, -18 }, {  50, -34 },
        { -22, 127 },

        /* 166 - 226 */
        {   4,  39 }, {   0,  42 }, {   7,  34 }, {  11,  29 },
        {   8,  31 }, {   6,  37 }, {   7,  42 }, {   3,  40 },
        {   8,  33 }, {  13,  43 }, {  13,  36 }, {   4,  47 },
        {   3,  55 }, {   2,  58 }, {   6,  60 }, {   8,  44 },
        {  11,  44 }, {  14,  42 }, {   7,  48 }, {   4,  56 },
        {   4,  52 }, {  13,  37 }, {   9,  49 }, {  19,  58 },
        {  10,  48 }, {  12,  45 }, {   0,  69 }, {  20,  33 },
        {   8,  63 }, {  35, -18 }, {  33, -25 }, {  28,  -3 },
        {  24,  10 }, {  27,   0 }, {  34, -14 }, {  52, -44 },
        {  39, -24 }, {  19,  17 }, {  31,  25 }, {  36,  29 },
        {  24,  33 }, {  34,  15 }, {  30,  20 }, {  22,  73 },
        {  20,  34 }, {  19,  31 }, {  27,  44 }, {  19,  16 },
        {  15,  36 }, {  15,  36 }, {  21,  28 }, {  25,  21 },
        {  30,  20 }, {  31,  12 }, {  27,  16 }, {  24,  42 },
        {   0,  93 }, {  14,  56 }, {  15,  57 }, {  26,  38 },
        { -24, 127 },

        /* 227 - 275 */
        { -24, 115 }, { -22,  82 }, {  -9,  62 }, {   0,  53 },
        {   0,  59 }, { -14,  85 }, { -13,  89 }, { -13,  94 },
        { -11,  92 }, { -29, 127 }, { -21, 100 }, { -14,  57 },
        { -12,  67 }, { -11,  71 }, { -10,  77 }, { -21,  85 },
        { -16,  88 }, { -23, 104 }, { -15,  98 }, { -37, 127 },
        { -10,  82 }, {  -8,  48 }, {  -8,  61 }, {  -8,  66 },
        {  -7,  70 }, { -14,  75 }, { -10,  79 }, {  -9,  83 },
        { -12,  92 }, { -18, 108 }, {  -4,  79 }, { -22,  69 },
        { -16,  75 }, {  -2,  58 }, {   1,  58 }, { -13,  78 },
        {  -9,  83 }, {  -4,  81 }, { -13,  99 }, { -13,  81 },
        {  -6,  38 }, { -13,  62 }, {  -6,  58 }, {  -2,  59 },
        { -16,  73 }, { -10,  76 }, { -13,  86 }, {  -9,  83 },
        { -10,  87 },

        /* 276 a bit special (not used, x264_cabac_encode_bypass is used instead) */
        { 0, 0 },

        /* 277 - 337 */
        { -22, 127 }, { -25, 127 }, { -25, 120 }, { -27, 127 },
        { -19, 114 }, { -23, 117 }, { -25, 118 }, { -26, 117 },
        { -24, 113 }, { -28, 118 }, { -31, 120 }, { -37, 124 },
        { -10,  94 }, { -15, 102 }, { -10,  99 }, { -13, 106 },
        { -50, 127 }, {  -5,  92 }, {  17,  57 }, {  -5,  86 },
        { -13,  94 }, { -12,  91 }, {  -2,  77 }, {   0,  71 },
        {  -1,  73 }, {   4,  64 }, {  -7,  81 }, {   5,  64 },
        {  15,  57 }, {   1,  67 }, {   0,  68 }, { -10,  67 },
        {   1,  68 }, {   0,  77 }, {   2,  64 }, {   0,  68 },
        {  -5,  78 }, {   7,  55 }, {   5,  59 }, {   2,  65 },
        {  14,  54 }, {  15,  44 }, {   5,  60 }, {   2,  70 },
        {  -2,  76 }, { -18,  86 }, {  12,  70 }, {   5,  64 },
        { -12,  70 }, {  11,  55 }, {   5,  56 }, {   0,  69 },
        {   2,  65 }, {  -6,  74 }, {   5,  54 }, {   7,  54 },
        {  -6,  76 }, { -11,  82 }, {  -2,  77 }, {  -2,  77 },
        {  25,  42 },

        /* 338 - 398 */
        {  17, -13 }, {  16,  -9 }, {  17, -12 }, {  27, -21 },
        {  37, -30 }, {  41, -40 }, {  42, -41 }, {  48, -47 },
        {  39, -32 }, {  46, -40 }, {  52, -51 }, {  46, -41 },
        {  52, -39 }, {  43, -19 }, {  32,  11 }, {  61, -55 },
        {  56, -46 }, {  62, -50 }, {  81, -67 }, {  45, -20 },
        {  35,  -2 }, {  28,  15 }, {  34,   1 }, {  39,   1 },
        {  30,  17 }, {  20,  38 }, {  18,  45 }, {  15,  54 },
        {   0,  79 }, {  36, -16 }, {  37, -14 }, {  37, -17 },
        {  32,   1 }, {  34,  15 }, {  29,  15 }, {  24,  25 },
        {  34,  22 }, {  31,  16 }, {  35,  18 }, {  31,  28 },
        {  33,  41 }, {  36,  28 }, {  27,  47 }, {  21,  62 },
        {  18,  31 }, {  19,  26 }, {  36,  24 }, {  24,  23 },
        {  27,  16 }, {  24,  30 }, {  31,  29 }, {  22,  41 },
        {  22,  42 }, {  16,  60 }, {  15,  52 }, {  14,  60 },
        {   3,  78 }, { -16, 123 }, {  21,  53 }, {  22,  56 },
        {  25,  61 },

        /* 399 -> 435 */
        {  21,  33 }, {  19,  50 }, {  17,  61 },
        {  -3,  78 }, {  -8,  74 }, {  -9,  72 }, { -10,  72 },
        { -18,  75 }, { -12,  71 }, { -11,  63 }, {  -5,  70 },
        { -17,  75 }, { -14,  72 }, { -16,  67 }, {  -8,  53 },
        { -14,  59 }, {  -9,  52 }, { -11,  68 }, {   9,  -2 },
        {  30, -10 }, {  31,  -4 }, {  33,  -1 }, {  33,   7 },
        {  31,  12 }, {  37,  23 }, {  31,  38 }, {  20,  64 },
        {  -9,  71 }, {  -7,  37 }, {  -8,  44 }, { -11,  49 },
        { -10,  56 }, { -12,  59 }, {  -8,  63 }, {  -9,  67 },
        {  -6,  68 }, { -10,  79 },

        /* 436 -> 459 */
        {  -3,  78 }, {  -8,  74 }, {  -9,  72 }, { -10,  72 },
        { -18,  75 }, { -12,  71 }, { -11,  63 }, {  -5,  70 },
        { -17,  75 }, { -14,  72 }, { -16,  67 }, {  -8,  53 },
        { -14,  59 }, {  -9,  52 }, { -11,  68 }, {   9,  -2 },
        {  30, -10 }, {  31,  -4 }, {  33,  -1 }, {  33,   7 },
        {  31,  12 }, {  37,  23 }, {  31,  38 }, {  20,  64 },

        /* 460 - 1024 */
        {  11,  80 }, {   5,  76 }, {   2,  84 }, {   5,  78 },
        {  -6,  55 }, {   4,  61 }, { -14,  83 }, { -37, 127 },
        {  -5,  79 }, { -11, 104 }, { -11,  91 }, { -30, 127 },
        {  11,  80 }, {   5,  76 }, {   2,  84 }, {   5,  78 },
        {  -6,  55 }, {   4,  61 }, { -14,  83 }, { -37, 127 },
        {  -5,  79 }, { -11, 104 }, { -11,  91 }, { -30, 127 },
        {  -4,  86 }, { -12,  88 }, {  -5,  82 }, {  -3,  72 },
        {  -4,  67 }, {  -8,  72 }, { -16,  89 }, {  -9,  69 },
        {  -1,  59 }, {   5,  66 }, {   4,  57 }, {  -4,  71 },
        {  -2,  71 }, {   2,  58 }, {  -1,  74 }, {  -4,  44 },
        {  -1,  69 }, {   0,  62 }, {  -7,  51 }, {  -4,  47 },
        {  -6,  42 }, {  -3,  41 }, {  -6,  53 }, {   8,  76 },
        {  -9,  78 }, { -11,  83 }, {   9,  52 }, {   0,  67 },
        {  -5,  90 }, {   1,  67 }, { -15,  72 }, {  -5,  75 },
        {  -8,  80 }, { -21,  83 }, { -21,  64 }, { -13,  31 },
        { -25,  64 }, { -29,  94 }, {   9,  75 }, {  17,  63 },
        {  -8,  74 }, {  -5,  35 }, {  -2,  27 }, {  13,  91 },
        {  -4,  86 }, { -12,  88 }, {  -5,  82 }, {  -3,  72 },
        {  -4,  67 }, {  -8,  72 }, { -16,  89 }, {  -9,  69 },
        {  -1,  59 }, {   5,  66 }, {   4,  57 }, {  -4,  71 },
        {  -2,  71 }, {   2,  58 }, {  -1,  74 }, {  -4,  44 },
        {  -1,  69 }, {   0,  62 }, {  -7,  51 }, {  -4,  47 },
        {  -6,  42 }, {  -3,  41 }, {  -6,  53 }, {   8,  76 },
        {  -9,  78 }, { -11,  83 }, {   9,  52 }, {   0,  67 },
        {  -5,  90 }, {   1,  67 }, { -15,  72 }, {  -5,  75 },
        {  -8,  80 }, { -21,  83 }, { -21,  64 }, { -13,  31 },
        { -25,  64 }, { -29,  94 }, {   9,  75 }, {  17,  63 },
        {  -8,  74 }, {  -5,  35 }, {  -2,  27 }, {  13,  91 },
        {   4,  39 }, {   0,  42 }, {   7,  34 }, {  11,  29 },
        {   8,  31 }, {   6,  37 }, {   7,  42 }, {   3,  40 },
        {   8,  33 }, {  13,  43 }, {  13,  36 }, {   4,  47 },
        {   3,  55 }, {   2,  58 }, {   6,  60 }, {   8,  44 },
        {  11,  44 }, {  14,  42 }, {   7,  48 }, {   4,  56 },
        {   4,  52 }, {  13,  37 }, {   9,  49 }, {  19,  58 },
        {  10,  48 }, {  12,  45 }, {   0,  69 }, {  20,  33 },
        {   8,  63 }, {  35, -18 }, {  33, -25 }, {  28,  -3 },
        {  24,  10 }, {  27,   0 }, {  34, -14 }, {  52, -44 },
        {  39, -24 }, {  19,  17 }, {  31,  25 }, {  36,  29 },
        {  24,  33 }, {  34,  15 }, {  30,  20 }, {  22,  73 },
        {   4,  39 }, {   0,  42 }, {   7,  34 }, {  11,  29 },
        {   8,  31 }, {   6,  37 }, {   7,  42 }, {   3,  40 },
        {   8,  33 }, {  13,  43 }, {  13,  36 }, {   4,  47 },
        {   3,  55 }, {   2,  58 }, {   6,  60 }, {   8,  44 },
        {  11,  44 }, {  14,  42 }, {   7,  48 }, {   4,  56 },
        {   4,  52 }, {  13,  37 }, {   9,  49 }, {  19,  58 },
        {  10,  48 }, {  12,  45 }, {   0,  69 }, {  20,  33 },
        {   8,  63 }, {  35, -18 }, {  33, -25 }, {  28,  -3 },
        {  24,  10 }, {  27,   0 }, {  34, -14 }, {  52, -44 },
        {  39, -24 }, {  19,  17 }, {  31,  25 }, {  36,  29 },
        {  24,  33 }, {  34,  15 }, {  30,  20 }, {  22,  73 },
        {  -3,  78 }, {  -8,  74 }, {  -9,  72 }, { -10,  72 },
        { -18,  75 }, { -12,  71 }, { -11,  63 }, {  -5,  70 },
        { -17,  75 }, { -14,  72 }, { -16,  67 }, {  -8,  53 },
        { -14,  59 }, {  -9,  52 }, { -11,  68 }, {  -3,  78 },
        {  -8,  74 }, {  -9,  72 }, { -10,  72 }, { -18,  75 },
        { -12,  71 }, { -11,  63 }, {  -5,  70 }, { -17,  75 },
        { -14,  72 }, { -16,  67 }, {  -8,  53 }, { -14,  59 },
        {  -9,  52 }, { -11,  68 }, {   9,  -2 }, {  30, -10 },
        {  31,  -4 }, {  33,  -1 }, {  33,   7 }, {  31,  12 },
        {  37,  23 }, {  31,  38 }, {  20,  64 }, {   9,  -2 },
        {  30, -10 }, {  31,  -4 }, {  33,  -1 }, {  33,   7 },
        {  31,  12 }, {  37,  23 }, {  31,  38 }, {  20,  64 },
        {  -9,  71 }, {  -7,  37 }, {  -8,  44 }, { -11,  49 },
        { -10,  56 }, { -12,  59 }, {  -8,  63 }, {  -9,  67 },
        {  -6,  68 }, { -10,  79 }, {  -3,  78 }, {  -8,  74 },
        {  -9,  72 }, { -10,  72 }, { -18,  75 }, { -12,  71 },
        { -11,  63 }, {  -5,  70 }, { -17,  75 }, { -14,  72 },
        { -16,  67 }, {  -8,  53 }, { -14,  59 }, {  -9,  52 },
        { -11,  68 }, {  -3,  78 }, {  -8,  74 }, {  -9,  72 },
        { -10,  72 }, { -18,  75 }, { -12,  71 }, { -11,  63 },
        {  -5,  70 }, { -17,  75 }, { -14,  72 }, { -16,  67 },
        {  -8,  53 }, { -14,  59 }, {  -9,  52 }, { -11,  68 },
        {   9,  -2 }, {  30, -10 }, {  31,  -4 }, {  33,  -1 },
        {  33,   7 }, {  31,  12 }, {  37,  23 }, {  31,  38 },
        {  20,  64 }, {   9,  -2 }, {  30, -10 }, {  31,  -4 },
        {  33,  -1 }, {  33,   7 }, {  31,  12 }, {  37,  23 },
        {  31,  38 }, {  20,  64 }, {  -9,  71 }, {  -7,  37 },
        {  -8,  44 }, { -11,  49 }, { -10,  56 }, { -12,  59 },
        {  -8,  63 }, {  -9,  67 }, {  -6,  68 }, { -10,  79 },
        { -22, 127 }, { -25, 127 }, { -25, 120 }, { -27, 127 },
        { -19, 114 }, { -23, 117 }, { -25, 118 }, { -26, 117 },
        { -24, 113 }, { -28, 118 }, { -31, 120 }, { -37, 124 },
        { -10,  94 }, { -15, 102 }, { -10,  99 }, { -13, 106 },
        { -50, 127 }, {  -5,  92 }, {  17,  57 }, {  -5,  86 },
        { -13,  94 }, { -12,  91 }, {  -2,  77 }, {   0,  71 },
        {  -1,  73 }, {   4,  64 }, {  -7,  81 }, {   5,  64 },
        {  15,  57 }, {   1,  67 }, {   0,  68 }, { -10,  67 },
        {   1,  68 }, {   0,  77 }, {   2,  64 }, {   0,  68 },
        {  -5,  78 }, {   7,  55 }, {   5,  59 }, {   2,  65 },
        {  14,  54 }, {  15,  44 }, {   5,  60 }, {   2,  70 },
        { -22, 127 }, { -25, 127 }, { -25, 120 }, { -27, 127 },
        { -19, 114 }, { -23, 117 }, { -25, 118 }, { -26, 117 },
        { -24, 113 }, { -28, 118 }, { -31, 120 }, { -37, 124 },
        { -10,  94 }, { -15, 102 }, { -10,  99 }, { -13, 106 },
        { -50, 127 }, {  -5,  92 }, {  17,  57 }, {  -5,  86 },
        { -13,  94 }, { -12,  91 }, {  -2,  77 }, {   0,  71 },
        {  -1,  73 }, {   4,  64 }, {  -7,  81 }, {   5,  64 },
        {  15,  57 }, {   1,  67 }, {   0,  68 }, { -10,  67 },
        {   1,  68 }, {   0,  77 }, {   2,  64 }, {   0,  68 },
        {  -5,  78 }, {   7,  55 }, {   5,  59 }, {   2,  65 },
        {  14,  54 }, {  15,  44 }, {   5,  60 }, {   2,  70 },
        {  17, -13 }, {  16,  -9 }, {  17, -12 }, {  27, -21 },
        {  37, -30 }, {  41, -40 }, {  42, -41 }, {  48, -47 },
        {  39, -32 }, {  46, -40 }, {  52, -51 }, {  46, -41 },
        {  52, -39 }, {  43, -19 }, {  32,  11 }, {  61, -55 },
        {  56, -46 }, {  62, -50 }, {  81, -67 }, {  45, -20 },
        {  35,  -2 }, {  28,  15 }, {  34,   1 }, {  39,   1 },
        {  30,  17 }, {  20,  38 }, {  18,  45 }, {  15,  54 },
        {   0,  79 }, {  36, -16 }, {  37, -14 }, {  37, -17 },
        {  32,   1 }, {  34,  15 }, {  29,  15 }, {  24,  25 },
        {  34,  22 }, {  31,  16 }, {  35,  18 }, {  31,  28 },
        {  33,  41 }, {  36,  28 }, {  27,  47 }, {  21,  62 },
        {  17, -13 }, {  16,  -9 }, {  17, -12 }, {  27, -21 },
        {  37, -30 }, {  41, -40 }, {  42, -41 }, {  48, -47 },
        {  39, -32 }, {  46, -40 }, {  52, -51 }, {  46, -41 },
        {  52, -39 }, {  43, -19 }, {  32,  11 }, {  61, -55 },
        {  56, -46 }, {  62, -50 }, {  81, -67 }, {  45, -20 },
        {  35,  -2 }, {  28,  15 }, {  34,   1 }, {  39,   1 },
        {  30,  17 }, {  20,  38 }, {  18,  45 }, {  15,  54 },
        {   0,  79 }, {  36, -16 }, {  37, -14 }, {  37, -17 },
        {  32,   1 }, {  34,  15 }, {  29,  15 }, {  24,  25 },
        {  34,  22 }, {  31,  16 }, {  35,  18 }, {  31,  28 },
        {  33,  41 }, {  36,  28 }, {  27,  47 }, {  21,  62 },
        { -24, 115 }, { -22,  82 }, {  -9,  62 }, {   0,  53 },
        {   0,  59 }, { -14,  85 }, { -13,  89 }, { -13,  94 },
        { -11,  92 }, { -29, 127 }, { -21, 100 }, { -14,  57 },
        { -12,  67 }, { -11,  71 }, { -10,  77 }, { -21,  85 },
        { -16,  88 }, { -23, 104 }, { -15,  98 }, { -37, 127 },
        { -10,  82 }, {  -8,  48 }, {  -8,  61 }, {  -8,  66 },
        {  -7,  70 }, { -14,  75 }, { -10,  79 }, {  -9,  83 },
        { -12,  92 }, { -18, 108 }, { -24, 115 }, { -22,  82 },
        {  -9,  62 }, {   0,  53 }, {   0,  59 }, { -14,  85 },
        { -13,  89 }, { -13,  94 }, { -11,  92 }, { -29, 127 },
        { -21, 100 }, { -14,  57 }, { -12,  67 }, { -11,  71 },
        { -10,  77 }, { -21,  85 }, { -16,  88 }, { -23, 104 },
        { -15,  98 }, { -37, 127 }, { -10,  82 }, {  -8,  48 },
        {  -8,  61 }, {  -8,  66 }, {  -7,  70 }, { -14,  75 },
        { -10,  79 }, {  -9,  83 }, { -12,  92 }, { -18, 108 },
        {  -5,  79 }, { -11, 104 }, { -11,  91 }, { -30, 127 },
        {  -5,  79 }, { -11, 104 }, { -11,  91 }, { -30, 127 },
        {  -5,  79 }, { -11, 104 }, { -11,  91 }, { -30, 127 }
    }
};
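
/* Illustrative sketch (not upstream x264 code): how the { m, n } pairs above
 * become runtime CABAC states.  Following H.264 spec 9.3.1.1 and the state
 * layout implied by the tables below (lower packed states = more skewed LPS
 * probability, packed as ( probability_index << 1 ) | valMPS), a QP-dependent
 * initializer could look like this.  The I-slice table name and the exact
 * packing are assumptions matching upstream's table layout, not its code. */
static inline int sketch_clip3( int v, int lo, int hi )
{
    return v < lo ? lo : v > hi ? hi : v;
}

static void sketch_cabac_context_init( uint8_t state[1024], int b_intra, int i_idc, int i_qp )
{
    const int8_t (*init)[2] = b_intra ? x264_cabac_context_init_I /* assumed name */
                                      : x264_cabac_context_init_PB[i_idc];
    for( int i = 0; i < 1024; i++ )
    {
        int pre = sketch_clip3( ( ( init[i][0] * sketch_clip3( i_qp, 0, 51 ) ) >> 4 ) + init[i][1], 1, 126 );
        int mps = pre >> 6;              /* valMPS: 1 iff preCtxState >= 64 */
        int idx = mps ? 127 - pre : pre; /* 63 - pStateIdx in spec numbering */
        state[i] = (uint8_t)( ( idx << 1 ) | mps );
    }
}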

const uint8_t x264_cabac_range_lps[64][4] =
{
    {  2,   2,   2,   2}, {  6,   7,   8,   9}, {  6,   7,   9,  10}, {  6,   8,   9,  11},
    {  7,   8,  10,  11}, {  7,   9,  10,  12}, {  7,   9,  11,  12}, {  8,   9,  11,  13},
    {  8,  10,  12,  14}, {  9,  11,  12,  14}, {  9,  11,  13,  15}, { 10,  12,  14,  16},
    { 10,  12,  15,  17}, { 11,  13,  15,  18}, { 11,  14,  16,  19}, { 12,  14,  17,  20},
    { 12,  15,  18,  21}, { 13,  16,  19,  22}, { 14,  17,  20,  23}, { 14,  18,  21,  24},
    { 15,  19,  22,  25}, { 16,  20,  23,  27}, { 17,  21,  25,  28}, { 18,  22,  26,  30},
    { 19,  23,  27,  31}, { 20,  24,  29,  33}, { 21,  26,  30,  35}, { 22,  27,  32,  37},
    { 23,  28,  33,  39}, { 24,  30,  35,  41}, { 26,  31,  37,  43}, { 27,  33,  39,  45},
    { 29,  35,  41,  48}, { 30,  37,  43,  50}, { 32,  39,  46,  53}, { 33,  41,  48,  56},
    { 35,  43,  51,  59}, { 37,  45,  54,  62}, { 39,  48,  56,  65}, { 41,  50,  59,  69},
    { 43,  53,  63,  72}, { 46,  56,  66,  76}, { 48,  59,  69,  80}, { 51,  62,  73,  85},
    { 53,  65,  77,  89}, { 56,  69,  81,  94}, { 59,  72,  86,  99}, { 62,  76,  90, 104},
    { 66,  80,  95, 110}, { 69,  85, 100, 116}, { 73,  89, 105, 122}, { 77,  94, 111, 128},
    { 81,  99, 117, 135}, { 85, 104, 123, 142}, { 90, 110, 130, 150}, { 95, 116, 137, 158},
    {100, 122, 144, 166}, {105, 128, 152, 175}, {111, 135, 160, 185}, {116, 142, 169, 195},
    {123, 150, 178, 205}, {128, 158, 187, 216}, {128, 167, 197, 227}, {128, 176, 208, 240}
};

const uint8_t x264_cabac_transition[128][2] =
{
    {  0,   0}, {  1,   1}, {  2,  50}, { 51,   3}, {  2,  50}, { 51,   3}, {  4,  52}, { 53,   5},
    {  6,  52}, { 53,   7}, {  8,  52}, { 53,   9}, { 10,  54}, { 55,  11}, { 12,  54}, { 55,  13},
    { 14,  54}, { 55,  15}, { 16,  56}, { 57,  17}, { 18,  56}, { 57,  19}, { 20,  56}, { 57,  21},
    { 22,  58}, { 59,  23}, { 24,  58}, { 59,  25}, { 26,  60}, { 61,  27}, { 28,  60}, { 61,  29},
    { 30,  60}, { 61,  31}, { 32,  62}, { 63,  33}, { 34,  62}, { 63,  35}, { 36,  64}, { 65,  37},
    { 38,  66}, { 67,  39}, { 40,  66}, { 67,  41}, { 42,  66}, { 67,  43}, { 44,  68}, { 69,  45},
    { 46,  68}, { 69,  47}, { 48,  70}, { 71,  49}, { 50,  72}, { 73,  51}, { 52,  72}, { 73,  53},
    { 54,  74}, { 75,  55}, { 56,  74}, { 75,  57}, { 58,  76}, { 77,  59}, { 60,  78}, { 79,  61},
    { 62,  78}, { 79,  63}, { 64,  80}, { 81,  65}, { 66,  82}, { 83,  67}, { 68,  82}, { 83,  69},
    { 70,  84}, { 85,  71}, { 72,  84}, { 85,  73}, { 74,  88}, { 89,  75}, { 76,  88}, { 89,  77},
    { 78,  90}, { 91,  79}, { 80,  90}, { 91,  81}, { 82,  94}, { 95,  83}, { 84,  94}, { 95,  85},
    { 86,  96}, { 97,  87}, { 88,  96}, { 97,  89}, { 90, 100}, {101,  91}, { 92, 100}, {101,  93},
    { 94, 102}, {103,  95}, { 96, 104}, {105,  97}, { 98, 104}, {105,  99}, {100, 108}, {109, 101},
    {102, 108}, {109, 103}, {104, 110}, {111, 105}, {106, 112}, {113, 107}, {108, 114}, {115, 109},
    {110, 116}, {117, 111}, {112, 118}, {119, 113}, {114, 118}, {119, 115}, {116, 122}, {123, 117},
    {118, 122}, {123, 119}, {120, 124}, {125, 121}, {122, 126}, {127, 123}, {124, 127}, {126, 125}
};

const uint8_t x264_cabac_renorm_shift[64] =
{
    6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
};
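
/* x264_cabac_renorm_shift[i_range>>3] is the number of left-shifts needed to
 * bring i_range back into [0x100,0x200).  For example, i_range = 55 indexes
 * entry 55>>3 = 6, whose value 3 renormalizes the range to 55<<3 = 440. */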

/* -log2(probability), stored in 8.8 fixed point via FIX8 */
const uint16_t x264_cabac_entropy[128] =
{
    FIX8(0.0273), FIX8(5.7370), FIX8(0.0288), FIX8(5.6618),
    FIX8(0.0303), FIX8(5.5866), FIX8(0.0320), FIX8(5.5114),
    FIX8(0.0337), FIX8(5.4362), FIX8(0.0355), FIX8(5.3610),
    FIX8(0.0375), FIX8(5.2859), FIX8(0.0395), FIX8(5.2106),
    FIX8(0.0416), FIX8(5.1354), FIX8(0.0439), FIX8(5.0602),
    FIX8(0.0463), FIX8(4.9851), FIX8(0.0488), FIX8(4.9099),
    FIX8(0.0515), FIX8(4.8347), FIX8(0.0543), FIX8(4.7595),
    FIX8(0.0572), FIX8(4.6843), FIX8(0.0604), FIX8(4.6091),
    FIX8(0.0637), FIX8(4.5339), FIX8(0.0671), FIX8(4.4588),
    FIX8(0.0708), FIX8(4.3836), FIX8(0.0747), FIX8(4.3083),
    FIX8(0.0788), FIX8(4.2332), FIX8(0.0832), FIX8(4.1580),
    FIX8(0.0878), FIX8(4.0828), FIX8(0.0926), FIX8(4.0076),
    FIX8(0.0977), FIX8(3.9324), FIX8(0.1032), FIX8(3.8572),
    FIX8(0.1089), FIX8(3.7820), FIX8(0.1149), FIX8(3.7068),
    FIX8(0.1214), FIX8(3.6316), FIX8(0.1282), FIX8(3.5565),
    FIX8(0.1353), FIX8(3.4813), FIX8(0.1429), FIX8(3.4061),
    FIX8(0.1510), FIX8(3.3309), FIX8(0.1596), FIX8(3.2557),
    FIX8(0.1686), FIX8(3.1805), FIX8(0.1782), FIX8(3.1053),
    FIX8(0.1884), FIX8(3.0301), FIX8(0.1992), FIX8(2.9549),
    FIX8(0.2107), FIX8(2.8797), FIX8(0.2229), FIX8(2.8046),
    FIX8(0.2358), FIX8(2.7294), FIX8(0.2496), FIX8(2.6542),
    FIX8(0.2642), FIX8(2.5790), FIX8(0.2798), FIX8(2.5038),
    FIX8(0.2964), FIX8(2.4286), FIX8(0.3142), FIX8(2.3534),
    FIX8(0.3331), FIX8(2.2782), FIX8(0.3532), FIX8(2.2030),
    FIX8(0.3748), FIX8(2.1278), FIX8(0.3979), FIX8(2.0527),
    FIX8(0.4226), FIX8(1.9775), FIX8(0.4491), FIX8(1.9023),
    FIX8(0.4776), FIX8(1.8271), FIX8(0.5082), FIX8(1.7519),
    FIX8(0.5412), FIX8(1.6767), FIX8(0.5768), FIX8(1.6015),
    FIX8(0.6152), FIX8(1.5263), FIX8(0.6568), FIX8(1.4511),
    FIX8(0.7020), FIX8(1.3759), FIX8(0.7513), FIX8(1.3008),
    FIX8(0.8050), FIX8(1.2256), FIX8(0.8638), FIX8(1.1504),
    FIX8(0.9285), FIX8(1.0752), FIX8(1.0000), FIX8(1.0000)
};

uint8_t x264_cabac_contexts[4][QP_MAX_SPEC+1][1024];

void x264_cabac_init( x264_t *h )
{
    int ctx_count = CHROMA444 ? 1024 : 460;
    for( int i = 0; i < 4; i++ )
    {
        const int8_t (*cabac_context_init)[1024][2] = i == 0 ? &x264_cabac_context_init_I
                                                             : &x264_cabac_context_init_PB[i-1];
        for( int qp = 0; qp <= QP_MAX_SPEC; qp++ )
            for( int j = 0; j < ctx_count; j++ )
            {
                int state = x264_clip3( (((*cabac_context_init)[j][0] * qp) >> 4) + (*cabac_context_init)[j][1], 1, 126 );
                x264_cabac_contexts[i][qp][j] = (X264_MIN( state, 127-state ) << 1) | (state >> 6);
            }
    }
}
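
/* A worked example of the packing above: the init pair {m,n} = {-5,79} at
 * qp = 26 gives state = x264_clip3( ((-5*26)>>4) + 79, 1, 126 ) = -9 + 79
 * = 70 (relying on arithmetic right shift of negatives).  70 > 63 means the
 * MPS is 1, so the stored byte is (X264_MIN(70,127-70)<<1) | (70>>6)
 * = (57<<1) | 1 = 115: bits 7..1 index x264_cabac_range_lps and
 * x264_cabac_transition, bit 0 is the current MPS. */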

/*****************************************************************************
 * x264_cabac_context_init:
 *****************************************************************************/
void x264_cabac_context_init( x264_t *h, x264_cabac_t *cb, int i_slice_type, int i_qp, int i_model )
{
    memcpy( cb->state, x264_cabac_contexts[i_slice_type == SLICE_TYPE_I ? 0 : i_model + 1][i_qp], CHROMA444 ? 1024 : 460 );
}

void x264_cabac_encode_init_core( x264_cabac_t *cb )
{
    cb->i_low   = 0;
    cb->i_range = 0x01FE;
    cb->i_queue = -9; // the first bit will be shifted away and not written
    cb->i_bytes_outstanding = 0;
}

void x264_cabac_encode_init( x264_cabac_t *cb, uint8_t *p_data, uint8_t *p_end )
{
    x264_cabac_encode_init_core( cb );
    cb->p_start = p_data;
    cb->p       = p_data;
    cb->p_end   = p_end;
}

static inline void x264_cabac_putbyte( x264_cabac_t *cb )
{
    if( cb->i_queue >= 0 )
    {
        int out = cb->i_low >> (cb->i_queue+10);
        cb->i_low &= (0x400<<cb->i_queue)-1;
        cb->i_queue -= 8;

        if( (out & 0xff) == 0xff )
            cb->i_bytes_outstanding++;
        else
        {
            int carry = out >> 8;
            int bytes_outstanding = cb->i_bytes_outstanding;
            // the carry can never propagate past the start of the cabac
            // payload, since that would correspond to a probability > 1.
            // the very first output byte may land one byte before p_start,
            // which is ok because a slice header always precedes cabac data.
            // the carry can't ripple beyond that one byte, because any 0xff
            // bytes are held in bytes_outstanding and thus not yet written.
            cb->p[-1] += carry;
            while( bytes_outstanding > 0 )
            {
                *(cb->p++) = carry-1;
                bytes_outstanding--;
            }
            *(cb->p++) = out;
            cb->i_bytes_outstanding = 0;
        }
    }
}
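
/* Carry example: suppose the last written byte is 0x7f and two 0xff bytes
 * are outstanding.  If the next resolved byte has its carry bit set, the
 * stream ...7f ff ff becomes ...80 00 00: p[-1] gets +1 and each outstanding
 * byte is emitted as carry-1 = 0x00, followed by the new low byte. */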

static inline void x264_cabac_encode_renorm( x264_cabac_t *cb )
{
    int shift = x264_cabac_renorm_shift[cb->i_range>>3];
    cb->i_range <<= shift;
    cb->i_low   <<= shift;
    cb->i_queue  += shift;
    x264_cabac_putbyte( cb );
}

/* Making custom versions of this function, even in asm, for the cases where
 * b is known to be 0 or 1, proved to be somewhat useful on x86_32 with GCC 3.4
 * but nearly useless with GCC 4.3 and worse than useless on x86_64. */
void x264_cabac_encode_decision_c( x264_cabac_t *cb, int i_ctx, int b )
{
    int i_state = cb->state[i_ctx];
    int i_range_lps = x264_cabac_range_lps[i_state>>1][(cb->i_range>>6)-4];
    cb->i_range -= i_range_lps;
    if( b != (i_state & 1) )
    {
        cb->i_low += cb->i_range;
        cb->i_range = i_range_lps;
    }
    cb->state[i_ctx] = x264_cabac_transition[i_state][b];
    x264_cabac_encode_renorm( cb );
}

/* Note: b is negated for this function: callers pass 0 or -1, so
 * (b & cb->i_range) selects i_range exactly when the bit is set */
void x264_cabac_encode_bypass_c( x264_cabac_t *cb, int b )
{
    cb->i_low <<= 1;
    cb->i_low += b & cb->i_range;
    cb->i_queue += 1;
    x264_cabac_putbyte( cb );
}

static const int bypass_lut[16] =
{
    -1,      0x2,     0x14,     0x68,     0x1d0,     0x7a0,     0x1f40,     0x7e80,
    0x1fd00, 0x7fa00, 0x1ff400, 0x7fe800, 0x1ffd000, 0x7ffa000, 0x1fff4000, 0x7ffe8000
};

void x264_cabac_encode_ue_bypass( x264_cabac_t *cb, int exp_bits, int val )
{
    uint32_t v = val + (1<<exp_bits);
    int k = 31 - x264_clz( v );
    uint32_t x = (bypass_lut[k-exp_bits]<<exp_bits) + v;
    k = 2*k+1-exp_bits;
    int i = ((k-1)&7)+1;
    do {
        k -= i;
        cb->i_low <<= i;
        cb->i_low += ((x>>k)&0xff) * cb->i_range;
        cb->i_queue += i;
        x264_cabac_putbyte( cb );
        i = 8;
    } while( k > 0 );
}
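
/* Bypass bits are equiprobable, so no context state is involved: a single
 * bypass step doubles i_low and conditionally adds i_range.  The loop above
 * batches up to 8 such bits per iteration -- ((x>>k)&0xff) * i_range adds
 * each bit of the chunk weighted by its position -- with bypass_lut folding
 * the unary prefix of the Exp-Golomb code into x up front. */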

void x264_cabac_encode_terminal_c( x264_cabac_t *cb )
{
    cb->i_range -= 2;
    x264_cabac_encode_renorm( cb );
}

void x264_cabac_encode_flush( x264_t *h, x264_cabac_t *cb )
{
    cb->i_low += cb->i_range - 2;
    cb->i_low |= 1;
    cb->i_low <<= 9;
    cb->i_queue += 9;
    x264_cabac_putbyte( cb );
    x264_cabac_putbyte( cb );
    cb->i_low <<= -cb->i_queue;
    cb->i_low |= (0x35a4e4f5 >> (h->i_frame & 31) & 1) << 10;
    cb->i_queue = 0;
    x264_cabac_putbyte( cb );

    while( cb->i_bytes_outstanding > 0 )
    {
        *(cb->p++) = 0xff;
        cb->i_bytes_outstanding--;
    }
}

x264-snapshot-20120103-2245-stable/common/bitstream.h
/*****************************************************************************
 * bitstream.h: bitstream writing
 *****************************************************************************
 * Copyright (C) 2003-2011 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *          Jason Garrett-Glaser <darkshikari@gmail.com>
 *          Laurent Aimar <fenrir@via.ecp.fr>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#ifndef X264_BS_H
#define X264_BS_H

typedef struct
{
    uint8_t i_bits;
    uint8_t i_size;
} vlc_t;

typedef struct
{
    uint16_t i_bits;
    uint8_t  i_size;
    /* Next level table to use */
    uint8_t  i_next;
} vlc_large_t;

typedef struct bs_s
{
    uint8_t *p_start;
    uint8_t *p;
    uint8_t *p_end;

    uintptr_t cur_bits;
    int     i_left;    /* number of available bits in cur_bits */
    int     i_bits_encoded; /* RD only */
} bs_t;

typedef struct
{
    int     last;
    dctcoef level[16];
    uint8_t run[16];
} x264_run_level_t;

extern const vlc_t x264_coeff0_token[6];
extern const vlc_t x264_coeff_token[6][16][4];
extern const vlc_t x264_total_zeros[15][16];
extern const vlc_t x264_total_zeros_2x2_dc[3][4];
extern const vlc_t x264_total_zeros_2x4_dc[7][8];
extern const vlc_t x264_run_before[7][16];

typedef struct
{
    uint8_t *(*nal_escape) ( uint8_t *dst, uint8_t *src, uint8_t *end );
} x264_bitstream_function_t;

void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf );

/* A larger level table size theoretically could help a bit at extremely
 * high bitrates, but the cost in cache is usually too high for it to be
 * useful.
 * This size appears to be optimal for QP18 encoding on a Nehalem CPU.
 * FIXME: Do further testing? */
#define LEVEL_TABLE_SIZE 128
extern vlc_large_t x264_level_token[7][LEVEL_TABLE_SIZE];

static inline void bs_init( bs_t *s, void *p_data, int i_data )
{
    int offset = ((intptr_t)p_data & 3);
    s->p       = s->p_start = (uint8_t*)p_data - offset;
    s->p_end   = (uint8_t*)p_data + i_data;
    s->i_left  = (WORD_SIZE - offset)*8;
    s->cur_bits = endian_fix32( M32(s->p) );
    s->cur_bits >>= (4-offset)*8;
}
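
/* bs_init rounds p_data down to 32-bit alignment and reloads any bytes
 * already present at the aligned base into cur_bits, so the next flush
 * rewrites them unchanged: e.g. an offset of 2 leaves (WORD_SIZE-2)*8 bits
 * free and keeps the two existing bytes in the low bits of cur_bits. */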
static inline int bs_pos( bs_t *s )
{
    return( 8 * (s->p - s->p_start) + (WORD_SIZE*8) - s->i_left );
}

/* Write the rest of cur_bits to the bitstream; results in a bitstream no longer 32-bit aligned. */
static inline void bs_flush( bs_t *s )
{
    M32( s->p ) = endian_fix32( s->cur_bits << (s->i_left&31) );
    s->p += WORD_SIZE - (s->i_left >> 3);
    s->i_left = WORD_SIZE*8;
}
/* The inverse of bs_flush: prepare the bitstream to be written to again. */
static inline void bs_realign( bs_t *s )
{
    int offset = ((intptr_t)s->p & 3);
    if( offset )
    {
        s->p       = (uint8_t*)s->p - offset;
        s->i_left  = (WORD_SIZE - offset)*8;
        s->cur_bits = endian_fix32( M32(s->p) );
        s->cur_bits >>= (4-offset)*8;
    }
}

static inline void bs_write( bs_t *s, int i_count, uint32_t i_bits )
{
    if( WORD_SIZE == 8 )
    {
        s->cur_bits = (s->cur_bits << i_count) | i_bits;
        s->i_left -= i_count;
        if( s->i_left <= 32 )
        {
#if WORDS_BIGENDIAN
            M32( s->p ) = s->cur_bits >> (32 - s->i_left);
#else
            M32( s->p ) = endian_fix( s->cur_bits << s->i_left );
#endif
            s->i_left += 32;
            s->p += 4;
        }
    }
    else
    {
        if( i_count < s->i_left )
        {
            s->cur_bits = (s->cur_bits << i_count) | i_bits;
            s->i_left -= i_count;
        }
        else
        {
            i_count -= s->i_left;
            s->cur_bits = (s->cur_bits << s->i_left) | (i_bits >> i_count);
            M32( s->p ) = endian_fix( s->cur_bits );
            s->p += 4;
            s->cur_bits = i_bits;
            s->i_left = 32 - i_count;
        }
    }
}
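
/* Example for the 32-bit path: writing 10 bits with i_left == 6 puts the
 * top 6 bits into cur_bits, flushes a full 32-bit word, then restarts with
 * the low 4 bits (cur_bits = i_bits, i_left = 32 - 4 = 28). */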

/* Special case to eliminate branch in normal bs_write. */
/* Exp-Golomb codes are always an odd number of bits, so a full 32-bit
 * write only occurs in slice headers. */
static inline void bs_write32( bs_t *s, uint32_t i_bits )
{
    bs_write( s, 16, i_bits >> 16 );
    bs_write( s, 16, i_bits );
}

static inline void bs_write1( bs_t *s, uint32_t i_bit )
{
    s->cur_bits <<= 1;
    s->cur_bits |= i_bit;
    s->i_left--;
    if( s->i_left == WORD_SIZE*8-32 )
    {
        M32( s->p ) = endian_fix32( s->cur_bits );
        s->p += 4;
        s->i_left = WORD_SIZE*8;
    }
}

static inline void bs_align_0( bs_t *s )
{
    bs_write( s, s->i_left&7, 0 );
    bs_flush( s );
}
static inline void bs_align_1( bs_t *s )
{
    bs_write( s, s->i_left&7, (1 << (s->i_left&7)) - 1 );
    bs_flush( s );
}
static inline void bs_align_10( bs_t *s )
{
    if( s->i_left&7 )
        bs_write( s, s->i_left&7, 1 << ( (s->i_left&7) - 1 ) );
}

/* golomb functions */

static const uint8_t x264_ue_size_tab[256] =
{
     1, 1, 3, 3, 5, 5, 5, 5, 7, 7, 7, 7, 7, 7, 7, 7,
     9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
    11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,
    11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,
    13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
    13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
    13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
    13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
    15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
    15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
    15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
    15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
    15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
    15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
    15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
    15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
};
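
/* x264_ue_size_tab[v] = 2*floor(log2(v)) + 1, the length in bits of the
 * Exp-Golomb code with v = val+1; e.g. val = 4 -> v = 5 -> 5 bits (00101). */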

static inline void bs_write_ue_big( bs_t *s, unsigned int val )
{
    int size = 0;
    int tmp = ++val;
    if( tmp >= 0x10000 )
    {
        size = 32;
        tmp >>= 16;
    }
    if( tmp >= 0x100 )
    {
        size += 16;
        tmp >>= 8;
    }
    size += x264_ue_size_tab[tmp];
    bs_write( s, size>>1, 0 );
    bs_write( s, (size>>1)+1, val );
}
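
#if 0
/* Usage sketch (illustrative, not compiled): writing ue(70).
 * val+1 = 71 = 0b1000111, so size = x264_ue_size_tab[71] = 13:
 * six zero bits, then the seven bits of 71, i.e. 0000001000111. */
bs_write_ue_big( s, 70 );
#endif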

/* Only works on values under 255. */
static inline void bs_write_ue( bs_t *s, int val )
{
    bs_write( s, x264_ue_size_tab[val+1], val+1 );
}

static inline void bs_write_se( bs_t *s, int val )
{
    int size = 0;
    /* Faster than (val <= 0 ? -val*2+1 : val*2) */
    /* 4 instructions on x86, 3 on ARM */
    int tmp = 1 - val*2;
    if( tmp < 0 ) tmp = val*2;
    val = tmp;

    if( tmp >= 0x100 )
    {
        size = 16;
        tmp >>= 8;
    }
    size += x264_ue_size_tab[tmp];
    bs_write( s, size, val );
}
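
/* The mapping above yields codeNum+1 directly: val > 0 -> 2*val,
 * val <= 0 -> 1 - 2*val; e.g. se(-3) -> 7 -> 5 bits (00111). */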

static inline void bs_write_te( bs_t *s, int x, int val )
{
    if( x == 1 )
        bs_write1( s, 1^val );
    else //if( x > 1 )
        bs_write_ue( s, val );
}

static inline void bs_rbsp_trailing( bs_t *s )
{
    bs_write1( s, 1 );
    bs_write( s, s->i_left&7, 0  );
}

static ALWAYS_INLINE int bs_size_ue( unsigned int val )
{
    return x264_ue_size_tab[val+1];
}

static ALWAYS_INLINE int bs_size_ue_big( unsigned int val )
{
    if( val < 255 )
        return x264_ue_size_tab[val+1];
    else
        return x264_ue_size_tab[(val+1)>>8] + 16;
}

static ALWAYS_INLINE int bs_size_se( int val )
{
    int tmp = 1 - val*2;
    if( tmp < 0 ) tmp = val*2;
    if( tmp < 256 )
        return x264_ue_size_tab[tmp];
    else
        return x264_ue_size_tab[tmp>>8]+16;
}

static ALWAYS_INLINE int bs_size_te( int x, int val )
{
    if( x == 1 )
        return 1;
    else //if( x > 1 )
        return x264_ue_size_tab[val+1];
}

#endif
x264-snapshot-20120103-2245-stable/common/bitstream.c
/*****************************************************************************
 * bitstream.c: bitstream writing
 *****************************************************************************
 * Copyright (C) 2003-2011 x264 project
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Jason Garrett-Glaser <darkshikari@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "common.h"

static uint8_t *x264_nal_escape_c( uint8_t *dst, uint8_t *src, uint8_t *end )
{
    if( src < end ) *dst++ = *src++;
    if( src < end ) *dst++ = *src++;
    while( src < end )
    {
        if( src[0] <= 0x03 && !dst[-2] && !dst[-1] )
            *dst++ = 0x03;
        *dst++ = *src++;
    }
    return dst;
}
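
/* Emulation prevention: a 0x03 byte is inserted into any 00 00 xx sequence
 * with xx <= 03, e.g. 00 00 01 -> 00 00 03 01.  The first two bytes are
 * copied unconditionally so that dst[-2] and dst[-1] are always valid
 * inside the loop. */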

#if HAVE_MMX
uint8_t *x264_nal_escape_mmx2( uint8_t *dst, uint8_t *src, uint8_t *end );
uint8_t *x264_nal_escape_sse2( uint8_t *dst, uint8_t *src, uint8_t *end );
uint8_t *x264_nal_escape_avx( uint8_t *dst, uint8_t *src, uint8_t *end );
#endif

/****************************************************************************
 * x264_nal_encode:
 ****************************************************************************/
void x264_nal_encode( x264_t *h, uint8_t *dst, x264_nal_t *nal )
{
    uint8_t *src = nal->p_payload;
    uint8_t *end = nal->p_payload + nal->i_payload;
    uint8_t *orig_dst = dst;

    if( h->param.b_annexb )
    {
        if( nal->b_long_startcode )
            *dst++ = 0x00;
        *dst++ = 0x00;
        *dst++ = 0x00;
        *dst++ = 0x01;
    }
    else /* save room for size later */
        dst += 4;

    /* nal header */
    *dst++ = ( 0x00 << 7 ) | ( nal->i_ref_idc << 5 ) | nal->i_type;

    dst = h->bsf.nal_escape( dst, src, end );
    int size = (dst - orig_dst) - 4;

    /* Write the size header for mp4/etc */
    if( !h->param.b_annexb )
    {
        /* Size doesn't include the size of the header we're writing now. */
        orig_dst[0] = size>>24;
        orig_dst[1] = size>>16;
        orig_dst[2] = size>> 8;
        orig_dst[3] = size>> 0;
    }

    nal->i_payload = size+4;
    nal->p_payload = orig_dst;
    x264_emms();
}
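
/* In Annex B mode the NAL is emitted as 00 00 01 (or 00 00 00 01 for a long
 * start code) <header> <escaped payload>; otherwise the first 4 bytes hold
 * the big-endian payload size, excluding the length field itself. */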

void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf )
{
    pf->nal_escape = x264_nal_escape_c;
#if HAVE_MMX
    if( cpu&X264_CPU_MMX2 )
        pf->nal_escape = x264_nal_escape_mmx2;
    if( (cpu&X264_CPU_SSE2) && (cpu&X264_CPU_SSE2_IS_FAST) )
        pf->nal_escape = x264_nal_escape_sse2;
    if( cpu&X264_CPU_AVX )
        pf->nal_escape = x264_nal_escape_avx;
#endif
}
x264-snapshot-20120103-2245-stable/common/arm/quant.h
/*****************************************************************************
 * quant.h: arm quantization and level-run
 *****************************************************************************
 * Copyright (C) 2005-2011 x264 project
 *
 * Authors: David Conrad <lessen42@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#ifndef X264_ARM_QUANT_H
#define X264_ARM_QUANT_H

int x264_quant_2x2_dc_armv6( int16_t dct[4], int mf, int bias );

int x264_quant_2x2_dc_neon( int16_t dct[4], int mf, int bias );
int x264_quant_4x4_dc_neon( int16_t dct[16], int mf, int bias );
int x264_quant_4x4_neon( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] );
int x264_quant_8x8_neon( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] );

void x264_dequant_4x4_dc_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_4x4_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_8x8_neon( int16_t dct[64], int dequant_mf[6][64], int i_qp );

int x264_coeff_last4_arm( int16_t * );
int x264_coeff_last15_neon( int16_t * );
int x264_coeff_last16_neon( int16_t * );
int x264_coeff_last64_neon( int16_t * );

#endif
x264-snapshot-20120103-2245-stable/common/arm/quant-a.S
/*****************************************************************************
 * quant.S: arm quantization and level-run
 *****************************************************************************
 * Copyright (C) 2009-2011 x264 project
 *
 * Authors: David Conrad <lessen42@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "asm.S"

.fpu neon

.section .rodata
.align 4
pmovmskb_byte:
.byte 1,2,4,8,16,32,64,128
.byte 1,2,4,8,16,32,64,128

.text

.macro QUANT_TWO bias0 bias1 mf0 mf1 mf2 mf3 load_mf=no
    vadd.u16    q8,  q8,  \bias0
    vadd.u16    q9,  q9,  \bias1
.ifc \load_mf, yes
    vld1.64     {\mf0-\mf3}, [r1,:128]!
.endif
    vmull.u16   q10, d16, \mf0
    vmull.u16   q11, d17, \mf1
    vmull.u16   q12, d18, \mf2
    vmull.u16   q13, d19, \mf3
    vshr.s16    q14, q14, #15
    vshr.s16    q15, q15, #15
    vshrn.u32   d16, q10, #16
    vshrn.u32   d17, q11, #16
    vshrn.u32   d18, q12, #16
    vshrn.u32   d19, q13, #16
    veor        q8,  q8,  q14
    veor        q9,  q9,  q15
    vsub.s16    q8,  q8,  q14
    vsub.s16    q9,  q9,  q15
    vorr        \bias0, q8,  q9
    vst1.64     {d16-d19}, [r0,:128]!
.endm
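
// QUANT_TWO computes dct[i] = sign(dct[i]) * (((|dct[i]| + bias[i]) * mf[i]) >> 16):
// q8/q9 enter holding |coef|, the vmull/vshrn pairs perform the 16x16->32
// multiply and the >>16, and q14/q15 (sign masks derived from the original
// coefficients) restore the sign via the (x ^ s) - s identity.  \bias0
// leaves holding a nonzero flag for QUANT_END.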

.macro QUANT_END d
    vmov        r2,  r3,  \d
    orrs        r0,  r2,  r3
    movne       r0,  #1
    bx          lr
.endm

// quant_2x2_dc( int16_t dct[4], int mf, int bias )
function x264_quant_2x2_dc_neon
    vld1.64     {d0}, [r0,:64]
    vabs.s16    d3,  d0
    vdup.16     d2,  r2
    vdup.16     d1,  r1
    vadd.u16    d3,  d3,  d2
    vmull.u16   q3,  d3,  d1
    vshr.s16    d0,  d0,  #15
    vshrn.u32   d3,  q3,  #16
    veor        d3,  d3,  d0
    vsub.s16    d3,  d3,  d0
    vst1.64     {d3}, [r0,:64]
    QUANT_END   d3
.endfunc

// quant_4x4_dc( int16_t dct[16], int mf, int bias )
function x264_quant_4x4_dc_neon
    vld1.64     {d28-d31}, [r0,:128]
    vabs.s16    q8,  q14
    vabs.s16    q9,  q15
    vdup.16     q0,  r2
    vdup.16     q2,  r1
    QUANT_TWO   q0,  q0,  d4,  d5,  d4,  d5
    vorr        d0,  d0,  d1
    QUANT_END   d0
.endfunc

// quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
function x264_quant_4x4_neon
    vld1.64     {d28-d31}, [r0,:128]
    vabs.s16    q8,  q14
    vabs.s16    q9,  q15
    vld1.64     {d0-d3}, [r2,:128]
    vld1.64     {d4-d7}, [r1,:128]
    QUANT_TWO   q0,  q1,  d4,  d5,  d6,  d7
    vorr        d0,  d0,  d1
    QUANT_END   d0
.endfunc

// quant_8x8( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
function x264_quant_8x8_neon
    vld1.64     {d28-d31}, [r0,:128]
    vabs.s16    q8,  q14
    vabs.s16    q9,  q15
    vld1.64     {d0-d3},   [r2,:128]!
    vld1.64     {d4-d7},   [r1,:128]!
    QUANT_TWO   q0,  q1,  d4,  d5,  d6,  d7
.rept 3
    vld1.64     {d28-d31}, [r0,:128]
    vabs.s16    q8,  q14
    vabs.s16    q9,  q15
    vld1.64     {d2-d5},   [r2,:128]!
    QUANT_TWO   q1,  q2,  d4,  d5,  d6,  d7, yes
    vorr        q0,  q0,  q1
.endr
    vorr        d0,  d0,  d1
    QUANT_END   d0
.endfunc

.macro DEQUANT_START mf_size offset dc=no
    mov         r3,  #0x2b
    mul         r3,  r3,  r2
    lsr         r3,  r3,  #8            // i_qbits = i_qp / 6
    add         ip,  r3,  r3,  lsl #1
    sub         r2,  r2,  ip,  lsl #1   // i_mf = i_qp % 6
.ifc \dc,no
    add         r1,  r1,  r2, lsl #\mf_size  // dequant_mf[i_mf]
.else
    ldr         r1, [r1,  r2, lsl #\mf_size] // dequant_mf[i_mf][0][0]
.endif
    subs        r3,  r3,  #\offset      // 6 for 8x8
.endm
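
// e.g. i_qp = 37: 37*0x2b = 1591, 1591>>8 = 6 = i_qp/6, and
// 37 - 6*6 = 1 = i_qp%6 -- the multiply/shift avoids an integer divide.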

// dequant_4x4( int16_t dct[16], int dequant_mf[6][16], int i_qp )
.macro DEQUANT size bits
function x264_dequant_\size\()_neon
    DEQUANT_START \bits+2, \bits
.ifc \size, 8x8
    mov         r2,  #4
.endif
    blt         dequant_\size\()_rshift

    vdup.16     q15, r3
dequant_\size\()_lshift_loop:
.ifc \size, 8x8
    subs        r2,  r2,  #1
.endif
    vld1.32     {d16-d17}, [r1,:128]!
    vld1.32     {d18-d19}, [r1,:128]!
    vmovn.s32   d4,  q8
    vld1.32     {d20-d21}, [r1,:128]!
    vmovn.s32   d5,  q9
    vld1.32     {d22-d23}, [r1,:128]!
    vmovn.s32   d6,  q10
    vld1.16     {d0-d3},   [r0,:128]
    vmovn.s32   d7,  q11
    vmul.s16    q0,  q0,  q2
    vmul.s16    q1,  q1,  q3
    vshl.s16    q0,  q0,  q15
    vshl.s16    q1,  q1,  q15
    vst1.16     {d0-d3},   [r0,:128]!
.ifc \size, 8x8
    bgt         dequant_\size\()_lshift_loop
.endif
    bx          lr

dequant_\size\()_rshift:
    vdup.32     q15, r3
    rsb         r3,  r3,  #0
    mov         ip,  #1
    sub         r3,  r3,  #1
    lsl         ip,  ip,  r3

.ifc \size, 8x8
dequant_\size\()_rshift_loop:
    subs        r2,  r2,  #1
.endif
    vdup.32     q10, ip
    vld1.32     {d16-d17}, [r1,:128]!
    vdup.32     q11, ip
    vld1.32     {d18-d19}, [r1,:128]!
    vmovn.s32   d4,  q8
    vld1.32     {d16-d17}, [r1,:128]!
    vmovn.s32   d5,  q9
    vld1.32     {d18-d19}, [r1,:128]!
    vmovn.s32   d6,  q8
    vld1.16     {d0-d3},   [r0,:128]
    vmovn.s32   d7,  q9
    vdup.32     q12, ip
    vdup.32     q13, ip

    vmlal.s16   q10, d0,  d4
    vmlal.s16   q11, d1,  d5
    vmlal.s16   q12, d2,  d6
    vmlal.s16   q13, d3,  d7
    vshl.s32    q10, q10, q15
    vshl.s32    q11, q11, q15
    vshl.s32    q12, q12, q15
    vshl.s32    q13, q13, q15

    vmovn.s32   d0,  q10
    vmovn.s32   d1,  q11
    vmovn.s32   d2,  q12
    vmovn.s32   d3,  q13
    vst1.16     {d0-d3},   [r0,:128]!
.ifc \size, 8x8
    bgt         dequant_\size\()_rshift_loop
.endif
    bx          lr
.endfunc
.endm

DEQUANT 4x4, 4
DEQUANT 8x8, 6

// dequant_4x4_dc( int16_t dct[16], int dequant_mf[6][16], int i_qp )
function x264_dequant_4x4_dc_neon
    DEQUANT_START 6, 6, yes
    blt         dequant_4x4_dc_rshift

    lsl         r1,  r1,  r3
    vdup.16     q2,  r1
    vld1.16     {d0-d3},   [r0,:128]
    vdup.16     q15, r3

    vmul.s16    q0,  q0,  q2
    vmul.s16    q1,  q1,  q2
    vst1.16     {d0-d3},   [r0,:128]
    bx          lr

dequant_4x4_dc_rshift:
    vdup.16     d4,  r1
    vdup.32     q15, r3
    rsb         r3,  r3,  #0
    mov         ip,  #1
    sub         r3,  r3,  #1
    lsl         ip,  ip,  r3

    vdup.32     q10, ip
    vdup.32     q11, ip
    vld1.16     {d0-d3},   [r0,:128]
    vdup.32     q12, ip
    vdup.32     q13, ip

    vmlal.s16   q10, d0,  d4
    vmlal.s16   q11, d1,  d4
    vmlal.s16   q12, d2,  d4
    vmlal.s16   q13, d3,  d4
    vshl.s32    q10, q10, q15
    vshl.s32    q11, q11, q15
    vshl.s32    q12, q12, q15
    vshl.s32    q13, q13, q15

    vmovn.s32   d0,  q10
    vmovn.s32   d1,  q11
    vmovn.s32   d2,  q12
    vmovn.s32   d3,  q13
    vst1.16     {d0-d3},   [r0,:128]
    bx          lr
.endfunc


// int coeff_last( int16_t *l )
function x264_coeff_last4_arm
    ldrd        r2,  [r0]
    subs        r0,  r3,  #0
    movne       r0,  #2
    movne       r2,  r3
    lsrs        r2,  r2,  #16
    addne       r0,  r0,  #1
    bx          lr
.endfunc

.macro COEFF_LAST_1x size
function x264_coeff_last\size\()_neon
.if \size == 15
    sub         r0,  r0,  #2
    vld1.64     {d0-d3}, [r0]
.else
    vld1.64     {d0-d3}, [r0,:128]
.endif
    vtst.16     q0,  q0
    vtst.16     q1,  q1
    vshrn.u16   d0,  q0,  #8
    vshrn.u16   d1,  q1,  #8
    vshrn.u16   d0,  q0,  #4
    vclz.i32    d0,  d0
    mov         ip,  #7
    mov         r3,  #\size - 9
    vmov        r0,  r1,  d0

    subs        r1,  ip,  r1,  lsr #2
    addge       r0,  r1,  #\size - 8
    sublts      r0,  r3,  r0,  lsr #2
    movlt       r0,  #0
    bx          lr
.endfunc
.endm

COEFF_LAST_1x 15
COEFF_LAST_1x 16

function x264_coeff_last64_neon
    vld1.64     {d16-d19}, [r0,:128]!
    vqmovn.u16  d16, q8
    vqmovn.u16  d17, q9
    vld1.64     {d20-d23}, [r0,:128]!
    vqmovn.u16  d18, q10
    vqmovn.u16  d19, q11
    vld1.64     {d24-d27}, [r0,:128]!
    vqmovn.u16  d20, q12
    vqmovn.u16  d21, q13
    vld1.64     {d28-d31}, [r0,:128]!
    vqmovn.u16  d22, q14
    vqmovn.u16  d23, q15

    movrel      r1, pmovmskb_byte
    vld1.64     {d0-d1}, [r1,:128]

    vtst.8      q8,  q8
    vtst.8      q9,  q9
    vtst.8      q10, q10
    vtst.8      q11, q11

    vand        q8,  q8,  q0
    vand        q9,  q9,  q0
    vand        q10, q10, q0
    vand        q11, q11, q0

    vpadd.u8    d0,  d16, d17
    vpadd.u8    d1,  d18, d19
    vpadd.u8    d2,  d20, d21
    vpadd.u8    d3,  d22, d23
    vpadd.u8    d0,  d0,  d1
    vpadd.u8    d1,  d2,  d3
    vpadd.u8    d0,  d0,  d1
    vclz.i32    d0,  d0
    mov         ip,  #31
    vmov        r0,  r1,  d0

    subs        r1,  ip,  r1
    addge       r0,  r1,  #32
    sublts      r0,  ip,  r0
    movlt       r0,  #0
    bx          lr
.endfunc
x264-snapshot-20120103-2245-stable/common/arm/predict.h
/*****************************************************************************
 * predict.h: arm intra prediction
 *****************************************************************************
 * Copyright (C) 2009-2011 x264 project
 *
 * Authors: David Conrad <lessen42@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#ifndef X264_ARM_PREDICT_H
#define X264_ARM_PREDICT_H

void x264_predict_4x4_init_arm( int cpu, x264_predict_t pf[12] );
void x264_predict_8x8_init_arm( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter );
void x264_predict_8x8c_init_arm( int cpu, x264_predict_t pf[7] );
void x264_predict_16x16_init_arm( int cpu, x264_predict_t pf[7] );

#endif
x264-snapshot-20120103-2245-stable/common/arm/predict-c.c
/*****************************************************************************
 * predict.c: arm intra prediction
 *****************************************************************************
 * Copyright (C) 2009-2011 x264 project
 *
 * Authors: David Conrad <lessen42@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "common/common.h"
#include "predict.h"
#include "pixel.h"

void x264_predict_4x4_dc_armv6( uint8_t *src );
void x264_predict_4x4_h_armv6( uint8_t *src );
void x264_predict_4x4_ddr_armv6( uint8_t *src );
void x264_predict_4x4_ddl_neon( uint8_t *src );

void x264_predict_8x8c_dc_neon( uint8_t *src );
void x264_predict_8x8c_dc_top_neon( uint8_t *src );
void x264_predict_8x8c_dc_left_neon( uint8_t *src );
void x264_predict_8x8c_h_neon( uint8_t *src );
void x264_predict_8x8c_v_neon( uint8_t *src );
void x264_predict_8x8c_p_neon( uint8_t *src );

void x264_predict_8x8_dc_neon( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_h_neon( uint8_t *src, uint8_t edge[36] );

void x264_predict_16x16_dc_neon( uint8_t *src );
void x264_predict_16x16_dc_top_neon( uint8_t *src );
void x264_predict_16x16_dc_left_neon( uint8_t *src );
void x264_predict_16x16_h_neon( uint8_t *src );
void x264_predict_16x16_v_neon( uint8_t *src );
void x264_predict_16x16_p_neon( uint8_t *src );

void x264_predict_4x4_init_arm( int cpu, x264_predict_t pf[12] )
{
    if (!(cpu&X264_CPU_ARMV6))
        return;

#if !HIGH_BIT_DEPTH
    pf[I_PRED_4x4_H]   = x264_predict_4x4_h_armv6;
    pf[I_PRED_4x4_DC]  = x264_predict_4x4_dc_armv6;
    pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_armv6;

    if (!(cpu&X264_CPU_NEON))
        return;

    pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_neon;
#endif // !HIGH_BIT_DEPTH
}

void x264_predict_8x8c_init_arm( int cpu, x264_predict_t pf[7] )
{
    if (!(cpu&X264_CPU_NEON))
        return;

#if !HIGH_BIT_DEPTH
    pf[I_PRED_CHROMA_DC]      = x264_predict_8x8c_dc_neon;
    pf[I_PRED_CHROMA_DC_TOP]  = x264_predict_8x8c_dc_top_neon;
    pf[I_PRED_CHROMA_DC_LEFT] = x264_predict_8x8c_dc_left_neon;
    pf[I_PRED_CHROMA_H] = x264_predict_8x8c_h_neon;
    pf[I_PRED_CHROMA_V] = x264_predict_8x8c_v_neon;
    pf[I_PRED_CHROMA_P] = x264_predict_8x8c_p_neon;
#endif // !HIGH_BIT_DEPTH
}

void x264_predict_8x8_init_arm( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter )
{
    if (!(cpu&X264_CPU_NEON))
        return;

#if !HIGH_BIT_DEPTH
    pf[I_PRED_8x8_DC]  = x264_predict_8x8_dc_neon;
    pf[I_PRED_8x8_H]   = x264_predict_8x8_h_neon;
#endif // !HIGH_BIT_DEPTH
}

void x264_predict_16x16_init_arm( int cpu, x264_predict_t pf[7] )
{
    if (!(cpu&X264_CPU_NEON))
        return;

#if !HIGH_BIT_DEPTH
    pf[I_PRED_16x16_DC ]    = x264_predict_16x16_dc_neon;
    pf[I_PRED_16x16_DC_TOP] = x264_predict_16x16_dc_top_neon;
    pf[I_PRED_16x16_DC_LEFT]= x264_predict_16x16_dc_left_neon;
    pf[I_PRED_16x16_H ]     = x264_predict_16x16_h_neon;
    pf[I_PRED_16x16_V ]     = x264_predict_16x16_v_neon;
    pf[I_PRED_16x16_P ]     = x264_predict_16x16_p_neon;
#endif // !HIGH_BIT_DEPTH
}
x264-snapshot-20120103-2245-stable/common/arm/predict-a.S
/*****************************************************************************
 * predict.S: arm intra prediction
 *****************************************************************************
 * Copyright (C) 2009-2011 x264 project
 *
 * Authors: David Conrad <lessen42@gmail.com>
 *          Mans Rullgard <mans@mansr.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "asm.S"

.fpu neon

.section .rodata
.align 4

p16weight: .short 1,2,3,4,5,6,7,8

.text

.macro ldcol.8  rd,  rs,  rt,  n=8,  hi=0
.if \n == 8 || \hi == 0
    vld1.8          {\rd[0]}, [\rs], \rt
    vld1.8          {\rd[1]}, [\rs], \rt
    vld1.8          {\rd[2]}, [\rs], \rt
    vld1.8          {\rd[3]}, [\rs], \rt
.endif
.if \n == 8 || \hi == 1
    vld1.8          {\rd[4]}, [\rs], \rt
    vld1.8          {\rd[5]}, [\rs], \rt
    vld1.8          {\rd[6]}, [\rs], \rt
    vld1.8          {\rd[7]}, [\rs], \rt
.endif
.endm

.macro add16x8  dq,  dl,  dh,  rl,  rh
    vaddl.u8        \dq, \rl, \rh
    vadd.u16        \dl, \dl, \dh
    vpadd.u16       \dl, \dl, \dl
    vpadd.u16       \dl, \dl, \dl
.endm
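
// add16x8 sums the 16 bytes in \rl/\rh; after the two pairwise adds the
// 16-bit total is replicated across every lane of \dl.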


// because gcc doesn't believe in using the free shift in add
function x264_predict_4x4_h_armv6
    ldrb    r1, [r0, #0*FDEC_STRIDE-1]
    ldrb    r2, [r0, #1*FDEC_STRIDE-1]
    ldrb    r3, [r0, #2*FDEC_STRIDE-1]
    ldrb    ip, [r0, #3*FDEC_STRIDE-1]
    add     r1, r1, r1, lsl #8
    add     r2, r2, r2, lsl #8
    add     r3, r3, r3, lsl #8
    add     ip, ip, ip, lsl #8
    add     r1, r1, r1, lsl #16
    str     r1, [r0, #0*FDEC_STRIDE]
    add     r2, r2, r2, lsl #16
    str     r2, [r0, #1*FDEC_STRIDE]
    add     r3, r3, r3, lsl #16
    str     r3, [r0, #2*FDEC_STRIDE]
    add     ip, ip, ip, lsl #16
    str     ip, [r0, #3*FDEC_STRIDE]
    bx      lr
.endfunc

function x264_predict_4x4_dc_armv6
    mov     ip, #0
    ldr     r1, [r0, #-FDEC_STRIDE]
    ldrb    r2, [r0, #0*FDEC_STRIDE-1]
    ldrb    r3, [r0, #1*FDEC_STRIDE-1]
    usad8   r1, r1, ip
    add     r2, r2, #4
    ldrb    ip, [r0, #2*FDEC_STRIDE-1]
    add     r2, r2, r3
    ldrb    r3, [r0, #3*FDEC_STRIDE-1]
    add     r2, r2, ip
    add     r2, r2, r3
    add     r1, r1, r2
    lsr     r1, r1, #3
    add     r1, r1, r1, lsl #8
    add     r1, r1, r1, lsl #16
    str     r1, [r0, #0*FDEC_STRIDE]
    str     r1, [r0, #1*FDEC_STRIDE]
    str     r1, [r0, #2*FDEC_STRIDE]
    str     r1, [r0, #3*FDEC_STRIDE]
    bx      lr
.endfunc

// return a1 = (a1+2*b1+c1+2)>>2  a2 = (a2+2*b2+c2+2)>>2
.macro PRED4x4_LOWPASS a1 b1 c1 a2 b2 c2 pb_1
    uhadd8  \a1, \a1, \c1
    uhadd8  \a2, \a2, \c2
    uhadd8  \c1, \a1, \b1
    uhadd8  \c2, \a2, \b2
    eor     \a1, \a1, \b1
    eor     \a2, \a2, \b2
    and     \a1, \a1, \pb_1
    and     \a2, \a2, \pb_1
    uadd8   \a1, \a1, \c1
    uadd8   \a2, \a2, \c2
.endm
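
// The halving adds drop one rounding bit: the result is the rounded average
// of ((a+c)>>1) and b, with the eor/and pair recovering the lost LSB, which
// works out to exactly (a + 2*b + c + 2) >> 2 in every byte lane.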

function x264_predict_4x4_ddr_armv6
    ldr     r1, [r0, # -FDEC_STRIDE]
    ldrb    r2, [r0, # -FDEC_STRIDE-1]
    ldrb    r3, [r0, #0*FDEC_STRIDE-1]
    push    {r4-r6,lr}
    add     r2, r2, r1, lsl #8
    ldrb    r4, [r0, #1*FDEC_STRIDE-1]
    add     r3, r3, r2, lsl #8
    ldrb    r5, [r0, #2*FDEC_STRIDE-1]
    ldrb    r6, [r0, #3*FDEC_STRIDE-1]
    add     r4, r4, r3, lsl #8
    add     r5, r5, r4, lsl #8
    add     r6, r6, r5, lsl #8
    ldr     ip, =0x01010101
    PRED4x4_LOWPASS r1, r2, r3, r4, r5, r6, ip
    str     r1, [r0, #0*FDEC_STRIDE]
    lsl     r2, r1, #8
    lsl     r3, r1, #16
    lsl     r4, r4, #8
    lsl     r5, r1, #24
    add     r2, r2, r4, lsr #24
    str     r2, [r0, #1*FDEC_STRIDE]
    add     r3, r3, r4, lsr #16
    str     r3, [r0, #2*FDEC_STRIDE]
    add     r5, r5, r4, lsr #8
    str     r5, [r0, #3*FDEC_STRIDE]
    pop     {r4-r6,pc}
.endfunc

function x264_predict_4x4_ddl_neon
    sub         r0, #FDEC_STRIDE
    mov         ip, #FDEC_STRIDE
    vld1.64     {d0}, [r0], ip
    vdup.8      d3, d0[7]
    vext.8      d1, d0, d0, #1
    vext.8      d2, d0, d3, #2
    vhadd.u8    d0, d0, d2
    vrhadd.u8   d0, d0, d1
    vst1.32     {d0[0]}, [r0,:32], ip
    vext.8      d1, d0, d0, #1
    vext.8      d2, d0, d0, #2
    vst1.32     {d1[0]}, [r0,:32], ip
    vext.8      d3, d0, d0, #3
    vst1.32     {d2[0]}, [r0,:32], ip
    vst1.32     {d3[0]}, [r0,:32], ip
    bx          lr
.endfunc

function x264_predict_8x8_dc_neon
    mov     ip, #0
    ldrd    r2, [r1, #8]
    push    {r4-r5,lr}
    ldrd    r4, [r1, #16]
    lsl     r3, r3, #8
    ldrb    lr, [r1, #7]
    usad8   r2, r2, ip
    usad8   r3, r3, ip
    usada8  r2, r4, ip, r2
    add     lr, lr, #8
    usada8  r3, r5, ip, r3
    add     r2, r2, lr
    mov     ip, #FDEC_STRIDE
    add     r2, r2, r3
    lsr     r2, r2, #4

    vdup.8   d0, r2
.rept 8
    vst1.64 {d0}, [r0,:64], ip
.endr
    pop    {r4-r5,pc}
.endfunc

function x264_predict_8x8_h_neon
    add         r1, r1, #7
    mov         ip, #FDEC_STRIDE
    vld1.64     {d16}, [r1]
    vdup.8      d0, d16[7]
    vdup.8      d1, d16[6]
    vst1.64     {d0}, [r0,:64], ip
    vdup.8      d2, d16[5]
    vst1.64     {d1}, [r0,:64], ip
    vdup.8      d3, d16[4]
    vst1.64     {d2}, [r0,:64], ip
    vdup.8      d4, d16[3]
    vst1.64     {d3}, [r0,:64], ip
    vdup.8      d5, d16[2]
    vst1.64     {d4}, [r0,:64], ip
    vdup.8      d6, d16[1]
    vst1.64     {d5}, [r0,:64], ip
    vdup.8      d7, d16[0]
    vst1.64     {d6}, [r0,:64], ip
    vst1.64     {d7}, [r0,:64], ip
    bx          lr
.endfunc


function x264_predict_8x8c_dc_top_neon
    sub         r2,  r0,  #FDEC_STRIDE
    mov         r1,  #FDEC_STRIDE
    vld1.8      {d0}, [r2,:64]
    vpaddl.u8   d0,  d0
    vpadd.u16   d0,  d0,  d0
    vrshrn.u16  d0,  q0,  #2
    vdup.8      d1,  d0[1]
    vdup.8      d0,  d0[0]
    vtrn.32     d0,  d1
    b           pred8x8_dc_end
    .endfunc

function x264_predict_8x8c_dc_left_neon
    mov         r1,  #FDEC_STRIDE
    sub         r2,  r0,  #1
    ldcol.8     d0,  r2,  r1
    vpaddl.u8   d0,  d0
    vpadd.u16   d0,  d0,  d0
    vrshrn.u16  d0,  q0,  #2
    vdup.8      d1,  d0[1]
    vdup.8      d0,  d0[0]
    b           pred8x8_dc_end
.endfunc

function x264_predict_8x8c_dc_neon
    sub         r2,  r0,  #FDEC_STRIDE
    mov         r1,  #FDEC_STRIDE
    vld1.8      {d0}, [r2,:64]
    sub         r2,  r0,  #1
    ldcol.8     d1,  r2,  r1
    vtrn.32     d0,  d1
    vpaddl.u8   q0,  q0
    vpadd.u16   d0,  d0,  d1
    vpadd.u16   d1,  d0,  d0
    vrshrn.u16  d2,  q0,  #3
    vrshrn.u16  d3,  q0,  #2
    vdup.8      d0,  d2[4]
    vdup.8      d1,  d3[3]
    vdup.8      d4,  d3[2]
    vdup.8      d5,  d2[5]
    vtrn.32     q0,  q2
pred8x8_dc_end:
    add         r2,  r0,  r1,  lsl #2
.rept 4
    vst1.8      {d0}, [r0,:64], r1
    vst1.8      {d1}, [r2,:64], r1
.endr
    bx          lr
.endfunc

function x264_predict_8x8c_h_neon
    sub         r1, r0, #1
    mov         ip, #FDEC_STRIDE
.rept 4
    vld1.8      {d0[]}, [r1], ip
    vld1.8      {d2[]}, [r1], ip
    vst1.64     {d0}, [r0,:64], ip
    vst1.64     {d2}, [r0,:64], ip
.endr
    bx          lr
.endfunc

function x264_predict_8x8c_v_neon
    sub         r0, r0, #FDEC_STRIDE
    mov         ip, #FDEC_STRIDE
    vld1.64     {d0}, [r0,:64], ip
.rept 8
    vst1.64     {d0}, [r0,:64], ip
.endr
    bx          lr
.endfunc

function x264_predict_8x8c_p_neon
    sub         r3,  r0,  #FDEC_STRIDE
    mov         r1,  #FDEC_STRIDE
    add         r2,  r3,  #4
    sub         r3,  r3,  #1
    vld1.32     {d0[0]}, [r3]
    vld1.32     {d2[0]}, [r2,:32], r1
    ldcol.8     d0,  r3,  r1,  4,  hi=1
    add         r3,  r3,  r1
    ldcol.8     d3,  r3,  r1,  4
    vaddl.u8    q8,  d2,  d3
    vrev32.8    d0,  d0
    vtrn.32     d2,  d3
    vsubl.u8    q2,  d2,  d0
    movrel      r3,  p16weight
    vld1.16     {q0}, [r3,:128]
    vmul.s16    d4,  d4,  d0
    vmul.s16    d5,  d5,  d0
    vpadd.i16   d4,  d4,  d5
    vpaddl.s16  d4,  d4
    vshl.i32    d5,  d4,  #4
    vadd.s32    d4,  d4,  d5
    vrshrn.s32  d4,  q2,  #5
    mov         r3,  #0
    vtrn.16     d4,  d5
    vadd.i16    d2,  d4,  d5
    vshl.i16    d3,  d2,  #2
    vrev64.16   d16, d16
    vsub.i16    d3,  d3,  d2
    vadd.i16    d16, d16, d0
    vshl.i16    d2,  d16, #4
    vsub.i16    d2,  d2,  d3
    vshl.i16    d3,  d4,  #3
    vext.16     q0,  q0,  q0,  #7
    vsub.i16    d6,  d5,  d3
    vmov.16     d0[0], r3
    vmul.i16    q0,  q0,  d4[0]
    vdup.16     q1,  d2[0]
    vdup.16     q2,  d4[0]
    vdup.16     q3,  d6[0]
    vshl.i16    q2,  q2,  #3
    vadd.i16    q1,  q1,  q0
    vadd.i16    q3,  q3,  q2
    mov         r3,  #8
1:
    vqshrun.s16 d0,  q1,  #5
    vadd.i16    q1,  q1,  q3
    vst1.8      {d0}, [r0,:64], r1
    subs        r3,  r3,  #1
    bne         1b
    bx          lr
.endfunc


function x264_predict_16x16_dc_top_neon
    sub         r2,  r0,  #FDEC_STRIDE
    mov         r1,  #FDEC_STRIDE
    vld1.8      {q0}, [r2,:128]
    add16x8     q0,  d0,  d1,  d0,  d1
    vrshrn.u16  d0,  q0,  #4
    vdup.8      q0,  d0[0]
    b           pred16x16_dc_end
.endfunc

function x264_predict_16x16_dc_left_neon
    mov         r1,  #FDEC_STRIDE
    sub         r2,  r0,  #1
    ldcol.8     d0,  r2,  r1
    ldcol.8     d1,  r2,  r1
    add16x8     q0,  d0,  d1,  d0,  d1
    vrshrn.u16  d0,  q0,  #4
    vdup.8      q0,  d0[0]
    b           pred16x16_dc_end
.endfunc

function x264_predict_16x16_dc_neon
    sub         r3, r0, #FDEC_STRIDE
    sub         r0, r0, #1
    vld1.64     {d0-d1}, [r3,:128]
    ldrb        ip, [r0], #FDEC_STRIDE
    vaddl.u8    q0, d0, d1
    ldrb        r1, [r0], #FDEC_STRIDE
    vadd.u16    d0, d0, d1
    vpadd.u16   d0, d0, d0
    vpadd.u16   d0, d0, d0
.rept 4
    ldrb        r2, [r0], #FDEC_STRIDE
    add         ip, ip, r1
    ldrb        r3, [r0], #FDEC_STRIDE
    add         ip, ip, r2
    ldrb        r1, [r0], #FDEC_STRIDE
    add         ip, ip, r3
.endr
    ldrb        r2, [r0], #FDEC_STRIDE
    add         ip, ip, r1
    ldrb        r3, [r0], #FDEC_STRIDE
    add         ip, ip, r2

    sub         r0, r0, #FDEC_STRIDE*16
    add         ip, ip, r3
    vdup.16     d1, ip
    vadd.u16    d0, d0, d1
    mov         r1, #FDEC_STRIDE
    add         r0, r0, #1
    vrshr.u16   d0, d0, #5
    vdup.8      q0, d0[0]
pred16x16_dc_end:
.rept 16
    vst1.64     {d0-d1}, [r0,:128], r1
.endr
    bx          lr
.endfunc

function x264_predict_16x16_h_neon
    sub         r1, r0, #1
    mov         ip, #FDEC_STRIDE
.rept 8
    vld1.8      {d0[]}, [r1], ip
    vmov        d1, d0
    vld1.8      {d2[]}, [r1], ip
    vmov        d3, d2
    vst1.64     {d0-d1}, [r0,:128], ip
    vst1.64     {d2-d3}, [r0,:128], ip
.endr
    bx          lr
.endfunc

function x264_predict_16x16_v_neon
    sub         r0, r0, #FDEC_STRIDE
    mov         ip, #FDEC_STRIDE
    vld1.64     {d0-d1}, [r0,:128], ip
.rept 16
    vst1.64     {d0-d1}, [r0,:128], ip
.endr
    bx          lr
.endfunc

function x264_predict_16x16_p_neon
    sub         r3,  r0,  #FDEC_STRIDE
    mov         r1,  #FDEC_STRIDE
    add         r2,  r3,  #8
    sub         r3,  r3,  #1
    vld1.8      {d0}, [r3]
    vld1.8      {d2}, [r2,:64], r1
    ldcol.8     d1,  r3,  r1
    add         r3,  r3,  r1
    ldcol.8     d3,  r3,  r1
    vrev64.8    q0,  q0
    vaddl.u8    q8,  d2,  d3
    vsubl.u8    q2,  d2,  d0
    vsubl.u8    q3,  d3,  d1
    movrel      r3,  p16weight
    vld1.8      {q0}, [r3,:128]
    vmul.s16    q2,  q2,  q0
    vmul.s16    q3,  q3,  q0
    vadd.i16    d4,  d4,  d5
    vadd.i16    d5,  d6,  d7
    vpadd.i16   d4,  d4,  d5
    vpadd.i16   d4,  d4,  d4
    vshll.s16   q3,  d4,  #2
    vaddw.s16   q2,  q3,  d4
    vrshrn.s32  d4,  q2,  #6
    mov         r3,  #0
    vtrn.16     d4,  d5
    vadd.i16    d2,  d4,  d5
    vshl.i16    d3,  d2,  #3
    vrev64.16   d16, d17
    vsub.i16    d3,  d3,  d2
    vadd.i16    d16, d16, d0
    vshl.i16    d2,  d16, #4
    vsub.i16    d2,  d2,  d3
    vshl.i16    d3,  d4,  #4
    vext.16     q0,  q0,  q0,  #7
    vsub.i16    d6,  d5,  d3
    vmov.16     d0[0], r3
    vmul.i16    q0,  q0,  d4[0]
    vdup.16     q1,  d2[0]
    vdup.16     q2,  d4[0]
    vdup.16     q3,  d6[0]
    vshl.i16    q2,  q2,  #3
    vadd.i16    q1,  q1,  q0
    vadd.i16    q3,  q3,  q2
    mov         r3,  #16
1:
    vqshrun.s16 d0,  q1,  #5
    vadd.i16    q1,  q1,  q2
    vqshrun.s16 d1,  q1,  #5
    vadd.i16    q1,  q1,  q3
    vst1.8      {q0}, [r0,:128], r1
    subs        r3,  r3,  #1
    bne         1b
    bx          lr
.endfunc
x264-snapshot-20120103-2245-stable/common/arm/pixel.h
/*****************************************************************************
 * pixel.h: arm pixel metrics
 *****************************************************************************
 * Copyright (C) 2009-2011 x264 project
 *
 * Authors: David Conrad <lessen42@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#ifndef X264_ARM_PIXEL_H
#define X264_ARM_PIXEL_H

#define DECL_PIXELS( ret, name, suffix, args ) \
    ret x264_pixel_##name##_16x16_##suffix args;\
    ret x264_pixel_##name##_16x8_##suffix args;\
    ret x264_pixel_##name##_8x16_##suffix args;\
    ret x264_pixel_##name##_8x8_##suffix args;\
    ret x264_pixel_##name##_8x4_##suffix args;\
    ret x264_pixel_##name##_4x8_##suffix args;\
    ret x264_pixel_##name##_4x4_##suffix args;\

#define DECL_X1( name, suffix ) \
    DECL_PIXELS( int, name, suffix, ( uint8_t *, int, uint8_t *, int ) )

#define DECL_X4( name, suffix ) \
    DECL_PIXELS( void, name##_x3, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ) )\
    DECL_PIXELS( void, name##_x4, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ) )
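
/* e.g. DECL_X1( sad, neon ) declares
 * int x264_pixel_sad_16x16_neon( uint8_t *, int, uint8_t *, int );
 * plus the six smaller partition sizes. */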

int x264_pixel_sad_4x4_armv6( uint8_t *, int, uint8_t *, int );
int x264_pixel_sad_4x8_armv6( uint8_t *, int, uint8_t *, int );

DECL_X1( sad, neon )
DECL_X1( sad_aligned, neon )
DECL_X1( sad_aligned, neon_dual )
DECL_X4( sad, neon )
DECL_X1( satd, neon )
DECL_X1( ssd, neon )

int x264_pixel_sa8d_8x8_neon( uint8_t *, int, uint8_t *, int );
int x264_pixel_sa8d_16x16_neon( uint8_t *, int, uint8_t *, int );

uint64_t x264_pixel_var_8x8_neon( uint8_t *, int );
uint64_t x264_pixel_var_16x16_neon( uint8_t *, int );
int x264_pixel_var2_8x8_neon( uint8_t *, int, uint8_t *, int, int * );

uint64_t x264_pixel_hadamard_ac_8x8_neon( uint8_t *, int );
uint64_t x264_pixel_hadamard_ac_8x16_neon( uint8_t *, int );
uint64_t x264_pixel_hadamard_ac_16x8_neon( uint8_t *, int );
uint64_t x264_pixel_hadamard_ac_16x16_neon( uint8_t *, int );

void x264_pixel_ssim_4x4x2_core_neon( const uint8_t *, int,
                                      const uint8_t *, int,
                                      int sums[2][4]);
float x264_pixel_ssim_end4_neon( int sum0[5][4], int sum1[5][4], int width );

#endif
x264-snapshot-20120103-2245-stable/common/arm/pixel-a.S
/*****************************************************************************
 * pixel.S: arm pixel metrics
 *****************************************************************************
 * Copyright (C) 2009-2011 x264 project
 *
 * Authors: David Conrad <lessen42@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "asm.S"

.fpu neon
.section .rodata
.align 4

.rept 16
.byte 0xff
.endr
mask_ff:
.rept 16
.byte 0
.endr

mask_ac4:
.short 0, -1, -1, -1,  0, -1, -1, -1
mask_ac8:
.short 0, -1, -1, -1, -1, -1, -1, -1

.text

.macro SAD4_ARMV6 h
function x264_pixel_sad_4x\h\()_armv6
    push        {r4-r6,lr}
    ldr         r4, [r2], r3
    ldr         r5, [r0], r1
    ldr         r6, [r2], r3
    ldr         lr, [r0], r1
    usad8       ip, r4, r5
.rept (\h - 2)/2
    ldr         r4, [r2], r3
    ldr         r5, [r0], r1
    usada8      ip, r6, lr, ip
    ldr         r6, [r2], r3
    ldr         lr, [r0], r1
    usada8      ip, r4, r5, ip
.endr
    usada8      r0, r6, lr, ip
    pop         {r4-r6,pc}
.endfunc
.endm

SAD4_ARMV6 4
SAD4_ARMV6 8


.macro SAD_START_4 align:vararg
    vld1.32     {d1[]}, [r2 \align], r3
    vld1.32     {d0[]}, [r0,:32], r1
    vabdl.u8    q8,  d0,  d1
.endm

.macro SAD_4 align:vararg
    vld1.32     {d1[]}, [r2 \align], r3
    vld1.32     {d0[]}, [r0,:32], r1
    vabal.u8    q8,  d0,  d1
.endm

.macro SAD_START_8 align:vararg
    vld1.64     {d1}, [r2 \align], r3
    vld1.64     {d0}, [r0,:64], r1
    vabdl.u8    q8,  d0,  d1
.endm

.macro SAD_8 align:vararg
    vld1.64     {d1}, [r2 \align], r3
    vld1.64     {d0}, [r0,:64], r1
    vabal.u8    q8,  d0,  d1
.endm

.macro SAD_START_16 align:vararg
    vld1.64     {d2-d3}, [r2 \align], r3
    vld1.64     {d0-d1}, [r0,:128], r1
    vabdl.u8    q8,  d0,  d2
    vld1.64     {d6-d7}, [r2 \align], r3
    vabdl.u8    q9,  d1,  d3
    vld1.64     {d4-d5}, [r0,:128], r1
.endm

.macro SAD_16 align:vararg
    vabal.u8    q8,  d4,  d6
    vld1.64     {d2-d3}, [r2 \align], r3
    vabal.u8    q9,  d5,  d7
    vld1.64     {d0-d1}, [r0,:128], r1
    vabal.u8    q8,  d0,  d2
    vld1.64     {d6-d7}, [r2 \align], r3
    vabal.u8    q9,  d1,  d3
    vld1.64     {d4-d5}, [r0,:128], r1
.endm

.macro SAD_FUNC w, h, name, align:vararg
function x264_pixel_sad\name\()_\w\()x\h\()_neon
    SAD_START_\w \align

.if \w == 16
.rept \h / 2 - 1
    SAD_\w \align
.endr
.else
.rept \h - 1
    SAD_\w \align
.endr
.endif

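// reduce the widened u16 partial sums (q8/q9) to a single u32 SAD in r0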
.if \w > 8
    vabal.u8    q8,  d4,  d6
    vabal.u8    q9,  d5,  d7
    vadd.u16    q8,  q8,  q9
.endif
.if \w > 4
    vadd.u16    d16, d16, d17
.endif
    vpadd.u16   d0,  d16, d16
    vpaddl.u16  d0,  d0
    vmov.u32    r0,  d0[0]
    bx          lr
.endfunc
.endm

SAD_FUNC  4,  4
SAD_FUNC  4,  8
SAD_FUNC  8,  4
SAD_FUNC  8,  8
SAD_FUNC  8,  16
SAD_FUNC  16, 8
SAD_FUNC  16, 16

SAD_FUNC  4,  4,  _aligned, ,:32
SAD_FUNC  4,  8,  _aligned, ,:32
SAD_FUNC  8,  4,  _aligned, ,:64
SAD_FUNC  8,  8,  _aligned, ,:64
SAD_FUNC  8,  16, _aligned, ,:64
SAD_FUNC  16, 8,  _aligned, ,:128
SAD_FUNC  16, 16, _aligned, ,:128

// If dual issue is possible, use additional accumulators to avoid
// stalls from vabal's latency. This only matters for aligned.
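// Each vabal reads and writes its accumulator, so back-to-back vabals into
// one q register serialize; the _dual variants alternate between q8/q9
// (plus q10/q11 for the 16-wide case) so consecutive accumulations are
// independent.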
.macro SAD_DUAL_START_8
    SAD_START_8 ,:64
    vld1.64     {d3}, [r2,:64], r3
    vld1.64     {d2}, [r0,:64], r1
    vabdl.u8    q9,  d2,  d3
.endm

.macro SAD_DUAL_8 align:vararg
    vld1.64     {d1}, [r2,:64], r3
    vld1.64     {d0}, [r0,:64], r1
    vabal.u8    q8,  d0,  d1
    vld1.64     {d3}, [r2,:64], r3
    vld1.64     {d2}, [r0,:64], r1
    vabal.u8    q9,  d2,  d3
.endm

.macro SAD_DUAL_START_16
    SAD_START_16 ,:128
    vabdl.u8    q10, d4,  d6
    vld1.64     {d2-d3}, [r2,:128], r3
    vabdl.u8    q11, d5,  d7
    vld1.64     {d0-d1}, [r0,:128], r1
.endm

.macro SAD_DUAL_16
    vabal.u8    q8,  d0,  d2
    vld1.64     {d6-d7}, [r2,:128], r3
    vabal.u8    q9,  d1,  d3
    vld1.64     {d4-d5}, [r0,:128], r1
    vabal.u8    q10, d4,  d6
    vld1.64     {d2-d3}, [r2,:128], r3
    vabal.u8    q11, d5,  d7
    vld1.64     {d0-d1}, [r0,:128], r1
.endm

.macro SAD_DUAL_END_16
    vabal.u8    q8,  d0,  d2
    vld1.64     {d6-d7}, [r2,:128], r3
    vabal.u8    q9,  d1,  d3
    vld1.64     {d4-d5}, [r0,:128], r1
    vabal.u8    q10, d4,  d6
    vabal.u8    q11, d5,  d7
.endm

.macro SAD_FUNC_DUAL w, h
function x264_pixel_sad_aligned_\w\()x\h\()_neon_dual
    SAD_DUAL_START_\w
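// SAD_DUAL_START_* covers the first two rows and, for w == 16,
// SAD_DUAL_END_16 covers the last two; each SAD_DUAL_* iteration covers
// two rows, hence h/2 - w/8 repeats (w/8 is 1 for w == 8, 2 for w == 16).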
.rept \h / 2 - \w / 8
    SAD_DUAL_\w
.endr

.if \w > 8
    SAD_DUAL_END_16
    vadd.u16    q8,  q8,  q9
    vadd.u16    q9,  q10, q11
.endif
.if \w > 4
    vadd.u16    q8,  q8,  q9
    vadd.u16    d16, d16, d17
.endif
    vpadd.u16   d0,  d16, d16
    vpaddl.u16  d0,  d0
    vmov.u32    r0,  d0[0]
    bx          lr
.endfunc
.endm

SAD_FUNC_DUAL  8,  4
SAD_FUNC_DUAL  8,  8
SAD_FUNC_DUAL  8,  16
SAD_FUNC_DUAL  16, 8
SAD_FUNC_DUAL  16, 16


.macro SAD_X_START_4 x
    vld1.32     {d0[]}, [r0,:32], lr
    vld1.32     {d1[]}, [r1], r6
    vabdl.u8    q8,  d1,  d0
    vld1.32     {d2[]}, [r2], r6
    vabdl.u8    q9,  d2,  d0
    vld1.32     {d3[]}, [r3], r6
    vabdl.u8    q10, d3,  d0
.if \x == 4
    vld1.32     {d4[]}, [r12], r6
    vabdl.u8    q11, d4,  d0
.endif
.endm

.macro SAD_X_4 x
    vld1.32     {d0[]}, [r0,:32], lr
    vld1.32     {d1[]}, [r1], r6
    vabal.u8    q8,  d1,  d0
    vld1.32     {d2[]}, [r2], r6
    vabal.u8    q9,  d2,  d0
    vld1.32     {d3[]}, [r3], r6
    vabal.u8    q10, d3,  d0
.if \x == 4
    vld1.32     {d4[]}, [r12], r6
    vabal.u8    q11, d4,  d0
.endif
.endm

.macro SAD_X_START_8 x
    vld1.64     {d0}, [r0,:64], lr
    vld1.64     {d1}, [r1], r6
    vabdl.u8    q8,  d1,  d0
    vld1.64     {d2}, [r2], r6
    vabdl.u8    q9,  d2,  d0
    vld1.64     {d3}, [r3], r6
    vabdl.u8    q10, d3,  d0
.if \x == 4
    vld1.64     {d4}, [r12], r6
    vabdl.u8    q11, d4,  d0
.endif
.endm

.macro SAD_X_8 x
    vld1.64     {d0}, [r0,:64], lr
    vld1.64     {d1}, [r1], r6
    vabal.u8    q8,  d1,  d0
    vld1.64     {d2}, [r2], r6
    vabal.u8    q9,  d2,  d0
    vld1.64     {d3}, [r3], r6
    vabal.u8    q10, d3,  d0
.if \x == 4
    vld1.64     {d4}, [r12], r6
    vabal.u8    q11, d4,  d0
.endif
.endm

.macro SAD_X_START_16 x
    vld1.64     {d0-d1}, [r0,:128], lr
    vld1.64     {d2-d3}, [r1], r6
    vabdl.u8    q8,  d2,  d0
    vabdl.u8    q12, d3,  d1
    vld1.64     {d4-d5}, [r2], r6
    vabdl.u8    q9,  d4,  d0
    vabdl.u8    q13, d5,  d1
    vld1.64     {d6-d7}, [r3], r6
    vabdl.u8    q10, d6,  d0
    vabdl.u8    q14, d7,  d1
.if \x == 4
    vld1.64     {d2-d3}, [r12], r6
    vabdl.u8    q11, d2,  d0
    vabdl.u8    q15, d3,  d1
.endif
.endm

.macro SAD_X_16 x
    vld1.64     {d0-d1}, [r0,:128], lr
    vld1.64     {d2-d3}, [r1], r6
    vabal.u8    q8,  d2,  d0
    vabal.u8    q12, d3,  d1
    vld1.64     {d4-d5}, [r2], r6
    vabal.u8    q9,  d4,  d0
    vabal.u8    q13, d5,  d1
    vld1.64     {d6-d7}, [r3], r6
    vabal.u8    q10, d6,  d0
    vabal.u8    q14, d7,  d1
.if \x == 4
    vld1.64     {d2-d3}, [r12], r6
    vabal.u8    q11, d2,  d0
    vabal.u8    q15, d3,  d1
.endif
.endm

.macro SAD_X_FUNC x, w, h
function x264_pixel_sad_x\x\()_\w\()x\h\()_neon
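// r0 = fenc, r1-r3 = the candidate refs; the remaining arguments (pix3 for
// the x4 variants, then the ref stride and the scores pointer) are fetched
// from the stack into r12, r6 and r7. fenc rows use the fixed FENC_STRIDE.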
    push        {r6-r7,lr}
.if \x == 3
    ldrd        r6,  [sp, #12]
.else
    ldrd        r6,  [sp, #16]
    ldr         r12, [sp, #12]
.endif
    mov         lr,  #FENC_STRIDE

    SAD_X_START_\w \x
.rept \h - 1
    SAD_X_\w \x
.endr

// add up the sads
.if \w > 8
    vadd.u16    q8,  q8,  q12
    vadd.u16    q9,  q9,  q13
    vadd.u16    q10, q10, q14
.if \x == 4
    vadd.u16    q11, q11, q15
.endif
.endif
.if \w > 4
    vadd.u16    d16, d16, d17
    vadd.u16    d18, d18, d19
    vadd.u16    d20, d20, d21
.if \x == 4
    vadd.u16    d22, d22, d23
.endif
.endif
    vpadd.u16   d0,  d16, d18
    vpadd.u16   d1,  d20, d22
    vpaddl.u16  q0,  q0

.if \x == 3
    vst1.32     {d0},    [r7]!
    vst1.32     {d1[0]}, [r7,:32]
.else
    vst1.32     {d0-d1}, [r7]
.endif
    pop         {r6-r7,pc}
.endfunc
.endm

SAD_X_FUNC  3, 4,  4
SAD_X_FUNC  3, 4,  8
SAD_X_FUNC  3, 8,  4
SAD_X_FUNC  3, 8,  8
SAD_X_FUNC  3, 8,  16
SAD_X_FUNC  3, 16, 8
SAD_X_FUNC  3, 16, 16

SAD_X_FUNC  4, 4,  4
SAD_X_FUNC  4, 4,  8
SAD_X_FUNC  4, 8,  4
SAD_X_FUNC  4, 8,  8
SAD_X_FUNC  4, 8,  16
SAD_X_FUNC  4, 16, 8
SAD_X_FUNC  4, 16, 16


.macro SSD_START_4
    vld1.32     {d16[]}, [r0,:32], r1
    vld1.32     {d17[]}, [r2,:32], r3
    vsubl.u8    q2, d16, d17
    vld1.32     {d16[]}, [r0,:32], r1
    vmull.s16   q0, d4, d4
    vld1.32     {d17[]}, [r2,:32], r3
.endm

.macro SSD_4
    vsubl.u8    q2, d16, d17
    vld1.32     {d16[]}, [r0,:32], r1
    vmlal.s16   q0, d4, d4
    vld1.32     {d17[]}, [r2,:32], r3
.endm

.macro SSD_END_4
    vsubl.u8    q2, d16, d17
    vmlal.s16   q0, d4, d4
.endm

.macro SSD_START_8
    vld1.64     {d16}, [r0,:64], r1
    vld1.64     {d17}, [r2,:64], r3
    vsubl.u8    q2, d16, d17
    vld1.64     {d16}, [r0,:64], r1
    vmull.s16   q0, d4, d4
    vmlal.s16   q0, d5, d5
    vld1.64     {d17}, [r2,:64], r3
.endm

.macro SSD_8
    vsubl.u8    q2, d16, d17
    vld1.64     {d16}, [r0,:64], r1
    vmlal.s16   q0, d4, d4
    vmlal.s16   q0, d5, d5
    vld1.64     {d17}, [r2,:64], r3
.endm

.macro SSD_END_8
    vsubl.u8    q2, d16, d17
    vmlal.s16   q0, d4, d4
    vmlal.s16   q0, d5, d5
.endm

.macro SSD_START_16
    vld1.64     {d16-d17}, [r0,:128], r1
    vld1.64     {d18-d19}, [r2,:128], r3
    vsubl.u8    q2, d16, d18
    vsubl.u8    q3, d17, d19
    vld1.64     {d16-d17}, [r0,:128], r1
    vmull.s16   q0, d4, d4
    vmlal.s16   q0, d5, d5
    vld1.64     {d18-d19}, [r2,:128], r3
    vmlal.s16   q0, d6, d6
    vmlal.s16   q0, d7, d7
.endm

.macro SSD_16
    vsubl.u8    q2, d16, d18
    vsubl.u8    q3, d17, d19
    vld1.64     {d16-d17}, [r0,:128], r1
    vmlal.s16   q0, d4, d4
    vmlal.s16   q0, d5, d5
    vld1.64     {d18-d19}, [r2,:128], r3
    vmlal.s16   q0, d6, d6
    vmlal.s16   q0, d7, d7
.endm

.macro SSD_END_16
    vsubl.u8    q2, d16, d18
    vsubl.u8    q3, d17, d19
    vmlal.s16   q0, d4, d4
    vmlal.s16   q0, d5, d5
    vmlal.s16   q0, d6, d6
    vmlal.s16   q0, d7, d7
.endm

.macro SSD_FUNC w h
function x264_pixel_ssd_\w\()x\h\()_neon
    SSD_START_\w
.rept \h-2
    SSD_\w
.endr
    SSD_END_\w
    vadd.s32    d0, d0, d1
    vpadd.s32   d0, d0, d0
    vmov.32     r0, d0[0]
    bx          lr
.endfunc
.endm

SSD_FUNC   4, 4
SSD_FUNC   4, 8
SSD_FUNC   8, 4
SSD_FUNC   8, 8
SSD_FUNC   8, 16
SSD_FUNC  16, 8
SSD_FUNC  16, 16


.macro VAR_SQR_SUM qsqr_sum qsqr_last qsqr dsrc vpadal=vpadal.u16
    vmull.u8        \qsqr, \dsrc, \dsrc
    vaddw.u8        q0, q0, \dsrc
    \vpadal         \qsqr_sum, \qsqr_last
.endm

function x264_pixel_var_8x8_neon
    vld1.64         {d16}, [r0,:64], r1
    vmull.u8        q1,  d16, d16
    vmovl.u8        q0,  d16
    vld1.64         {d18}, [r0,:64], r1
    vmull.u8        q2,  d18, d18
    vaddw.u8        q0,  q0,  d18

    vld1.64         {d20}, [r0,:64], r1
    VAR_SQR_SUM     q1,  q1,   q3,  d20, vpaddl.u16
    vld1.64         {d22}, [r0,:64], r1
    VAR_SQR_SUM     q2,  q2,   q8,  d22, vpaddl.u16

    vld1.64         {d24}, [r0,:64], r1
    VAR_SQR_SUM     q1,  q3,   q9,  d24
    vld1.64         {d26}, [r0,:64], r1
    VAR_SQR_SUM     q2,  q8,   q10, d26
    vld1.64         {d24}, [r0,:64], r1
    VAR_SQR_SUM     q1,  q9,   q14, d24
    vld1.64         {d26}, [r0,:64], r1
    VAR_SQR_SUM     q2,  q10,  q15, d26
    b               x264_var_end
.endfunc

function x264_pixel_var_16x16_neon
    vld1.64         {d16-d17}, [r0,:128], r1
    vmull.u8        q12, d16, d16
    vmovl.u8        q0,  d16
    vmull.u8        q13, d17, d17
    vaddw.u8        q0,  q0,  d17

    vld1.64         {d18-d19}, [r0,:128], r1
    VAR_SQR_SUM     q1,  q12,  q14, d18, vpaddl.u16
    VAR_SQR_SUM     q2,  q13,  q15, d19, vpaddl.u16

    mov             ip,  #7
var16_loop:
    subs            ip,  ip,  #1
    vld1.64         {d16-d17}, [r0,:128], r1
    VAR_SQR_SUM     q1,  q14,  q12, d16
    VAR_SQR_SUM     q2,  q15,  q13, d17

    vld1.64         {d18-d19}, [r0,:128], r1
    VAR_SQR_SUM     q1,  q12,  q14, d18
    VAR_SQR_SUM     q2,  q13,  q15, d19
    bgt             var16_loop
.endfunc

function x264_var_end
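// d0 ends up as { sum, sum of squares }, returned packed in r0/r1 as the
// uint64_t declared in pixel.h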
    vpaddl.u16      q8,  q14
    vpaddl.u16      q9,  q15
    vadd.u32        q1,  q1,  q8
    vadd.u16        d0,  d0,  d1
    vadd.u32        q1,  q1,  q9
    vadd.u32        q1,  q1,  q2
    vpaddl.u16      d0,  d0
    vadd.u32        d2,  d2,  d3
    vpadd.u32       d0,  d0,  d2

    vmov            r0,  r1,  d0
    bx              lr
.endfunc

.macro DIFF_SUM diff da db lastdiff
    vld1.64         {\da}, [r0,:64], r1
    vld1.64         {\db}, [r2,:64], r3
.ifnb \lastdiff
    vadd.s16        q0,  q0,  \lastdiff
.endif
    vsubl.u8        \diff, \da, \db
.endm

.macro SQR_ACC acc d0 d1 vmlal=vmlal.s16
    \vmlal          \acc, \d0, \d0
    vmlal.s16       \acc, \d1, \d1
.endm

function x264_pixel_var2_8x8_neon
    DIFF_SUM        q0,  d0,  d1
    DIFF_SUM        q8,  d16, d17
    SQR_ACC         q1,  d0,  d1,  vmull.s16
    DIFF_SUM        q9,  d18, d19, q8
    SQR_ACC         q2,  d16, d17, vmull.s16
.rept 2
    DIFF_SUM        q8,  d16, d17, q9
    SQR_ACC         q1,  d18, d19
    DIFF_SUM        q9,  d18, d19, q8
    SQR_ACC         q2,  d16, d17
.endr
    DIFF_SUM        q8,  d16, d17, q9
    SQR_ACC         q1,  d18, d19
    vadd.s16        q0,  q0,  q8
    SQR_ACC         q2,  d16, d17

    ldr             ip,  [sp]
    vadd.s16        d0,  d0,  d1
    vadd.s32        q1,  q1,  q2
    vpaddl.s16      d0,  d0
    vadd.s32        d1,  d2,  d3
    vpadd.s32       d0,  d0,  d1

    vmov.32         r0,  r1,  d0
    vst1.32         {d0[1]}, [ip,:32]
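    // d0 = { sum of diffs, ssd }; the ssd was just stored through the
    // int * argument, and the return value is ssd - sum*sum/64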
    mul             r0,  r0,  r0
    sub             r0,  r1,  r0,  lsr #6
    bx              lr
.endfunc


.macro LOAD_DIFF_8x4 q0 q1 q2 q3
    vld1.32     {d1}, [r2], r3
    vld1.32     {d0}, [r0,:64], r1
    vsubl.u8    \q0, d0,  d1
    vld1.32     {d3}, [r2], r3
    vld1.32     {d2}, [r0,:64], r1
    vsubl.u8    \q1, d2,  d3
    vld1.32     {d5}, [r2], r3
    vld1.32     {d4}, [r0,:64], r1
    vsubl.u8    \q2, d4,  d5
    vld1.32     {d7}, [r2], r3
    vld1.32     {d6}, [r0,:64], r1
    vsubl.u8    \q3, d6,  d7
.endm

function x264_pixel_satd_4x4_neon
    vld1.32     {d1[]},  [r2], r3
    vld1.32     {d0[]},  [r0,:32], r1
    vld1.32     {d3[]},  [r2], r3
    vld1.32     {d2[]},  [r0,:32], r1
    vld1.32     {d1[1]}, [r2], r3
    vld1.32     {d0[1]}, [r0,:32], r1
    vld1.32     {d3[1]}, [r2], r3
    vld1.32     {d2[1]}, [r0,:32], r1
    vsubl.u8    q0,  d0,  d1
    vsubl.u8    q1,  d2,  d3

    SUMSUB_AB   q2, q3, q0, q1
    SUMSUB_ABCD d0, d2, d1, d3, d4, d5, d6, d7
    HADAMARD    1, sumsub, q2, q3, q0, q1
    HADAMARD    2, amax,   q0,,    q2, q3

    HORIZ_ADD   d0,  d0,  d1
    vmov.32     r0,  d0[0]
    bx          lr
.endfunc

function x264_pixel_satd_4x8_neon
    vld1.32     {d1[]},  [r2], r3
    vld1.32     {d0[]},  [r0,:32], r1
    vld1.32     {d3[]},  [r2], r3
    vld1.32     {d2[]},  [r0,:32], r1
    vld1.32     {d5[]},  [r2], r3
    vld1.32     {d4[]},  [r0,:32], r1
    vld1.32     {d7[]},  [r2], r3
    vld1.32     {d6[]},  [r0,:32], r1

    vld1.32     {d1[1]}, [r2], r3
    vld1.32     {d0[1]}, [r0,:32], r1
    vsubl.u8    q0,  d0,  d1
    vld1.32     {d3[1]}, [r2], r3
    vld1.32     {d2[1]}, [r0,:32], r1
    vsubl.u8    q1,  d2,  d3
    vld1.32     {d5[1]}, [r2], r3
    vld1.32     {d4[1]}, [r0,:32], r1
    vsubl.u8    q2,  d4,  d5
    vld1.32     {d7[1]}, [r2], r3
    SUMSUB_AB   q8,  q9,  q0,  q1
    vld1.32     {d6[1]}, [r0,:32], r1
    vsubl.u8    q3,  d6,  d7
    SUMSUB_AB   q10, q11, q2,  q3
    b           x264_satd_4x8_8x4_end_neon
.endfunc

function x264_pixel_satd_8x4_neon
    vld1.64     {d1}, [r2], r3
    vld1.64     {d0}, [r0,:64], r1
    vsubl.u8    q0,  d0,  d1
    vld1.64     {d3}, [r2], r3
    vld1.64     {d2}, [r0,:64], r1
    vsubl.u8    q1,  d2,  d3
    vld1.64     {d5}, [r2], r3
    vld1.64     {d4}, [r0,:64], r1
    vsubl.u8    q2,  d4,  d5
    vld1.64     {d7}, [r2], r3
    SUMSUB_AB   q8,  q9,  q0,  q1
    vld1.64     {d6}, [r0,:64], r1
    vsubl.u8    q3,  d6,  d7
    SUMSUB_AB   q10, q11, q2,  q3
.endfunc

function x264_satd_4x8_8x4_end_neon
    vadd.s16    q0,  q8,  q10
    vadd.s16    q1,  q9,  q11
    vsub.s16    q2,  q8,  q10
    vsub.s16    q3,  q9,  q11

    vtrn.16     q0,  q1
    vadd.s16    q8,  q0,  q1
    vtrn.16     q2,  q3
    vsub.s16    q9,  q0,  q1
    vadd.s16    q10, q2,  q3
    vsub.s16    q11, q2,  q3
    vtrn.32     q8,  q10
    vabs.s16    q8,  q8
    vtrn.32     q9,  q11
    vabs.s16    q10, q10
    vabs.s16    q9,  q9
    vabs.s16    q11, q11
    vmax.u16    q0,  q8,  q10
    vmax.u16    q1,  q9,  q11

    vadd.u16    q0,  q0,  q1
    HORIZ_ADD   d0,  d0,  d1
    vmov.32     r0,  d0[0]
    bx          lr
.endfunc

function x264_pixel_satd_8x8_neon
    mov         ip,  lr

    bl x264_satd_8x8_neon
    vadd.u16    q0,  q12, q13
    vadd.u16    q1,  q14, q15

    vadd.u16    q0,  q0,  q1
    HORIZ_ADD   d0,  d0,  d1
    mov         lr,  ip
    vmov.32     r0,  d0[0]
    bx          lr
.endfunc

function x264_pixel_satd_8x16_neon
    vpush       {d8-d11}
    mov         ip,  lr

    bl x264_satd_8x8_neon
    vadd.u16    q4,  q12, q13
    vadd.u16    q5,  q14, q15

    bl x264_satd_8x8_neon
    vadd.u16    q4,  q4,  q12
    vadd.u16    q5,  q5,  q13
    vadd.u16    q4,  q4,  q14
    vadd.u16    q5,  q5,  q15

    vadd.u16    q0,  q4,  q5
    HORIZ_ADD   d0,  d0,  d1
    vpop        {d8-d11}
    mov         lr,  ip
    vmov.32     r0,  d0[0]
    bx          lr
.endfunc

function x264_satd_8x8_neon
    LOAD_DIFF_8x4 q8,  q9,  q10, q11
    vld1.64     {d7}, [r2], r3
    SUMSUB_AB   q0,  q1,  q8,  q9
    vld1.64     {d6}, [r0,:64], r1
    vsubl.u8    q12, d6,  d7
    vld1.64     {d17}, [r2], r3
    SUMSUB_AB   q2,  q3,  q10, q11
    vld1.64     {d16}, [r0,:64], r1
    vsubl.u8    q13, d16, d17
    vld1.64     {d19}, [r2], r3
    SUMSUB_AB   q8,  q10, q0,  q2
    vld1.64     {d18}, [r0,:64], r1
    vsubl.u8    q14, d18, d19
    vld1.64     {d1}, [r2], r3
    SUMSUB_AB   q9,  q11, q1,  q3
    vld1.64     {d0}, [r0,:64], r1
    vsubl.u8    q15, d0,  d1
.endfunc

// one vertical hadamard pass and two horizontal
function x264_satd_8x4v_8x8h_neon
    SUMSUB_ABCD q0, q1, q2, q3, q12, q13, q14, q15
    vtrn.16     q8,  q9
    SUMSUB_AB   q12, q14, q0,  q2
    vtrn.16     q10, q11
    SUMSUB_AB   q13, q15, q1,  q3
    SUMSUB_AB   q0,  q1,  q8,  q9
    vtrn.16     q12, q13
    SUMSUB_AB   q2,  q3,  q10, q11
    vtrn.16     q14, q15
    SUMSUB_AB   q8,  q9,  q12, q13
    vtrn.32     q0,  q2
    SUMSUB_AB   q10, q11, q14, q15

    vtrn.32     q1,  q3
    ABS2        q0,  q2
    vtrn.32     q8,  q10
    ABS2        q1,  q3
    vtrn.32     q9,  q11
    ABS2        q8,  q10
    ABS2        q9,  q11
    vmax.s16    q12, q0,  q2
    vmax.s16    q13, q1,  q3
    vmax.s16    q14, q8,  q10
    vmax.s16    q15, q9,  q11
    bx          lr
.endfunc

function x264_pixel_satd_16x8_neon
    vpush       {d8-d11}
    mov         ip, lr

    bl          x264_satd_16x4_neon
    vadd.u16    q4,  q12, q13
    vadd.u16    q5,  q14, q15

    bl          x264_satd_16x4_neon
    vadd.u16    q4,  q4,  q12
    vadd.u16    q5,  q5,  q13
    vadd.u16    q4,  q4,  q14
    vadd.u16    q5,  q5,  q15

    vadd.u16    q0,  q4,  q5
    HORIZ_ADD   d0,  d0,  d1
    vpop        {d8-d11}
    mov         lr,  ip
    vmov.32     r0,  d0[0]
    bx          lr
.endfunc

function x264_pixel_satd_16x16_neon
    vpush       {d8-d11}
    mov         ip, lr

    bl          x264_satd_16x4_neon
    vadd.u16    q4,  q12, q13
    vadd.u16    q5,  q14, q15

    bl          x264_satd_16x4_neon
    vadd.u16    q4,  q4,  q12
    vadd.u16    q5,  q5,  q13
    vadd.u16    q4,  q4,  q14
    vadd.u16    q5,  q5,  q15

    bl          x264_satd_16x4_neon
    vadd.u16    q4,  q4,  q12
    vadd.u16    q5,  q5,  q13
    vadd.u16    q4,  q4,  q14
    vadd.u16    q5,  q5,  q15

    bl          x264_satd_16x4_neon
    vadd.u16    q4,  q4,  q12
    vadd.u16    q5,  q5,  q13
    vadd.u16    q4,  q4,  q14
    vadd.u16    q5,  q5,  q15

    vadd.u16    q0,  q4,  q5
    HORIZ_ADD   d0,  d0,  d1
    vpop        {d8-d11}
    mov         lr,  ip
    vmov.32     r0,  d0[0]
    bx          lr
.endfunc

function x264_satd_16x4_neon
    vld1.64     {d2-d3}, [r2], r3
    vld1.64     {d0-d1}, [r0,:128], r1
    vsubl.u8    q8,  d0,  d2
    vld1.64     {d6-d7}, [r2], r3
    vsubl.u8    q12, d1,  d3
    vld1.64     {d4-d5}, [r0,:128], r1
    vsubl.u8    q9,  d4,  d6
    vld1.64     {d2-d3}, [r2], r3
    vsubl.u8    q13, d5,  d7
    vld1.64     {d0-d1}, [r0,:128], r1
    vsubl.u8    q10, d0,  d2
    vld1.64     {d6-d7}, [r2], r3
    vsubl.u8    q14, d1,  d3
    vadd.s16    q0,  q8,  q9
    vld1.64     {d4-d5}, [r0,:128], r1
    vsub.s16    q1,  q8,  q9
    vsubl.u8    q11, d4,  d6
    vsubl.u8    q15, d5,  d7
    SUMSUB_AB   q2,  q3,  q10, q11
    SUMSUB_ABCD q8,  q10, q9,  q11, q0,  q2,  q1,  q3
    b           x264_satd_8x4v_8x8h_neon
.endfunc


function x264_pixel_sa8d_8x8_neon
    mov             ip,  lr
    bl              x264_sa8d_8x8_neon
    vadd.u16        q0,  q8,  q9
    HORIZ_ADD       d0,  d0,  d1
    mov             lr,  ip
    vmov.32         r0,  d0[0]
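    // (sum+1)>>1: round and halve the raw transform sum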
    add             r0,  r0,  #1
    lsr             r0,  r0,  #1
    bx              lr
.endfunc

function x264_pixel_sa8d_16x16_neon
    vpush           {d8-d11}
    mov             ip,  lr
    bl              x264_sa8d_8x8_neon
    vpaddl.u16      q4,  q8
    vpaddl.u16      q5,  q9
    bl              x264_sa8d_8x8_neon
    vpadal.u16      q4,  q8
    vpadal.u16      q5,  q9
    sub             r0,  r0,  r1,  lsl #4
    sub             r2,  r2,  r3,  lsl #4
    add             r0,  r0,  #8
    add             r2,  r2,  #8
    bl              x264_sa8d_8x8_neon
    vpadal.u16      q4,  q8
    vpadal.u16      q5,  q9
    bl              x264_sa8d_8x8_neon
    vpaddl.u16      q8,  q8
    vpaddl.u16      q9,  q9
    vadd.u32        q0,  q4,  q8
    vadd.u32        q1,  q5,  q9
    vadd.u32        q0,  q0,  q1
    vadd.u32        d0,  d0,  d1
    vpadd.u32       d0,  d0,  d0
    vpop            {d8-d11}
    mov             lr,  ip
    vmov.32         r0,  d0[0]
    add             r0,  r0,  #1
    lsr             r0,  r0,  #1
    bx              lr
.endfunc

.macro HADAMARD4_V r1, r2, r3, r4, t1, t2, t3, t4
    SUMSUB_ABCD \t1, \t2, \t3, \t4, \r1, \r2, \r3, \r4
    SUMSUB_ABCD \r1, \r3, \r2, \r4, \t1, \t3, \t2, \t4
.endm

function x264_sa8d_8x8_neon
    LOAD_DIFF_8x4   q8,  q9,  q10, q11
    vld1.64         {d7}, [r2], r3
    SUMSUB_AB       q0,  q1,  q8,  q9
    vld1.64         {d6}, [r0,:64], r1
    vsubl.u8        q12, d6,  d7
    vld1.64         {d17}, [r2], r3
    SUMSUB_AB       q2,  q3,  q10, q11
    vld1.64         {d16}, [r0,:64], r1
    vsubl.u8        q13, d16, d17
    vld1.64         {d19}, [r2], r3
    SUMSUB_AB       q8,  q10, q0,  q2
    vld1.64         {d18}, [r0,:64], r1
    vsubl.u8        q14, d18, d19
    vld1.64         {d1}, [r2], r3
    SUMSUB_AB       q9,  q11, q1,  q3
    vld1.64         {d0}, [r0,:64], r1
    vsubl.u8        q15, d0,  d1

    HADAMARD4_V     q12, q13, q14, q15,  q0,  q1,  q2,  q3
    SUMSUB_ABCD     q0,  q8,  q1,  q9,   q8,  q12, q9,  q13
    SUMSUB_AB       q2,  q10, q10, q14
    vtrn.16         q8,  q9
    SUMSUB_AB       q3,  q11, q11, q15
    vtrn.16         q0,  q1
    SUMSUB_AB       q12, q13, q8,  q9
    vtrn.16         q10, q11
    SUMSUB_AB       q8,  q9,  q0,  q1
    vtrn.16         q2,  q3
    SUMSUB_AB       q14, q15, q10, q11
    vadd.i16        q10, q2,  q3
    vtrn.32         q12, q14
    vsub.i16        q11, q2,  q3
    vtrn.32         q13, q15
    SUMSUB_AB       q0,  q2,  q12, q14
    vtrn.32         q8,  q10
    SUMSUB_AB       q1,  q3,  q13, q15
    vtrn.32         q9,  q11
    SUMSUB_AB       q12, q14, q8,  q10
    SUMSUB_AB       q13, q15, q9,  q11

    vswp            d1,  d24
    ABS2            q0,  q12
    vswp            d3,  d26
    ABS2            q1,  q13
    vswp            d5,  d28
    ABS2            q2,  q14
    vswp            d7,  d30
    ABS2            q3,  q15
    vmax.s16        q8,  q0,  q12
    vmax.s16        q9,  q1,  q13
    vmax.s16        q10, q2,  q14
    vmax.s16        q11, q3,  q15
    vadd.i16        q8,  q8,  q9
    vadd.i16        q9,  q10, q11
    bx              lr
.endfunc


.macro HADAMARD_AC w h
function x264_pixel_hadamard_ac_\w\()x\h\()_neon
    vpush           {d8-d15}
    movrel          ip, mask_ac4
    vmov.i8         q4, #0
    // note: this assumes mask_ac8 is after mask_ac4 (so don't move it)
    vld1.64         {d12-d15}, [ip,:128]
    vmov.i8         q5, #0

    mov             ip,  lr
    bl              x264_hadamard_ac_8x8_neon
.if \h > 8
    bl              x264_hadamard_ac_8x8_neon
.endif
.if \w > 8
    sub             r0,  r0,  r1,  lsl #3
    add             r0,  r0,  #8
    bl              x264_hadamard_ac_8x8_neon
.endif
.if \w * \h == 256
    sub             r0,  r0,  r1,  lsl #4
    bl              x264_hadamard_ac_8x8_neon
.endif

    vadd.s32        d8,  d8,  d9
    vadd.s32        d10, d10, d11
    vpadd.s32       d0,  d8,  d10
    vpop            {d8-d15}
    mov             lr,  ip
    vmov            r0,  r1,  d0
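    // r0 = satd accumulator (q4), r1 = sa8d accumulator (q5); halve and
    // quarter respectively to normalize, returned packed as a uint64_t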
    lsr             r0,  r0,  #1
    lsr             r1,  r1,  #2
    bx              lr
.endfunc
.endm

HADAMARD_AC  8, 8
HADAMARD_AC  8, 16
HADAMARD_AC 16, 8
HADAMARD_AC 16, 16

// q4: satd  q5: sa8d  q6: mask_ac4  q7: mask_ac8
function x264_hadamard_ac_8x8_neon
    vld1.64         {d2},  [r0,:64], r1
    vld1.64         {d3},  [r0,:64], r1
    vaddl.u8        q0,  d2,  d3
    vld1.64         {d6},  [r0,:64], r1
    vsubl.u8        q1,  d2,  d3
    vld1.64         {d7},  [r0,:64], r1
    vaddl.u8        q2,  d6,  d7
    vld1.64         {d18}, [r0,:64], r1
    vsubl.u8        q3,  d6,  d7
    vld1.64         {d19}, [r0,:64], r1
    vaddl.u8        q8,  d18, d19
    vld1.64         {d22}, [r0,:64], r1
    vsubl.u8        q9,  d18, d19
    vld1.64         {d23}, [r0,:64], r1

    SUMSUB_ABCD     q12, q14, q13, q15, q0,  q2,  q1,  q3
    vaddl.u8        q10, d22, d23
    vsubl.u8        q11, d22, d23
    vtrn.16         q12, q13
    SUMSUB_ABCD     q0,  q2,  q1,  q3,  q8,  q10, q9,  q11

    vtrn.16         q14, q15
    SUMSUB_AB       q8,  q9,  q12, q13
    vtrn.16         q0,  q1
    SUMSUB_AB       q10, q11, q14, q15
    vtrn.16         q2,  q3
    SUMSUB_AB       q12, q13, q0,  q1
    vtrn.32         q8,  q10
    SUMSUB_AB       q14, q15, q2,  q3
    vtrn.32         q9,  q11
    SUMSUB_AB       q0,  q2,  q8,  q10
    vtrn.32         q12, q14
    SUMSUB_AB       q1,  q3,  q9,  q11
    vtrn.32         q13, q15
    SUMSUB_ABCD     q8,  q10, q9,  q11, q12, q14, q13, q15

    vabs.s16        q12, q0
    vabs.s16        q13, q8
    vabs.s16        q15, q1
    vadd.s16        q12, q12, q13
    vabs.s16        q14, q2
    vand.s16        q12, q12, q6
    vabs.s16        q13, q3
    vadd.s16        q12, q12, q15
    vabs.s16        q15, q9
    vadd.s16        q12, q12, q14
    vabs.s16        q14, q10
    vadd.s16        q12, q12, q13
    vabs.s16        q13, q11
    vadd.s16        q12, q12, q15
    vsub.s16        q15, q11, q3
    vadd.s16        q12, q12, q14
    vadd.s16        q14, q11, q3
    vadd.s16        q12, q12, q13
    vsub.s16        q13, q10, q2
    vadd.s16        q2,  q10, q2
    vpadal.u16      q4,  q12

    SUMSUB_AB       q10, q11, q9,  q1
    SUMSUB_AB       q9,  q8,  q0,  q8
    vswp            d29, d30
    vabs.s16        q14, q14
    vabs.s16        q15, q15
    vswp            d5,  d26
    vabs.s16        q2,  q2
    vabs.s16        q13, q13
    vswp            d21, d22
    vabs.s16        q10, q10
    vabs.s16        q11, q11
    vmax.s16        q3,  q14, q15
    vmax.s16        q2,  q2,  q13
    vmax.s16        q1,  q10, q11
    vswp            d19, d16
    SUMSUB_AB       q14, q15, q9,  q8

    vadd.s16        q2,  q2,  q3
    vadd.s16        q2,  q2,  q1
    vand            q14, q14, q7
    vadd.s16        q2,  q2,  q2
    vabs.s16        q15, q15
    vabs.s16        q14, q14
    vadd.s16        q2,  q2,  q15
    vadd.s16        q2,  q2,  q14
    vpadal.u16      q5,  q2
    bx              lr
.endfunc


.macro SSIM_ITER n ssa s12 ssb lastssa lasts12 lastssb da db dnext
    vld1.64     {\db}, [r2], r3
    vmull.u8    \ssa,  \da, \da
    vmull.u8    \s12,  \da, \db
.if \n == 1
    vpaddl.u16  q2,  \lastssa
    vpaddl.u16  q3,  \lasts12
    vaddl.u8    q0,  d0,  \da
.else
    vpadal.u16  q2,  \lastssa
    vpadal.u16  q3,  \lasts12
    vaddw.u8    q0,  q0,  \da
.endif
    vpadal.u16  q2,  \lastssb
.if \n < 3
    vld1.64     {\dnext}, [r0], r1
.endif
.if \n == 1
    vaddl.u8    q1,  d2,  \db
.else
    vaddw.u8    q1,  q1,  \db
.endif
    vmull.u8    \ssb, \db, \db
.endm

function x264_pixel_ssim_4x4x2_core_neon
    ldr         ip, [sp]
    vld1.64     {d0}, [r0], r1
    vld1.64     {d2}, [r2], r3
    vmull.u8    q2,  d0,  d0
    vmull.u8    q3,  d0,  d2
    vld1.64     {d28}, [r0], r1
    vmull.u8    q15, d2,  d2

    SSIM_ITER 1, q8, q9, q14,  q2, q3, q15,  d28, d29, d26
    SSIM_ITER 2, q10,q11,q13,  q8, q9, q14,  d26, d27, d28
    SSIM_ITER 3, q8, q9, q15,  q10,q11,q13,  d28, d29

    vpadal.u16  q2,  q8
    vpaddl.u16  q0,  q0
    vpaddl.u16  q1,  q1
    vpadal.u16  q2,  q15
    vpadal.u16  q3,  q9

    vpadd.u32   d0,  d0,  d1
    vpadd.u32   d1,  d2,  d3
    vpadd.u32   d2,  d4,  d5
    vpadd.u32   d3,  d6,  d7

    vst4.32     {d0-d3}, [ip]
    bx          lr
.endfunc

// FIXME: see about doing 16x16 -> 32 bit multiplies for s1/s2
function x264_pixel_ssim_end4_neon
    vld1.32     {d16-d19}, [r0,:128]!
    vld1.32     {d20-d23}, [r1,:128]!
    vadd.s32    q0,  q8,  q10
    vadd.s32    q1,  q9,  q11
    vld1.32     {d24-d27}, [r0,:128]!
    vadd.s32    q0,  q0,  q1
    vld1.32     {d28-d31}, [r1,:128]!
    vadd.s32    q2,  q12, q14
    vadd.s32    q3,  q13, q15
    vld1.32     {d16-d17}, [r0,:128]
    vadd.s32    q1,  q1,  q2
    vld1.32     {d18-d19}, [r1,:128]
    vadd.s32    q8,  q8,  q9
    vadd.s32    q2,  q2,  q3
    vadd.s32    q3,  q3,  q8

    vtrn.32     q0,  q1
    vtrn.32     q2,  q3
    vswp        d1,  d4
    vswp        d3,  d6

//  s1=q0, s2=q1, ss=q2, s12=q3
    vmul.s32    q8,  q0,  q1    // s1*s2
    vmul.s32    q0,  q0,  q0
    vmla.s32    q0,  q1,  q1    // s1*s1 + s2*s2

    vshl.s32    q3,  q3,  #7
    vshl.s32    q2,  q2,  #6
    vadd.s32    q1,  q8,  q8

    mov         r3, #416        // ssim_c1 = .01*.01*255*255*64
    movconst    ip, 235963      // ssim_c2 = .03*.03*255*255*64*63
    vdup.32     q14, r3
    vdup.32     q15, ip

    vsub.s32    q2,  q2,  q0    // vars
    vsub.s32    q3,  q3,  q1    // covar*2
    vadd.s32    q0,  q0,  q14
    vadd.s32    q2,  q2,  q15
    vadd.s32    q1,  q1,  q14
    vadd.s32    q3,  q3,  q15

    vcvt.f32.s32    q0,  q0
    vcvt.f32.s32    q2,  q2
    vcvt.f32.s32    q1,  q1
    vcvt.f32.s32    q3,  q3

    vmul.f32    q0,  q0,  q2
    vmul.f32    q1,  q1,  q3

    cmp         r2,  #4

    vdiv.f32    s0,  s4,  s0
    vdiv.f32    s1,  s5,  s1
    vdiv.f32    s2,  s6,  s2
    vdiv.f32    s3,  s7,  s3

    beq         ssim_skip
    movrel      r3,  mask_ff
    sub         r3,  r3,  r2,  lsl #2
    vld1.64     {d6-d7}, [r3]
    vand        q0,  q0,  q3
ssim_skip:
    vadd.f32    d0,  d0,  d1
    vpadd.f32   d0,  d0,  d0
    vmov.32     r0,  d0[0]
    bx          lr
.endfunc
x264-snapshot-20120103-2245-stable/common/arm/mc.h
/*****************************************************************************
 * mc.h: arm motion compensation
 *****************************************************************************
 * Copyright (C) 2009-2011 x264 project
 *
 * Authors: David Conrad <lessen42@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#ifndef X264_ARM_MC_H
#define X264_ARM_MC_H

void x264_mc_init_arm( int cpu, x264_mc_functions_t *pf );

#endif
x264-snapshot-20120103-2245-stable/common/arm/mc-c.c
/*****************************************************************************
 * mc-c.c: arm motion compensation
 *****************************************************************************
 * Copyright (C) 2009-2011 x264 project
 *
 * Authors: David Conrad <lessen42@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "common/common.h"
#include "mc.h"

void x264_prefetch_ref_arm( uint8_t *, int, int );
void x264_prefetch_fenc_arm( uint8_t *, int, uint8_t *, int, int );

void *x264_memcpy_aligned_neon( void * dst, const void * src, size_t n );
void x264_memzero_aligned_neon( void *dst, int n );

void x264_pixel_avg_16x16_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
void x264_pixel_avg_16x8_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
void x264_pixel_avg_8x16_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
void x264_pixel_avg_8x8_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
void x264_pixel_avg_8x4_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
void x264_pixel_avg_4x8_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
void x264_pixel_avg_4x4_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
void x264_pixel_avg_4x2_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );

void x264_pixel_avg2_w4_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int );
void x264_pixel_avg2_w8_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int );
void x264_pixel_avg2_w16_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int );
void x264_pixel_avg2_w20_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int );

#define MC_WEIGHT(func)\
void x264_mc_weight_w20##func##_neon( uint8_t *, int, uint8_t *, int, const x264_weight_t *, int );\
void x264_mc_weight_w16##func##_neon( uint8_t *, int, uint8_t *, int, const x264_weight_t *, int );\
void x264_mc_weight_w8##func##_neon( uint8_t *, int, uint8_t *, int, const x264_weight_t *, int );\
void x264_mc_weight_w4##func##_neon( uint8_t *, int, uint8_t *, int, const x264_weight_t *, int );\
\
static void (* const x264_mc##func##_wtab_neon[6])( uint8_t *, int, uint8_t *, int, const x264_weight_t *, int ) =\
{\
    x264_mc_weight_w4##func##_neon,\
    x264_mc_weight_w4##func##_neon,\
    x264_mc_weight_w8##func##_neon,\
    x264_mc_weight_w16##func##_neon,\
    x264_mc_weight_w16##func##_neon,\
    x264_mc_weight_w20##func##_neon,\
};
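/* Each wtab is indexed by width>>2 (4 -> 1, 8 -> 2, 16 -> 4, 20 -> 5);
 * width 12 reuses the w16 kernel and entry 0 duplicates w4.
 * MC_WEIGHT(_nodenom), for example, expands to the four
 * x264_mc_weight_w*_nodenom_neon prototypes plus the
 * x264_mc_nodenom_wtab_neon dispatch table. */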

MC_WEIGHT()
MC_WEIGHT(_nodenom)
MC_WEIGHT(_offsetadd)
MC_WEIGHT(_offsetsub)

void x264_mc_copy_w4_neon( uint8_t *, int, uint8_t *, int, int );
void x264_mc_copy_w8_neon( uint8_t *, int, uint8_t *, int, int );
void x264_mc_copy_w16_neon( uint8_t *, int, uint8_t *, int, int );
void x264_mc_copy_w16_aligned_neon( uint8_t *, int, uint8_t *, int, int );

void x264_mc_chroma_neon( uint8_t *, int, uint8_t *, int, int, int, int, int );
void x264_frame_init_lowres_core_neon( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int, int, int);

void x264_hpel_filter_v_neon( uint8_t *, uint8_t *, int16_t *, int, int );
void x264_hpel_filter_c_neon( uint8_t *, int16_t *, int );
void x264_hpel_filter_h_neon( uint8_t *, uint8_t *, int );

#if !HIGH_BIT_DEPTH
static void x264_weight_cache_neon( x264_t *h, x264_weight_t *w )
{
    if( w->i_scale == 1<<w->i_denom )
    {
        if( w->i_offset < 0 )
        {
            w->weightfn = x264_mc_offsetsub_wtab_neon;
            w->cachea[0] = -w->i_offset;
        }
        else
        {
            w->weightfn = x264_mc_offsetadd_wtab_neon;
            w->cachea[0] = w->i_offset;
        }
    }
    else if( !w->i_denom )
        w->weightfn = x264_mc_nodenom_wtab_neon;
    else
        w->weightfn = x264_mc_wtab_neon;
}

static void (* const x264_pixel_avg_wtab_neon[6])( uint8_t *, int, uint8_t *, int, uint8_t *, int ) =
{
    NULL,
    x264_pixel_avg2_w4_neon,
    x264_pixel_avg2_w8_neon,
    x264_pixel_avg2_w16_neon,   // no slower than w12, so no point in a separate function
    x264_pixel_avg2_w16_neon,
    x264_pixel_avg2_w20_neon,
};

static void (* const x264_mc_copy_wtab_neon[5])( uint8_t *, int, uint8_t *, int, int ) =
{
    NULL,
    x264_mc_copy_w4_neon,
    x264_mc_copy_w8_neon,
    NULL,
    x264_mc_copy_w16_neon,
};

static const uint8_t hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
static const uint8_t hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};

static void mc_luma_neon( uint8_t *dst,    int i_dst_stride,
                          uint8_t *src[4], int i_src_stride,
                          int mvx, int mvy,
                          int i_width, int i_height, const x264_weight_t *weight )
{
    int qpel_idx = ((mvy&3)<<2) + (mvx&3);
    int offset = (mvy>>2)*i_src_stride + (mvx>>2);
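    /* Bit 0 of qpel_idx is mvx&1 and bit 2 is mvy&1, so qpel_idx & 5 below
     * is nonzero exactly when either component sits at an odd quarter-pel
     * offset, which requires averaging two of the half-pel planes. */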
    uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset;
    if( (mvy&3) == 3 )              // explicit if() to force conditional add
        src1 += i_src_stride;

    if( qpel_idx & 5 ) /* qpel interpolation needed */
    {
        uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
        x264_pixel_avg_wtab_neon[i_width>>2](
                dst, i_dst_stride, src1, i_src_stride,
                src2, i_height );
        if( weight->weightfn )
            weight->weightfn[i_width>>2]( dst, i_dst_stride, dst, i_dst_stride, weight, i_height );
    }
    else if( weight->weightfn )
        weight->weightfn[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, weight, i_height );
    else
        x264_mc_copy_wtab_neon[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, i_height );
}

static uint8_t *get_ref_neon( uint8_t *dst,   int *i_dst_stride,
                              uint8_t *src[4], int i_src_stride,
                              int mvx, int mvy,
                              int i_width, int i_height, const x264_weight_t *weight )
{
    int qpel_idx = ((mvy&3)<<2) + (mvx&3);
    int offset = (mvy>>2)*i_src_stride + (mvx>>2);
    uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset;
    if( (mvy&3) == 3 )              // explicit if() to force conditional add
        src1 += i_src_stride;

    if( qpel_idx & 5 ) /* qpel interpolation needed */
    {
        uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
        x264_pixel_avg_wtab_neon[i_width>>2](
                dst, *i_dst_stride, src1, i_src_stride,
                src2, i_height );
        if( weight->weightfn )
            weight->weightfn[i_width>>2]( dst, *i_dst_stride, dst, *i_dst_stride, weight, i_height );
        return dst;
    }
    else if( weight->weightfn )
    {
        weight->weightfn[i_width>>2]( dst, *i_dst_stride, src1, i_src_stride, weight, i_height );
        return dst;
    }
    else
    {
        *i_dst_stride = i_src_stride;
        return src1;
    }
}

static void hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
                              int stride, int width, int height, int16_t *buf )
{
    int realign = (intptr_t)src & 15;
    src -= realign;
    dstv -= realign;
    dstc -= realign;
    dsth -= realign;
    width += realign;
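    /* src is rounded down to 16-byte alignment and the row widened to
     * match so the NEON kernels always see aligned pointers; the few extra
     * leading pixels are assumed to fall in the frame's left padding. */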
    while( height-- )
    {
        x264_hpel_filter_v_neon( dstv, src, buf+8, stride, width );
        x264_hpel_filter_c_neon( dstc, buf+8, width );
        x264_hpel_filter_h_neon( dsth, src, width );
        dsth += stride;
        dstv += stride;
        dstc += stride;
        src  += stride;
    }
}
#endif // !HIGH_BIT_DEPTH

void x264_mc_init_arm( int cpu, x264_mc_functions_t *pf )
{
    if( !(cpu&X264_CPU_ARMV6) )
        return;

#if !HIGH_BIT_DEPTH
    pf->prefetch_fenc_420 = x264_prefetch_fenc_arm;
    pf->prefetch_fenc_422 = x264_prefetch_fenc_arm; /* FIXME */
    pf->prefetch_ref  = x264_prefetch_ref_arm;
#endif // !HIGH_BIT_DEPTH

    if( !(cpu&X264_CPU_NEON) )
        return;

#if !HIGH_BIT_DEPTH
    pf->copy_16x16_unaligned = x264_mc_copy_w16_neon;
    pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_neon;
    pf->copy[PIXEL_8x8]   = x264_mc_copy_w8_neon;
    pf->copy[PIXEL_4x4]   = x264_mc_copy_w4_neon;

    pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_neon;
    pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_neon;
    pf->avg[PIXEL_8x16]  = x264_pixel_avg_8x16_neon;
    pf->avg[PIXEL_8x8]   = x264_pixel_avg_8x8_neon;
    pf->avg[PIXEL_8x4]   = x264_pixel_avg_8x4_neon;
    pf->avg[PIXEL_4x8]   = x264_pixel_avg_4x8_neon;
    pf->avg[PIXEL_4x4]   = x264_pixel_avg_4x4_neon;
    pf->avg[PIXEL_4x2]   = x264_pixel_avg_4x2_neon;

    pf->weight    = x264_mc_wtab_neon;
    pf->offsetadd = x264_mc_offsetadd_wtab_neon;
    pf->offsetsub = x264_mc_offsetsub_wtab_neon;
    pf->weight_cache = x264_weight_cache_neon;

//  pf->mc_chroma = x264_mc_chroma_neon;
    pf->mc_luma = mc_luma_neon;
    pf->get_ref = get_ref_neon;
    pf->hpel_filter = hpel_filter_neon;
    pf->frame_init_lowres_core = x264_frame_init_lowres_core_neon;
#endif // !HIGH_BIT_DEPTH

// Apple's gcc stupidly cannot align stack variables, and ALIGNED_ARRAY can't work on structs
#ifndef SYS_MACOSX
    pf->memcpy_aligned  = x264_memcpy_aligned_neon;
#endif
    pf->memzero_aligned = x264_memzero_aligned_neon;
}
x264-snapshot-20120103-2245-stable/common/arm/mc-a.S
/*****************************************************************************
 * mc.S: arm motion compensation
 *****************************************************************************
 * Copyright (C) 2009-2011 x264 project
 *
 * Authors: David Conrad <lessen42@gmail.com>
 *          Mans Rullgard <mans@mansr.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "asm.S"

.fpu neon
.text

// note: prefetch stuff assumes 64-byte cacheline, true for the Cortex-A8
// These functions also use nothing above armv5te, but we don't care about pre-armv6

// void prefetch_ref( uint8_t *pix, int stride, int parity )
function x264_prefetch_ref_arm
    sub         r2, r2, #1
    add         r0, r0, #64
    and         r2, r2, r1
    add         r0, r0, r2, lsl #3
    add         r2, r1, r1, lsl #1
    pld         [r0]
    pld         [r0, r1]
    pld         [r0, r1, lsl #1]
    add         r3, r0, r1, lsl #2
    pld         [r0, r2]
    pld         [r3]
    pld         [r3, r1]
    pld         [r3, r1, lsl #1]
    pld         [r3, r2]
    bx          lr
.endfunc

// void prefetch_fenc( uint8_t *pix_y, int stride_y,
//                     uint8_t *pix_uv, int stride_uv, int mb_x )
function x264_prefetch_fenc_arm
    ldr         ip, [sp]
    push        {lr}
    and         lr, ip, #3
    smulbb      lr, lr, r1      // note: this assumes stride_y is <= 16 bits signed
    and         ip, ip, #6
    smulbb      ip, ip, r3
    add         r0, r0, #64
    add         r2, r2, #64
    add         r0, r0, lr, lsl #2
    pld         [r0]
    add         lr, r0, r1, lsl #1
    pld         [r0, r1]
    pld         [lr]
    add         r2, r2, ip, lsl #2
    pld         [lr, r1]
    pld         [r2]
    add         ip, r2, r3, lsl #1
    pld         [r2, r3]
    pld         [ip]
    pld         [ip, r3]
    pop         {pc}
.endfunc


// void *x264_memcpy_aligned( void * dst, const void * src, size_t n )
function x264_memcpy_aligned_neon
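// Select a copy variant by the 8- vs 16-byte alignment of dst (r0) and
// src (r1): r0 OR'd with r1>>1, masked to 0xc, forms the byte offset into
// memcpy_table of the matching memcpy_aligned_*_*_neon below.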
    orr         r3,  r0,  r1,  lsr #1
    movrel      ip,  memcpy_table
    and         r3,  r3,  #0xc
    ldr         pc,  [ip, r3]
.endfunc

.macro MEMCPY_ALIGNED srcalign dstalign
function memcpy_aligned_\dstalign\()_\srcalign\()_neon
    mov         r3, r0
.if \srcalign == 8 && \dstalign == 8
    sub         r2, #16
    vld1.64     {d0}, [r1,:64]!
    vst1.64     {d0}, [r3,:64]!
    .set r1align, 128
    .set r3align, 128
.else
    .set r1align, \srcalign * 8
    .set r3align, \dstalign * 8
.endif
    tst         r2, #16
    beq         32f
    sub         r2, #16
    vld1.64     {d0-d1}, [r1,:r1align]!
    vst1.64     {d0-d1}, [r3,:r3align]!
32: // n is a multiple of 32
    tst         r2, #32
    beq         640f
    sub         r2, #32
    vld1.64     {d0-d3}, [r1,:r1align]!
    vst1.64     {d0-d3}, [r3,:r3align]!
640: // n is a multiple of 64
    cmp         r2, #0
    beq         1f
64:
    subs        r2, #64
    vld1.64     {d0-d3}, [r1,:r1align]!
    vld1.64     {d4-d7}, [r1,:r1align]!
    vst1.64     {d0-d3}, [r3,:r3align]!
    vst1.64     {d4-d7}, [r3,:r3align]!
    bgt         64b
1:   // end
.if \srcalign == 8 && \dstalign == 8
    vld1.64     {d0}, [r1,:64]!
    vst1.64     {d0}, [r3,:64]!
.endif
    bx          lr
.endfunc
.endm

MEMCPY_ALIGNED 16, 16
MEMCPY_ALIGNED 16, 8
MEMCPY_ALIGNED  8, 16
MEMCPY_ALIGNED  8, 8

.section .rodata
memcpy_table:
.word memcpy_aligned_16_16_neon
.word memcpy_aligned_16_8_neon
.word memcpy_aligned_8_16_neon
.word memcpy_aligned_8_8_neon
.text

.ltorg

// void x264_memzero_aligned( void *dst, size_t n )
function x264_memzero_aligned_neon
    vmov.i8     q0, #0
    vmov.i8     q1, #0
memzero_loop:
    subs        r1, #128
.rept 4
    vst1.64     {d0-d3}, [r0,:128]!
.endr
    bgt         memzero_loop
    bx          lr
.endfunc


// void pixel_avg( uint8_t *dst, int dst_stride,
//                 uint8_t *src1, int src1_stride,
//                 uint8_t *src2, int src2_stride, int weight );
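// Computes dst = ( src1*weight + src2*(64-weight) + 32 ) >> 6; weight == 32
// takes the plain rounded-average path, while the weighted variants cover
// 0 <= weight <= 64 (add_add), weight > 64 where 64-weight goes negative
// (add_sub), and weight < 0 (sub_add).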
.macro AVGH w h
function x264_pixel_avg_\w\()x\h\()_neon
    ldr         ip, [sp, #8]
    push        {r4-r6,lr}
    cmp         ip, #32
    ldrd        r4, [sp, #16]
    mov         lr, #\h
    beq         x264_pixel_avg_w\w\()_neon
    rsbs        r6,  ip,  #64
    blt         x264_pixel_avg_weight_w\w\()_add_sub_neon     // weight > 64
    cmp         ip,  #0
    bge         x264_pixel_avg_weight_w\w\()_add_add_neon
    b           x264_pixel_avg_weight_w\w\()_sub_add_neon     // weight < 0
.endfunc
.endm

AVGH  4, 2
AVGH  4, 4
AVGH  4, 8
AVGH  8, 4
AVGH  8, 8
AVGH  8, 16
AVGH 16, 8
AVGH 16, 16

// 0 < weight < 64
.macro load_weights_add_add
    vdup.8      d30, ip
    vdup.8      d31, r6
.endm

.macro load_add_add d1 d2
    vld1.32     {\d1}, [r2], r3
    vld1.32     {\d2}, [r4], r5
.endm

.macro weight_add_add dst s1 s2
    vmull.u8    \dst, \s1, d30
    vmlal.u8    \dst, \s2, d31
.endm

// weight > 64
.macro load_weights_add_sub
    rsb         r6,  #0
    vdup.8      d30, ip
    vdup.8      d31, r6
.endm

.macro load_add_sub d1 d2
    vld1.32     {\d1}, [r2], r3
    vld1.32     {\d2}, [r4], r5
.endm

.macro weight_add_sub dst s1 s2
    vmull.u8    \dst, \s1, d30
    vmlsl.u8    \dst, \s2, d31
.endm

// weight < 0
.macro load_weights_sub_add
    rsb         ip,  #0
    vdup.8      d31, r6
    vdup.8      d30, ip
.endm

.macro load_sub_add d1 d2
    vld1.32     {\d2}, [r4], r5
    vld1.32     {\d1}, [r2], r3
.endm

.macro weight_sub_add dst s1 s2
    vmull.u8    \dst, \s2, d31
    vmlsl.u8    \dst, \s1, d30
.endm

.macro AVG_WEIGHT ext
function x264_pixel_avg_weight_w4_\ext\()_neon
    load_weights_\ext
1:  // height loop
    subs            lr,  lr,  #2
    load_\ext       d0[], d1[]
    weight_\ext     q8,  d0,  d1
    load_\ext       d2[], d3[]
    vqrshrun.s16    d0,  q8,  #6
    weight_\ext     q9,  d2,  d3
    vst1.32         {d0[0]}, [r0,:32], r1
    vqrshrun.s16    d1,  q9,  #6
    vst1.32         {d1[0]}, [r0,:32], r1
    bgt             1b
    pop             {r4-r6,pc}
.endfunc

function x264_pixel_avg_weight_w8_\ext\()_neon
    load_weights_\ext
1:  // height loop
    subs            lr,  lr,  #4
    load_\ext       d0,  d1
    weight_\ext     q8,  d0,  d1
    load_\ext       d2,  d3
    weight_\ext     q9,  d2,  d3
    load_\ext       d4,  d5
    weight_\ext     q10, d4,  d5
    load_\ext       d6,  d7
    weight_\ext     q11, d6,  d7
    vqrshrun.s16    d0,  q8,  #6
    vqrshrun.s16    d1,  q9,  #6
    vqrshrun.s16    d2,  q10, #6
    vqrshrun.s16    d3,  q11, #6
    vst1.64         {d0}, [r0,:64], r1
    vst1.64         {d1}, [r0,:64], r1
    vst1.64         {d2}, [r0,:64], r1
    vst1.64         {d3}, [r0,:64], r1
    bgt             1b
    pop             {r4-r6,pc}
.endfunc

function x264_pixel_avg_weight_w16_\ext\()_neon
    load_weights_\ext
1:  // height loop
    subs            lr,  lr,  #2
    load_\ext       d0-d1, d2-d3
    weight_\ext     q8,  d0,  d2
    weight_\ext     q9,  d1,  d3
    load_\ext       d4-d5, d6-d7
    weight_\ext     q10, d4,  d6
    weight_\ext     q11, d5,  d7
    vqrshrun.s16    d0,  q8,  #6
    vqrshrun.s16    d1,  q9,  #6
    vqrshrun.s16    d2,  q10, #6
    vqrshrun.s16    d3,  q11, #6
    vst1.64         {d0-d1}, [r0,:128], r1
    vst1.64         {d2-d3}, [r0,:128], r1
    bgt             1b
    pop             {r4-r6,pc}
.endfunc
.endm

AVG_WEIGHT add_add
AVG_WEIGHT add_sub
AVG_WEIGHT sub_add

function x264_pixel_avg_w4_neon
    subs        lr,  lr,  #2
    vld1.32     {d0[]}, [r2], r3
    vld1.32     {d2[]}, [r4], r5
    vrhadd.u8   d0,  d0,  d2
    vld1.32     {d1[]}, [r2], r3
    vld1.32     {d3[]}, [r4], r5
    vrhadd.u8   d1,  d1,  d3
    vst1.32     {d0[0]}, [r0,:32], r1
    vst1.32     {d1[0]}, [r0,:32], r1
    bgt         x264_pixel_avg_w4_neon
    pop         {r4-r6,pc}
.endfunc

function x264_pixel_avg_w8_neon
    subs        lr,  lr,  #4
    vld1.64     {d0}, [r2], r3
    vld1.64     {d2}, [r4], r5
    vrhadd.u8   d0,  d0,  d2
    vld1.64     {d1}, [r2], r3
    vld1.64     {d3}, [r4], r5
    vrhadd.u8   d1,  d1,  d3
    vst1.64     {d0}, [r0,:64], r1
    vld1.64     {d2}, [r2], r3
    vld1.64     {d4}, [r4], r5
    vrhadd.u8   d2,  d2,  d4
    vst1.64     {d1}, [r0,:64], r1
    vld1.64     {d3}, [r2], r3
    vld1.64     {d5}, [r4], r5
    vrhadd.u8   d3,  d3,  d5
    vst1.64     {d2}, [r0,:64], r1
    vst1.64     {d3}, [r0,:64], r1
    bgt         x264_pixel_avg_w8_neon
    pop         {r4-r6,pc}
.endfunc

function x264_pixel_avg_w16_neon
    subs        lr,  lr,  #4
    vld1.64     {d0-d1}, [r2], r3
    vld1.64     {d2-d3}, [r4], r5
    vrhadd.u8   q0,  q0,  q1
    vld1.64     {d2-d3}, [r2], r3
    vld1.64     {d4-d5}, [r4], r5
    vrhadd.u8   q1,  q1,  q2
    vst1.64     {d0-d1}, [r0,:128], r1
    vld1.64     {d4-d5}, [r2], r3
    vld1.64     {d6-d7}, [r4], r5
    vrhadd.u8   q2,  q2,  q3
    vst1.64     {d2-d3}, [r0,:128], r1
    vld1.64     {d6-d7}, [r2], r3
    vld1.64     {d0-d1}, [r4], r5
    vrhadd.u8   q3,  q3,  q0
    vst1.64     {d4-d5}, [r0,:128], r1
    vst1.64     {d6-d7}, [r0,:128], r1
    bgt         x264_pixel_avg_w16_neon
    pop         {r4-r6,pc}
.endfunc


function x264_pixel_avg2_w4_neon
    ldr         ip,  [sp, #4]
    push        {lr}
    ldr         lr,  [sp, #4]
avg2_w4_loop:
    subs        ip,  ip,  #2
    vld1.32     {d0[]},  [r2], r3
    vld1.32     {d2[]},  [lr], r3
    vrhadd.u8   d0,  d0,  d2
    vld1.32     {d1[]},  [r2], r3
    vld1.32     {d3[]},  [lr], r3
    vrhadd.u8   d1,  d1,  d3
    vst1.32     {d0[0]}, [r0,:32], r1
    vst1.32     {d1[0]}, [r0,:32], r1
    bgt         avg2_w4_loop
    pop         {pc}
.endfunc

function x264_pixel_avg2_w8_neon
    ldr         ip,  [sp, #4]
    push        {lr}
    ldr         lr,  [sp, #4]
avg2_w8_loop:
    subs        ip,  ip,  #2
    vld1.64     {d0}, [r2], r3
    vld1.64     {d2}, [lr], r3
    vrhadd.u8   d0,  d0,  d2
    vld1.64     {d1}, [r2], r3
    vld1.64     {d3}, [lr], r3
    vrhadd.u8   d1,  d1,  d3
    vst1.64     {d0}, [r0,:64], r1
    vst1.64     {d1}, [r0,:64], r1
    bgt         avg2_w8_loop
    pop         {pc}
.endfunc

function x264_pixel_avg2_w16_neon
    ldr         ip,  [sp, #4]
    push        {lr}
    ldr         lr,  [sp, #4]
avg2_w16_loop:
    subs        ip,  ip,  #2
    vld1.64     {d0-d1}, [r2], r3
    vld1.64     {d2-d3}, [lr], r3
    vrhadd.u8   q0,  q0,  q1
    vld1.64     {d4-d5}, [r2], r3
    vld1.64     {d6-d7}, [lr], r3
    vrhadd.u8   q2,  q2,  q3
    vst1.64     {d0-d1}, [r0,:128], r1
    vst1.64     {d4-d5}, [r0,:128], r1
    bgt         avg2_w16_loop
    pop         {pc}
.endfunc

function x264_pixel_avg2_w20_neon
    ldr         ip,  [sp, #4]
    push        {lr}
    sub         r1,  r1,  #16
    ldr         lr,  [sp, #4]
avg2_w20_loop:
    subs        ip,  ip,  #2
    vld1.64     {d0-d2},  [r2], r3
    vld1.64     {d4-d6},  [lr], r3
    vrhadd.u8   q0,  q0,  q2
    vrhadd.u8   d2,  d2,  d6
    vld1.64     {d4-d6},  [r2], r3
    vld1.64     {d16-d18},[lr], r3
    vrhadd.u8   q2,  q2,  q8
    vst1.64     {d0-d1},  [r0,:128]!
    vrhadd.u8   d6,  d6,  d18
    vst1.32     {d2[0]},  [r0,:32], r1
    vst1.64     {d4-d5},  [r0,:128]!
    vst1.32     {d6[0]},  [r0,:32], r1
    bgt         avg2_w20_loop
    pop         {pc}
.endfunc


.macro weight_prologue type
    push        {r4-r5,lr}
    ldr         r4,  [sp, #4*3]     // weight_t
    ldr         ip,  [sp, #4*3+4]   // h
.ifc \type, full
    ldr         lr,  [r4, #32]      // denom
.endif
    ldrd        r4,  [r4, #32+4]    // scale, offset
    vdup.16     q0,  r4
    vdup.16     q1,  r5
.ifc \type, full
    rsb         lr,  lr,  #0
    vdup.16     q2,  lr
.endif
.endm

// void mc_weight( uint8_t *src, int src_stride, uint8_t *dst, int dst_stride,
//                 const x264_weight_t *weight, int height )
function x264_mc_weight_w20_neon
    weight_prologue full
    sub         r1, #16
weight20_loop:
    subs        ip,  #2
    vld1.8      {d17-d19}, [r2], r3
    vmovl.u8    q10, d17
    vmovl.u8    q11, d18
    vmovl.u8    q14, d19
    vld1.8      {d16-d18}, [r2], r3
    vmovl.u8    q12, d16
    vmovl.u8    q13, d17
    vmovl.u8    q15, d18
    vmul.s16    q10, q10, q0
    vmul.s16    q11, q11, q0
    vmul.s16    q12, q12, q0
    vmul.s16    q13, q13, q0
    vmul.s16    d28, d28, d0
    vmul.s16    d29, d30, d0
    vrshl.s16   q10, q10, q2
    vrshl.s16   q11, q11, q2
    vrshl.s16   q12, q12, q2
    vrshl.s16   q13, q13, q2
    vrshl.s16   q14, q14, q2
    vadd.s16    q10, q10, q1
    vadd.s16    q11, q11, q1
    vadd.s16    q12, q12, q1
    vadd.s16    q13, q13, q1
    vadd.s16    q14, q14, q1
    vqmovun.s16 d16, q10
    vqmovun.s16 d17, q11
    vqmovun.s16 d18, q12
    vqmovun.s16 d19, q13
    vqmovun.s16 d20, q14
    vst1.8      {d16-d17}, [r0,:128]!
    vst1.32     {d20[0]},  [r0,:32], r1
    vst1.8      {d18-d19}, [r0,:128]!
    vst1.32     {d20[1]},  [r0,:32], r1
    bgt         weight20_loop
    pop         {r4-r5,pc}
.endfunc

function x264_mc_weight_w16_neon
    weight_prologue full
weight16_loop:
    subs        ip,  #2
    vld1.8      {d16-d17}, [r2], r3
    vld1.8      {d18-d19}, [r2], r3
    vmovl.u8    q10, d16
    vmovl.u8    q11, d17
    vmovl.u8    q12, d18
    vmovl.u8    q13, d19
    vmul.s16    q10, q10, q0
    vmul.s16    q11, q11, q0
    vmul.s16    q12, q12, q0
    vmul.s16    q13, q13, q0
    vrshl.s16   q10, q10, q2
    vrshl.s16   q11, q11, q2
    vrshl.s16   q12, q12, q2
    vrshl.s16   q13, q13, q2
    vadd.s16    q10, q10, q1
    vadd.s16    q11, q11, q1
    vadd.s16    q12, q12, q1
    vadd.s16    q13, q13, q1
    vqmovun.s16 d16, q10
    vqmovun.s16 d17, q11
    vqmovun.s16 d18, q12
    vqmovun.s16 d19, q13
    vst1.8      {d16-d17}, [r0,:128], r1
    vst1.8      {d18-d19}, [r0,:128], r1
    bgt         weight16_loop
    pop         {r4-r5,pc}
.endfunc

function x264_mc_weight_w8_neon
    weight_prologue full
weight8_loop:
    subs        ip,  #2
    vld1.8      {d16}, [r2], r3
    vld1.8      {d18}, [r2], r3
    vmovl.u8    q8,  d16
    vmovl.u8    q9,  d18
    vmul.s16    q8,  q8,  q0
    vmul.s16    q9,  q9,  q0
    vrshl.s16   q8,  q8,  q2
    vrshl.s16   q9,  q9,  q2
    vadd.s16    q8,  q8,  q1
    vadd.s16    q9,  q9,  q1
    vqmovun.s16 d16, q8
    vqmovun.s16 d18, q9
    vst1.8      {d16}, [r0,:64], r1
    vst1.8      {d18}, [r0,:64], r1
    bgt         weight8_loop
    pop         {r4-r5,pc}
.endfunc

function x264_mc_weight_w4_neon
    weight_prologue full
weight4_loop:
    subs        ip,  #2
    vld1.32     {d16[]}, [r2], r3
    vld1.32     {d18[]}, [r2], r3
    vmovl.u8    q8,  d16
    vmovl.u8    q9,  d18
    vmul.s16    d16, d16, d0
    vmul.s16    d17, d18, d0
    vrshl.s16   q8,  q8,  q2
    vadd.s16    q8,  q8,  q1
    vqmovun.s16 d16, q8
    vst1.32     {d16[0]}, [r0,:32], r1
    vst1.32     {d16[1]}, [r0,:32], r1
    bgt         weight4_loop
    pop         {r4-r5,pc}
.endfunc

function x264_mc_weight_w20_nodenom_neon
    weight_prologue nodenom
    sub         r1, #16
weight20_nodenom_loop:
    subs        ip,  #2
    vld1.8      {d17-d19}, [r2], r3
    vmovl.u8    q10, d17
    vmovl.u8    q11, d18
    vmovl.u8    q14, d19
    vld1.8      {d16-d18}, [r2], r3
    vmovl.u8    q12, d16
    vmovl.u8    q13, d17
    vmovl.u8    q15, d18
    vmov        q8,  q1
    vmov        q9,  q1
    vmla.s16    q8,  q10, q0
    vmla.s16    q9,  q11, q0
    vmov        q10, q1
    vmov        q11, q1
    vmla.s16    q10, q12, q0
    vmla.s16    q11, q13, q0
    vmov        q12, q1
    vmla.s16    d24, d28, d0
    vmla.s16    d25, d30, d0
    vqmovun.s16 d16, q8
    vqmovun.s16 d17, q9
    vqmovun.s16 d18, q10
    vqmovun.s16 d19, q11
    vqmovun.s16 d20, q12
    vst1.8      {d16-d17}, [r0,:128]!
    vst1.32     {d20[0]},  [r0,:32], r1
    vst1.8      {d18-d19}, [r0,:128]!
    vst1.32     {d20[1]},  [r0,:32], r1
    bgt         weight20_nodenom_loop
    pop         {r4-r5,pc}
.endfunc

function x264_mc_weight_w16_nodenom_neon
    weight_prologue nodenom
weight16_nodenom_loop:
    subs        ip,  #2
    vld1.8      {d16-d17}, [r2], r3
    vld1.8      {d18-d19}, [r2], r3
    vmovl.u8    q12, d16
    vmovl.u8    q13, d17
    vmovl.u8    q14, d18
    vmovl.u8    q15, d19
    vmov        q8,  q1
    vmov        q9,  q1
    vmov        q10, q1
    vmov        q11, q1
    vmla.s16    q8,  q12, q0
    vmla.s16    q9,  q13, q0
    vmla.s16    q10, q14, q0
    vmla.s16    q11, q15, q0
    vqmovun.s16 d16, q8
    vqmovun.s16 d17, q9
    vqmovun.s16 d18, q10
    vqmovun.s16 d19, q11
    vst1.8      {d16-d17}, [r0,:128], r1
    vst1.8      {d18-d19}, [r0,:128], r1
    bgt         weight16_nodenom_loop
    pop         {r4-r5,pc}
.endfunc

function x264_mc_weight_w8_nodenom_neon
    weight_prologue nodenom
weight8_nodenom_loop:
    subs        ip,  #2
    vld1.8      {d16}, [r2], r3
    vld1.8      {d18}, [r2], r3
    vmovl.u8    q8,  d16
    vmovl.u8    q9,  d18
    vmov        q10, q1
    vmov        q11, q1
    vmla.s16    q10, q8,  q0
    vmla.s16    q11, q9,  q0
    vqmovun.s16 d16, q10
    vqmovun.s16 d17, q11
    vst1.8      {d16}, [r0,:64], r1
    vst1.8      {d17}, [r0,:64], r1
    bgt         weight8_nodenom_loop
    pop         {r4-r5,pc}
.endfunc

function x264_mc_weight_w4_nodenom_neon
    weight_prologue nodenom
weight4_nodenom_loop:
    subs        ip,  #2
    vld1.32     {d16[]}, [r2], r3
    vld1.32     {d18[]}, [r2], r3
    vmovl.u8    q8,  d16
    vmovl.u8    q9,  d18
    vmov        q10, q1
    vmla.s16    d20, d16, d0
    vmla.s16    d21, d18, d0
    vqmovun.s16 d16, q10
    vst1.32     {d16[0]}, [r0,:32], r1
    vst1.32     {d16[1]}, [r0,:32], r1
    bgt         weight4_nodenom_loop
    pop         {r4-r5,pc}
.endfunc

.macro weight_simple_prologue
    push        {lr}
    ldr         lr,  [sp, #4]       // weight_t
    ldr         ip,  [sp, #8]       // h
    ldr         lr,  [lr]           // offset
    vdup.8      q1,  lr
.endm

.macro weight_simple name op
function x264_mc_weight_w20_\name\()_neon
    weight_simple_prologue
weight20_\name\()_loop:
    subs        ip,  #2
    vld1.8      {d16-d18}, [r2], r3
    vld1.8      {d19-d21}, [r2], r3
    \op         q8,  q8,  q1
    \op         q9,  q9,  q1
    \op         q10, q10, q1
    vst1.8      {d16-d18}, [r0,:64], r1
    vst1.8      {d19-d21}, [r0,:64], r1
    bgt         weight20_\name\()_loop
    pop         {pc}
.endfunc

function x264_mc_weight_w16_\name\()_neon
    weight_simple_prologue
weight16_\name\()_loop:
    subs        ip,  #2
    vld1.8      {d16-d17}, [r2], r3
    vld1.8      {d18-d19}, [r2], r3
    \op         q8,  q8,  q1
    \op         q9,  q9,  q1
    vst1.8      {d16-d17}, [r0,:128], r1
    vst1.8      {d18-d19}, [r0,:128], r1
    bgt         weight16_\name\()_loop
    pop         {pc}
.endfunc

function x264_mc_weight_w8_\name\()_neon
    weight_simple_prologue
weight8_\name\()_loop:
    subs        ip,  #2
    vld1.8      {d16}, [r2], r3
    vld1.8      {d17}, [r2], r3
    \op         q8,  q8,  q1
    vst1.8      {d16}, [r0,:64], r1
    vst1.8      {d17}, [r0,:64], r1
    bgt         weight8_\name\()_loop
    pop         {pc}
.endfunc

function x264_mc_weight_w4_\name\()_neon
    weight_simple_prologue
weight4_\name\()_loop:
    subs        ip,  #2
    vld1.32     {d16[]}, [r2], r3
    vld1.32     {d17[]}, [r2], r3
    \op         q8,  q8,  q1
    vst1.32     {d16[0]}, [r0,:32], r1
    vst1.32     {d17[0]}, [r0,:32], r1
    bgt         weight4_\name\()_loop
    pop         {pc}
.endfunc
.endm

weight_simple offsetadd, vqadd.u8
weight_simple offsetsub, vqsub.u8
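
// The offsetadd/offsetsub variants need no multiply at all:
//   dst[x] = x264_clip_uint8( src[x] + offset )   (resp. src[x] - offset)
// which the saturating vqadd.u8/vqsub.u8 provide directly, clip included.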


// void mc_copy( uint8_t *dst, int dst_stride, uint8_t *src, int src_stride, int height )
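// Equivalent C sketch (width is baked into each function's name):
//   for( int y = 0; y < height; y++, dst += dst_stride, src += src_stride )
//       memcpy( dst, src, width );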
function x264_mc_copy_w4_neon
    ldr         ip,  [sp]
copy_w4_loop:
    subs        ip,  ip,  #4
    vld1.32     {d0[]},  [r2], r3
    vld1.32     {d1[]},  [r2], r3
    vld1.32     {d2[]},  [r2], r3
    vld1.32     {d3[]},  [r2], r3
    vst1.32     {d0[0]}, [r0,:32], r1
    vst1.32     {d1[0]}, [r0,:32], r1
    vst1.32     {d2[0]}, [r0,:32], r1
    vst1.32     {d3[0]}, [r0,:32], r1
    bgt         copy_w4_loop
    bx          lr
.endfunc

function x264_mc_copy_w8_neon
    ldr         ip,  [sp]
copy_w8_loop:
    subs        ip,  ip,  #4
    vld1.32     {d0}, [r2], r3
    vld1.32     {d1}, [r2], r3
    vld1.32     {d2}, [r2], r3
    vld1.32     {d3}, [r2], r3
    vst1.32     {d0}, [r0,:64], r1
    vst1.32     {d1}, [r0,:64], r1
    vst1.32     {d2}, [r0,:64], r1
    vst1.32     {d3}, [r0,:64], r1
    bgt         copy_w8_loop
    bx          lr
.endfunc

function x264_mc_copy_w16_neon
    ldr         ip,  [sp]
copy_w16_loop:
    subs        ip,  ip,  #4
    vld1.32     {d0-d1}, [r2], r3
    vld1.32     {d2-d3}, [r2], r3
    vld1.32     {d4-d5}, [r2], r3
    vld1.32     {d6-d7}, [r2], r3
    vst1.32     {d0-d1}, [r0,:128], r1
    vst1.32     {d2-d3}, [r0,:128], r1
    vst1.32     {d4-d5}, [r0,:128], r1
    vst1.32     {d6-d7}, [r0,:128], r1
    bgt         copy_w16_loop
    bx          lr
.endfunc

function x264_mc_copy_w16_aligned_neon
    ldr         ip,  [sp]
copy_w16_aligned_loop:
    subs        ip,  ip,  #4
    vld1.32     {d0-d1}, [r2,:128], r3
    vld1.32     {d2-d3}, [r2,:128], r3
    vld1.32     {d4-d5}, [r2,:128], r3
    vld1.32     {d6-d7}, [r2,:128], r3
    vst1.32     {d0-d1}, [r0,:128], r1
    vst1.32     {d2-d3}, [r0,:128], r1
    vst1.32     {d4-d5}, [r0,:128], r1
    vst1.32     {d6-d7}, [r0,:128], r1
    bgt         copy_w16_aligned_loop
    bx          lr
.endfunc


// void x264_mc_chroma_neon( uint8_t *dst, int i_dst_stride,
//                           uint8_t *src, int i_src_stride,
//                           int dx, int dy, int i_width, int i_height );
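// Bilinear 1/8-pel chroma interpolation. With dx,dy in 0..7, CHROMA_MC_START
// below materializes the four weights
//   cA = (8-dx)*(8-dy)   cB = dx*(8-dy)
//   cC = (8-dx)*dy       cD = dx*dy
// so that each output pixel is (a sketch of the standard formula, not copied
// from the C source):
//   dst[x] = ( cA*src[x]  + cB*src[x+1]
//            + cC*src2[x] + cD*src2[x+1] + 32 ) >> 6;  // src2 = src + stride
// The 2f/4f paths below handle dx == 0 or dy == 0 with a 2-tap filter.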
function x264_mc_chroma_neon
    push            {r4-r6, lr}
    ldrd            r4,  [sp, #16]
    ldr             r6,  [sp, #24]

    asr             lr,  r5,  #3
    mul             lr,  r3,  lr
    add             r2,  r2,  r4,  asr #3
    cmp             r6, #4
    add             r2,  r2,  lr

    and             r4, r4, #7
    and             r5, r5, #7
    pld             [r2]
    pld             [r2, r3]

    bgt             mc_chroma_w8
    beq             mc_chroma_w4

// calculate cA cB cC cD
.macro CHROMA_MC_START r0 r1
    muls            lr,  r4,  r5
    rsb             r6,  lr,  r5,  lsl #3
    rsb             ip,  lr,  r4,  lsl #3
    sub             r4,  lr,  r4,  lsl #3
    sub             r4,  r4,  r5,  lsl #3
    add             r4,  r4,  #64

    beq             2f

    add             r5,  r2,  r3

    vdup.8          d0,  r4
    lsl             r3,  r3,  #1
    vdup.8          d1,  ip
    vld1.64         {\r0}, [r2], r3
    vdup.8          d2,  r6
    vld1.64         {\r1}, [r5], r3
    vdup.8          d3,  lr
    ldr             r4,  [sp, #28]

    vext.8          d5,  d4,  d5,  #1
    vext.8          d7,  d6,  d7,  #1
.endm

.macro CHROMA_MC width, align
mc_chroma_w\width:
    CHROMA_MC_START d4,  d6
// since the element size varies, there's a different index for the 2nd store
.if \width == 4
    .set st2, 1
.else
    .set st2, 2
.endif

    vtrn.32         d4,  d5
    vtrn.32         d6,  d7

    vtrn.32         d0,  d1
    vtrn.32         d2,  d3

1:  // height loop, interpolate xy
    pld             [r5]
    vmull.u8        q8,  d4,  d0
    vmlal.u8        q8,  d6,  d2
    vld1.64         {d4},     [r2], r3
    vext.8          d5,  d4,  d5,  #1
    vtrn.32         d4,  d5
    vmull.u8        q9,  d6,  d0
    vmlal.u8        q9,  d4,  d2
    vld1.64         {d6},     [r5], r3
    vadd.i16        d16, d16, d17
    vadd.i16        d17, d18, d19
    vrshrn.u16      d16, q8,  #6
    subs            r4,  r4,  #2
    pld             [r2]
    vext.8          d7,  d6,  d7,  #1
    vtrn.32         d6,  d7
    vst1.\align     {d16[0]},   [r0,:\align], r1
    vst1.\align     {d16[st2]}, [r0,:\align], r1
    bgt             1b

    pop             {r4-r6, pc}

2:  // dx or dy is 0
    tst             r6,  r6
    add             ip,  ip,  r6
    vdup.8          d0,  r4
    vdup.8          d1,  ip
    vtrn.32         d0,  d1
    ldr             r4,  [sp, #28]

    beq             4f

    vext.32         d1,  d0,  d1,  #1
    add             r5,  r2,  r3
    lsl             r3,  r3,  #1
    vld1.32         {d4[0]},  [r2], r3
    vld1.32         {d4[1]},  [r5], r3

3:  // vertical interpolation loop
    pld             [r5]
    vmull.u8        q8,  d4,  d0
    vld1.32         {d4[0]},  [r2], r3
    vmull.u8        q9,  d4,  d1
    vld1.32         {d4[1]},  [r5], r3
    vadd.i16        d16, d16, d17
    vadd.i16        d17, d18, d19
    vrshrn.u16      d16, q8,  #6
    subs            r4,  r4,  #2
    pld             [r2]
    vst1.\align     {d16[0]},   [r0,:\align], r1
    vst1.\align     {d16[st2]}, [r0,:\align], r1
    bgt             3b

    pop             {r4-r6, pc}

4:  // dy is 0
    vld1.64         {d4},     [r2], r3
    vld1.64         {d6},     [r2], r3
    vext.8          d5,  d4,  d5,  #1
    vext.8          d7,  d6,  d7,  #1
    vtrn.32         d4,  d5
    vtrn.32         d6,  d7

5:  // horizontal interpolation loop
    vmull.u8        q8,  d4,  d0
    vmull.u8        q9,  d6,  d0
    subs            r4,  r4,  #2
    vld1.64         {d4},     [r2], r3
    vext.8          d5,  d4,  d5,  #1
    vtrn.32         d4,  d5
    vadd.i16        d16, d16, d17
    vadd.i16        d17, d18, d19
    pld             [r2]
    vrshrn.u16      d16, q8,  #6
    vld1.64         {d6},     [r2], r3
    vext.8          d7,  d6,  d7,  #1
    vtrn.32         d6,  d7
    pld             [r2]
    vst1.\align     {d16[0]},   [r0,:\align], r1
    vst1.\align     {d16[st2]}, [r0,:\align], r1
    bgt             5b

    pop             {r4-r6, pc}
.endm

    CHROMA_MC 2, 16
    CHROMA_MC 4, 32

// the optimal timing for width 8 is different enough that it's not
// readable to put it in the same macro as width 2/4
mc_chroma_w8:
    CHROMA_MC_START d4-d5, d6-d7

1:  // height loop, interpolate xy
    pld             [r5]
    vmull.u8        q8,  d4,  d0
    vmlal.u8        q8,  d5,  d1
    vld1.64         {d4, d5}, [r2], r3
    vmlal.u8        q8,  d6,  d2
    vext.8          d5,  d4,  d5,  #1
    vmlal.u8        q8,  d7,  d3
    vmull.u8        q9,  d6,  d0
    subs            r4,  r4,  #2
    vmlal.u8        q9,  d7,  d1
    vmlal.u8        q9,  d4,  d2
    vmlal.u8        q9,  d5,  d3
    vrshrn.u16      d16, q8,  #6
    vld1.64         {d6, d7}, [r5], r3
    pld             [r2]
    vrshrn.u16      d17, q9,  #6
    vext.8          d7,  d6,  d7,  #1
    vst1.64         {d16}, [r0,:64], r1
    vst1.64         {d17}, [r0,:64], r1
    bgt             1b

    pop             {r4-r6, pc}

2:  // dx or dy is 0
    tst             r6,  r6
    add             ip,  ip,  r6
    vdup.8          d0,  r4
    vdup.8          d1,  ip
    ldr             r4,  [sp, #28]

    beq             4f

    add             r5,  r2,  r3
    lsl             r3,  r3,  #1
    vld1.64         {d4}, [r2], r3
    vld1.64         {d6}, [r5], r3

3:  // vertical interpolation loop
    pld             [r5]
    vmull.u8        q8,  d4,  d0
    vmlal.u8        q8,  d6,  d1
    vld1.64         {d4}, [r2], r3
    vmull.u8        q9,  d6,  d0
    vmlal.u8        q9,  d4,  d1
    vld1.64         {d6}, [r5], r3
    vrshrn.u16      d16, q8,  #6
    vrshrn.u16      d17, q9,  #6
    subs            r4,  r4,  #2
    pld             [r2]
    vst1.64         {d16}, [r0,:64], r1
    vst1.64         {d17}, [r0,:64], r1
    bgt             3b

    pop             {r4-r6, pc}

4:  // dy is 0
    vld1.64         {d4, d5}, [r2], r3
    vld1.64         {d6, d7}, [r2], r3
    vext.8          d5,  d4,  d5,  #1
    vext.8          d7,  d6,  d7,  #1

5:  // horizontal interpolation loop
    pld             [r2]
    subs            r4,  r4,  #2
    vmull.u8        q8,  d4,  d0
    vmlal.u8        q8,  d5,  d1
    vld1.64         {d4,  d5}, [r2], r3
    vmull.u8        q9,  d6,  d0
    vmlal.u8        q9,  d7,  d1
    pld             [r2]
    vext.8          d5,  d4,  d5,  #1
    vrshrn.u16      d16, q8,  #6
    vrshrn.u16      d17, q9,  #6
    vld1.64         {d6, d7}, [r2], r3
    vext.8          d7,  d6,  d7,  #1
    vst1.64         {d16}, [r0,:64], r1
    vst1.64         {d17}, [r0,:64], r1
    bgt             5b

    pop             {r4-r6, pc}
.endfunc


// hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width );
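// 6-tap vertical half-pel filter (a sketch of the standard H.264 formula,
// per column, with rows r[-2..3] around the half-pel position):
//   v      = r[-2] - 5*r[-1] + 20*r[0] + 20*r[1] - 5*r[2] + r[3]
//   buf[x] = v;                                 // 16-bit intermediate
//   dst[x] = x264_clip_uint8( (v + 16) >> 5 );  // vqrshrun #5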
function x264_hpel_filter_v_neon
    ldr             ip,  [sp]
    sub             r1,  r1,  r3,  lsl #1
    push            {lr}
    add             lr,  r1,  ip
    vmov.u8         d30, #5
    vmov.u8         d31, #20

filter_v_loop:
    subs            ip,  ip,  #16
    vld1.64         {d0-d1},   [r1,:128], r3
    vld1.64         {d2-d3},   [r1,:128], r3
    vld1.64         {d4-d5},   [r1,:128], r3
    vld1.64         {d6-d7},   [r1,:128], r3
    vld1.64         {d16-d17}, [r1,:128], r3
    vld1.64         {d18-d19}, [r1,:128], r3
    sub             r1,  lr,  ip

    vaddl.u8        q10, d0,  d18
    vmlsl.u8        q10, d2,  d30
    vmlal.u8        q10, d4,  d31
    vmlal.u8        q10, d6,  d31
    vmlsl.u8        q10, d16, d30

    vaddl.u8        q11, d1,  d19
    vmlsl.u8        q11, d3,  d30
    vmlal.u8        q11, d5,  d31
    vmlal.u8        q11, d7,  d31
    vmlsl.u8        q11, d17, d30

    vqrshrun.s16    d0,  q10, #5
    vst1.64         {d20-d21}, [r2,:128]!
    vqrshrun.s16    d1,  q11, #5
    vst1.64         {d22-d23}, [r2,:128]!
    vst1.64         {d0-d1},   [r0,:128]!
    bgt             filter_v_loop
    pop             {pc}
.endfunc

// hpel_filter_c( uint8_t *dst, int16_t *buf, int width );
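// Center filter over the 16-bit intermediates produced by hpel_filter_v.
// To stay within 16 bits the 6-tap sum is factored (cf. the inline
// comments): with a = buf[x-2]+buf[x+3], b = buf[x-1]+buf[x+2],
// c = buf[x]+buf[x+1],
//   ((a-b)/4 - b + c)/4 + c  ==  (a - 5*b + 20*c) / 16
// and the trailing vqrshrun #6 supplies the remaining rounding shift plus
// the clip to uint8_t.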
function x264_hpel_filter_c_neon
    sub             r1,  #16
    vld1.64         {d0-d3}, [r1,:128]!

    // unrolled 2x: 4% faster
filter_c_loop:
    subs            r2,  r2,  #16
    vld1.64         {d4-d7}, [r1,:128]!
    vext.16         q8,  q0,  q1,  #6
    vext.16         q12, q1,  q2,  #3
    vadd.s16        q8,  q8,  q12
    vext.16         q9,  q0,  q1,  #7
    vext.16         q11, q1,  q2,  #2
    vadd.s16        q9,  q9,  q11
    vext.16         q10, q1,  q2,  #1
    vext.16         q11, q1,  q2,  #6
    vadd.s16        q10, q1,  q10
    vsub.s16        q8,  q8,  q9    // a-b
    vext.16         q15, q2,  q3,  #3
    vsub.s16        q9,  q9,  q10   // b-c

    vext.16         q12, q1,  q2,  #7
    vshr.s16        q8,  q8,  #2    // (a-b)/4
    vadd.s16        q11, q11, q15
    vext.16         q14, q2,  q3,  #2
    vsub.s16        q8,  q8,  q9    // (a-b)/4-b+c
    vadd.s16        q12, q12, q14
    vext.16         q13, q2,  q3,  #1

    vshr.s16        q8,  q8,  #2    // ((a-b)/4-b+c)/4
    vadd.s16        q13, q2,  q13
    vadd.s16        q8,  q8,  q10   // ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
    vsub.s16        q11, q11, q12   // a-b
    vsub.s16        q12, q12, q13   // b-c
    vshr.s16        q11, q11, #2    // (a-b)/4
    vqrshrun.s16    d30, q8,  #6
    vsub.s16        q11, q11, q12   // (a-b)/4-b+c
    vshr.s16        q11, q11, #2    // ((a-b)/4-b+c)/4
    vld1.64         {d0-d3}, [r1,:128]!
    vadd.s16        q11, q11, q13   // ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16

    vext.16         q8,  q2,  q3,  #6
    vqrshrun.s16    d31, q11,  #6
    vext.16         q12, q3,  q0,  #3
    vadd.s16        q8,  q8,  q12
    vext.16         q9,  q2,  q3,  #7
    vst1.64         {d30-d31}, [r0,:128]!
    bxle            lr
    subs            r2,  r2,  #16

    vext.16         q11, q3,  q0,  #2
    vadd.s16        q9,  q9,  q11
    vext.16         q10, q3,  q0,  #1
    vext.16         q11, q3,  q0,  #6
    vadd.s16        q10, q3,  q10
    vsub.s16        q8,  q8,  q9    // a-b
    vext.16         q15, q0,  q1,  #3
    vsub.s16        q9,  q9,  q10   // b-c

    vext.16         q12, q3,  q0,  #7
    vshr.s16        q8,  q8,  #2    // (a-b)/4
    vadd.s16        q11, q11, q15
    vext.16         q14, q0,  q1,  #2
    vsub.s16        q8,  q8,  q9    // (a-b)/4-b+c
    vadd.s16        q12, q12, q14
    vext.16         q13, q0,  q1,  #1

    vshr.s16        q8,  q8,  #2    // ((a-b)/4-b+c)/4
    vadd.s16        q13, q0,  q13
    vadd.s16        q8,  q8,  q10   // ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
    vsub.s16        q11, q11, q12   // a-b
    vsub.s16        q12, q12, q13   // b-c
    vshr.s16        q11, q11, #2    // (a-b)/4
    vqrshrun.s16    d30, q8,  #6
    vsub.s16        q11, q11, q12   // (a-b)/4-b+c
    vshr.s16        q11, q11, #2    // ((a-b)/4-b+c)/4
    vadd.s16        q11, q11, q13   // ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16

    vqrshrun.s16    d31, q11,  #6
    vst1.64         {d30-d31}, [r0,:128]!
    bgt             filter_c_loop
    bx              lr
.endfunc

// hpel_filter_h( uint8_t *dst, uint8_t *src, int width );
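// 6-tap horizontal half-pel filter (a sketch of the standard formula):
//   dst[x] = x264_clip_uint8( ( src[x-2] + src[x+3]
//                             - 5*(src[x-1] + src[x+2])
//                             + 20*(src[x] + src[x+1]) + 16 ) >> 5 );
// vaddl pairs the outer taps, vmlal/vmlsl apply the *20 / -5 weights held
// in d31/d30, and vqrshrun #5 rounds and clips.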
function x264_hpel_filter_h_neon
    sub             r1,  #16
    vmov.u8         d30, #5
    vld1.64         {d0-d3}, [r1,:128]!
    vmov.u8         d31, #20

    // unrolled 3x: 5% faster, since it mitigates the high latency of
    // multiplication and vqrshrun
filter_h_loop:
    subs            r2,  r2,  #16
    vld1.64         {d4-d5}, [r1,:128]!
    vext.8          q8,  q0,  q1,  #14
    vext.8          q12, q1,  q2,  #3
    vaddl.u8        q13, d16, d24
    vext.8          q9,  q0,  q1,  #15
    vaddl.u8        q14, d17, d25

    vext.8          q10, q1,  q2,  #1
    vmlal.u8        q13, d2,  d31
    vmlsl.u8        q13, d18, d30
    vext.8          q11, q1,  q2,  #2
    vmlal.u8        q13, d20, d31
    vmlsl.u8        q13, d22, d30

    vmlsl.u8        q14, d19, d30
    vmlal.u8        q14, d3,  d31
    vmlal.u8        q14, d21, d31
    vmlsl.u8        q14, d23, d30
    vqrshrun.s16    d6,  q13, #5

    vld1.64         {d0-d1}, [r1,:128]!
    vext.8          q8,  q1,  q2,  #14
    vext.8          q12, q2,  q0,  #3
    vaddl.u8        q13, d16, d24
    vqrshrun.s16    d7,  q14, #5
    vext.8          q9,  q1,  q2,  #15
    vaddl.u8        q14, d17, d25

    vst1.64         {d6-d7}, [r0,:128]!
    bxle            lr
    subs            r2,  r2,  #16

    vext.8          q10, q2,  q0,  #1
    vmlal.u8        q13, d4,  d31
    vmlsl.u8        q13, d18, d30
    vext.8          q11, q2,  q0,  #2
    vmlal.u8        q13, d20, d31
    vmlsl.u8        q13, d22, d30

    vmlsl.u8        q14, d19, d30
    vmlal.u8        q14, d5,  d31
    vmlal.u8        q14, d21, d31
    vmlsl.u8        q14, d23, d30
    vqrshrun.s16    d6,  q13, #5

    vld1.64         {d2-d3}, [r1,:128]!
    vext.8          q8,  q2,  q0,  #14
    vext.8          q12, q0,  q1,  #3
    vaddl.u8        q13, d16, d24
    vqrshrun.s16    d7,  q14, #5
    vext.8          q9,  q2,  q0,  #15
    vaddl.u8        q14, d17, d25

    vst1.64         {d6-d7}, [r0,:128]!
    bxle            lr
    subs            r2,  r2,  #16

    vext.8          q10, q0,  q1,  #1
    vmlal.u8        q13, d0,  d31
    vmlsl.u8        q13, d18, d30
    vext.8          q11, q0,  q1,  #2
    vmlal.u8        q13, d20, d31
    vmlsl.u8        q13, d22, d30

    vmlsl.u8        q14, d19, d30
    vmlal.u8        q14, d1,  d31
    vmlal.u8        q14, d21, d31
    vmlsl.u8        q14, d23, d30

    vqrshrun.s16    d6, q13, #5
    vqrshrun.s16    d7, q14, #5
    vst1.64         {d6-d7}, [r0,:128]!
    bgt             filter_h_loop
    bx              lr
.endfunc


// frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv,
//                         uint8_t *dstc, int src_stride, int dst_stride, int width,
//                         int height )
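// Each output plane is a half-resolution image at one of the four half-pel
// phases. C sketch (an assumption, mirroring x264's scalar fallback):
//   #define FILTER(a,b,c,d) ( (((a+b+1)>>1) + ((c+d+1)>>1) + 1) >> 1 )
//   uint8_t *src1 = src0 + src_stride, *src2 = src1 + src_stride;
//   dst0[x] = FILTER( src0[2*x],   src1[2*x],   src0[2*x+1], src1[2*x+1] );
//   dsth[x] = FILTER( src0[2*x+1], src1[2*x+1], src0[2*x+2], src1[2*x+2] );
//   dstv[x] = FILTER( src1[2*x],   src2[2*x],   src1[2*x+1], src2[2*x+1] );
//   dstc[x] = FILTER( src1[2*x+1], src2[2*x+1], src1[2*x+2], src2[2*x+2] );
// The vrhadd/vext chains below implement exactly these rounded averages.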
function x264_frame_init_lowres_core_neon
    push            {r4-r10,lr}
    vpush           {d8-d15}
    ldrd            r4,  [sp, #96]
    ldrd            r6,  [sp, #104]
    ldr             lr,  [sp, #112]
    sub             r10, r6,  r7            // dst_stride - width
    and             r10, r10, #~15

lowres_yloop:
    mov             ip,  r7                 // width
    mov             r6,  r0                 // src0
    add             r8,  r0,  r5            // src1 = src0 + src_stride
    add             r9,  r0,  r5,  lsl #1   // src2 = src1 + src_stride

    vld2.8          {d8, d10}, [r6,:128]!
    vld2.8          {d12,d14}, [r8,:128]!
    vld2.8          {d16,d18}, [r9,:128]!

lowres_xloop:
    subs            ip,  ip,  #16

    vld2.8          {d9, d11}, [r6,:128]!
    vld2.8          {d13,d15}, [r8,:128]!
    vrhadd.u8       q0,  q4,  q6
    vld2.8          {d17,d19}, [r9,:128]!
    vrhadd.u8       q5,  q5,  q7
    vld2.8          {d20,d22}, [r6,:128]!
    vrhadd.u8       q1,  q6,  q8
    vld2.8          {d24,d26}, [r8,:128]!
    vrhadd.u8       q7,  q7,  q9
    vext.8          q4,  q4,  q10, #1
    vrhadd.u8       q0,  q0,  q5
    vext.8          q6,  q6,  q12, #1
    vrhadd.u8       q1,  q1,  q7
    vld2.8          {d28,d30}, [r9,:128]!
    vrhadd.u8       q4,  q4,  q6
    vext.8          q8,  q8,  q14, #1
    vrhadd.u8       q6,  q6,  q8
    vst1.64         {d0-d1},   [r1,:128]!
    vrhadd.u8       q2,  q4,  q5
    vst1.64         {d2-d3},   [r3,:128]!
    vrhadd.u8       q3,  q6,  q7
    vst1.64         {d4-d5},   [r2,:128]!
    vst1.64         {d6-d7},   [r4,:128]!

    ble             lowres_xloop_end
    subs            ip,  ip,  #16

    vld2.8          {d21,d23}, [r6,:128]!
    vld2.8          {d25,d27}, [r8,:128]!
    vrhadd.u8       q0,  q10, q12
    vld2.8          {d29,d31}, [r9,:128]!
    vrhadd.u8       q11, q11, q13
    vld2.8          {d8, d10}, [r6,:128]!
    vrhadd.u8       q1,  q12, q14
    vld2.8          {d12,d14}, [r8,:128]!
    vrhadd.u8       q13, q13, q15
    vext.8          q10, q10, q4,  #1
    vrhadd.u8       q0,  q0,  q11
    vext.8          q12, q12, q6,  #1
    vrhadd.u8       q1,  q1,  q13
    vld2.8          {d16,d18}, [r9,:128]!
    vrhadd.u8       q10, q10, q12
    vext.8          q14, q14, q8,  #1
    vrhadd.u8       q12, q12, q14
    vst1.64         {d0-d1},   [r1,:128]!
    vrhadd.u8       q2,  q10, q11
    vst1.64         {d2-d3},   [r3,:128]!
    vrhadd.u8       q3,  q12, q13
    vst1.64         {d4-d5},   [r2,:128]!
    vst1.64         {d6-d7},   [r4,:128]!

    bgt             lowres_xloop

lowres_xloop_end:
    subs            lr,  lr,  #1
    add             r0,  r0,  r5,  lsl #1
    add             r1,  r1,  r10
    add             r2,  r2,  r10
    add             r3,  r3,  r10
    add             r4,  r4,  r10
    bgt             lowres_yloop

    vpop            {d8-d15}
    pop             {r4-r10,pc}
.endfunc
x264-snapshot-20120103-2245-stable/common/arm/deblock-a.S
/*****************************************************************************
 * deblock.S: arm deblocking
 *****************************************************************************
 * Copyright (C) 2009-2011 x264 project
 *
 * Authors: Mans Rullgard <mans@mansr.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "asm.S"

.fpu neon

.macro h264_loop_filter_start
    ldr             ip,  [sp]
    ldr             ip,  [ip]
    vdup.32         d24, ip
    and             ip,  ip,  ip, lsl #16
    ands            ip,  ip,  ip, lsl #8
    bxlt            lr
.endm

.macro align_push_regs
    and             ip,  sp,  #15
    add             ip,  ip,  #32
    sub             sp,  sp,  ip
    vst1.64         {d12-d15}, [sp,:128]
    sub             sp,  sp,  #32
    vst1.64         {d8-d11},  [sp,:128]
.endm

.macro align_pop_regs
    vld1.64         {d8-d11},  [sp,:128]!
    vld1.64         {d12-d15}, [sp,:128], ip
.endm

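@ Standard luma deblock, applied to 16 pixels at once (a sketch of the
@ formula, not copied from the C source): with p1,p0 | q0,q1 across the
@ edge and tc the per-edge clip value,
@   delta = clip3( -tc, tc, ((q0 - p0)*4 + (p1 - q1) + 4) >> 3 )
@   p0' = clip_uint8( p0 + delta ),  q0' = clip_uint8( q0 - delta )
@ and p1/q1 are nudged toward (p2 + avg(p0,q0)) >> 1 when the secondary
@ |p2-p0| < beta / |q2-q0| < beta conditions hold.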
.macro h264_loop_filter_luma
    vdup.8          q11, r2         @ alpha
    vmovl.u8        q12, d24
    vabd.u8         q6,  q8,  q0    @ abs(p0 - q0)
    vmovl.u16       q12, d24
    vabd.u8         q14, q9,  q8    @ abs(p1 - p0)
    vsli.16         q12, q12, #8
    vabd.u8         q15, q1,  q0    @ abs(q1 - q0)
    vsli.32         q12, q12, #16
    vclt.u8         q6,  q6,  q11   @ < alpha
    vdup.8          q11, r3         @ beta
    vclt.s8         q7,  q12, #0
    vclt.u8         q14, q14, q11   @ < beta
    vclt.u8         q15, q15, q11   @ < beta
    vbic            q6,  q6,  q7
    vabd.u8         q4,  q10, q8    @ abs(p2 - p0)
    vand            q6,  q6,  q14
    vabd.u8         q5,  q2,  q0    @ abs(q2 - q0)
    vclt.u8         q4,  q4,  q11   @ < beta
    vand            q6,  q6,  q15
    vclt.u8         q5,  q5,  q11   @ < beta
    vand            q4,  q4,  q6
    vand            q5,  q5,  q6
    vand            q12, q12, q6
    vrhadd.u8       q14, q8,  q0
    vsub.i8         q6,  q12, q4
    vqadd.u8        q7,  q9,  q12
    vhadd.u8        q10, q10, q14
    vsub.i8         q6,  q6,  q5
    vhadd.u8        q14, q2,  q14
    vmin.u8         q7,  q7,  q10
    vqsub.u8        q11, q9,  q12
    vqadd.u8        q2,  q1,  q12
    vmax.u8         q7,  q7,  q11
    vqsub.u8        q11, q1,  q12
    vmin.u8         q14, q2,  q14
    vmovl.u8        q2,  d0
    vmax.u8         q14, q14, q11
    vmovl.u8        q10, d1
    vsubw.u8        q2,  q2,  d16
    vsubw.u8        q10, q10, d17
    vshl.i16        q2,  q2,  #2
    vshl.i16        q10, q10, #2
    vaddw.u8        q2,  q2,  d18
    vaddw.u8        q10, q10, d19
    vsubw.u8        q2,  q2,  d2
    vsubw.u8        q10, q10, d3
    vrshrn.i16      d4,  q2,  #3
    vrshrn.i16      d5,  q10, #3
    vbsl            q4,  q7,  q9
    vbsl            q5,  q14, q1
    vneg.s8         q7,  q6
    vmovl.u8        q14, d16
    vmin.s8         q2,  q2,  q6
    vmovl.u8        q6,  d17
    vmax.s8         q2,  q2,  q7
    vmovl.u8        q11, d0
    vmovl.u8        q12, d1
    vaddw.s8        q14, q14, d4
    vaddw.s8        q6,  q6,  d5
    vsubw.s8        q11, q11, d4
    vsubw.s8        q12, q12, d5
    vqmovun.s16     d16, q14
    vqmovun.s16     d17, q6
    vqmovun.s16     d0,  q11
    vqmovun.s16     d1,  q12
.endm

function x264_deblock_v_luma_neon
    h264_loop_filter_start

    vld1.64         {d0, d1},  [r0,:128], r1
    vld1.64         {d2, d3},  [r0,:128], r1
    vld1.64         {d4, d5},  [r0,:128], r1
    sub             r0,  r0,  r1, lsl #2
    sub             r0,  r0,  r1, lsl #1
    vld1.64         {d20,d21}, [r0,:128], r1
    vld1.64         {d18,d19}, [r0,:128], r1
    vld1.64         {d16,d17}, [r0,:128], r1

    align_push_regs

    h264_loop_filter_luma

    sub             r0,  r0,  r1, lsl #1
    vst1.64         {d8, d9},  [r0,:128], r1
    vst1.64         {d16,d17}, [r0,:128], r1
    vst1.64         {d0, d1},  [r0,:128], r1
    vst1.64         {d10,d11}, [r0,:128]

    align_pop_regs
    bx              lr
.endfunc

function x264_deblock_h_luma_neon
    h264_loop_filter_start

    sub             r0,  r0,  #4
    vld1.64         {d6},  [r0], r1
    vld1.64         {d20}, [r0], r1
    vld1.64         {d18}, [r0], r1
    vld1.64         {d16}, [r0], r1
    vld1.64         {d0},  [r0], r1
    vld1.64         {d2},  [r0], r1
    vld1.64         {d4},  [r0], r1
    vld1.64         {d26}, [r0], r1
    vld1.64         {d7},  [r0], r1
    vld1.64         {d21}, [r0], r1
    vld1.64         {d19}, [r0], r1
    vld1.64         {d17}, [r0], r1
    vld1.64         {d1},  [r0], r1
    vld1.64         {d3},  [r0], r1
    vld1.64         {d5},  [r0], r1
    vld1.64         {d27}, [r0], r1

    TRANSPOSE8x8    q3, q10, q9, q8, q0, q1, q2, q13

    align_push_regs

    h264_loop_filter_luma

    TRANSPOSE4x4    q4, q8, q0, q5

    sub             r0,  r0,  r1, lsl #4
    add             r0,  r0,  #2
    vst1.32         {d8[0]},  [r0], r1
    vst1.32         {d16[0]}, [r0], r1
    vst1.32         {d0[0]},  [r0], r1
    vst1.32         {d10[0]}, [r0], r1
    vst1.32         {d8[1]},  [r0], r1
    vst1.32         {d16[1]}, [r0], r1
    vst1.32         {d0[1]},  [r0], r1
    vst1.32         {d10[1]}, [r0], r1
    vst1.32         {d9[0]},  [r0], r1
    vst1.32         {d17[0]}, [r0], r1
    vst1.32         {d1[0]},  [r0], r1
    vst1.32         {d11[0]}, [r0], r1
    vst1.32         {d9[1]},  [r0], r1
    vst1.32         {d17[1]}, [r0], r1
    vst1.32         {d1[1]},  [r0], r1
    vst1.32         {d11[1]}, [r0], r1

    align_pop_regs
    bx              lr
.endfunc

.macro h264_loop_filter_chroma
    vdup.8          q11, r2         // alpha
    vmovl.u8        q12, d24
    vabd.u8         q13, q8,  q0    // abs(p0 - q0)
    vabd.u8         q14, q9,  q8    // abs(p1 - p0)
    vsubl.u8        q2,  d0,  d16
    vsubl.u8        q3,  d1,  d17
    vsli.16         q12, q12, #8
    vshl.i16        q2,  q2,  #2
    vshl.i16        q3,  q3,  #2
    vabd.u8         q15, q1,  q0    // abs(q1 - q0)
    vaddw.u8        q2,  q2,  d18
    vaddw.u8        q3,  q3,  d19
    vclt.u8         q13, q13, q11   // < alpha
    vsubw.u8        q2,  q2,  d2
    vsubw.u8        q3,  q3,  d3
    vdup.8          q11, r3         // beta
    vclt.s8         q10, q12, #0
    vrshrn.i16      d4,  q2,  #3
    vrshrn.i16      d5,  q3,  #3
    vclt.u8         q14, q14, q11   // < beta
    vbic            q13, q13, q10
    vclt.u8         q15, q15, q11   // < beta
    vand            q13, q13, q14
    vneg.s8         q10, q12
    vand            q13, q13, q15
    vmin.s8         q2,  q2,  q12
    vmovl.u8        q14, d16
    vand            q2,  q2,  q13
    vmovl.u8        q15, d17
    vmax.s8         q2,  q2,  q10
    vmovl.u8        q11, d0
    vmovl.u8        q12, d1
    vaddw.s8        q14, q14, d4
    vaddw.s8        q15, q15, d5
    vsubw.s8        q11, q11, d4
    vsubw.s8        q12, q12, d5
    vqmovun.s16     d16, q14
    vqmovun.s16     d17, q15
    vqmovun.s16     d0,  q11
    vqmovun.s16     d1,  q12
.endm

function x264_deblock_v_chroma_neon
    h264_loop_filter_start

    sub             r0,  r0,  r1, lsl #1
    vld2.8          {d18,d19}, [r0,:128], r1
    vld2.8          {d16,d17}, [r0,:128], r1
    vld2.8          {d0, d1},  [r0,:128], r1
    vld2.8          {d2, d3},  [r0,:128]

    h264_loop_filter_chroma

    sub             r0,  r0,  r1, lsl #1
    vst2.8          {d16,d17}, [r0,:128], r1
    vst2.8          {d0, d1},  [r0,:128], r1

    bx              lr
.endfunc

function x264_deblock_h_chroma_neon
    h264_loop_filter_start

    sub             r0,  r0,  #4
    vld1.8          {d18}, [r0], r1
    vld1.8          {d16}, [r0], r1
    vld1.8          {d0},  [r0], r1
    vld1.8          {d2},  [r0], r1
    vld1.8          {d19}, [r0], r1
    vld1.8          {d17}, [r0], r1
    vld1.8          {d1},  [r0], r1
    vld1.8          {d3},  [r0], r1

    vuzp.8          d18, d19
    vuzp.8          d16, d17
    vuzp.8          d0,  d1
    vuzp.8          d2,  d3

    vtrn.16         q9,  q0
    vtrn.16         q8,  q1
    vtrn.8          q9,  q8
    vtrn.8          q0,  q1

    h264_loop_filter_chroma

    vtrn.16         q9,  q0
    vtrn.16         q8,  q1
    vtrn.8          q9,  q8
    vtrn.8          q0,  q1

    vzip.8          d18, d19
    vzip.8          d16, d17
    vzip.8          d0,  d1
    vzip.8          d2,  d3

    sub             r0,  r0,  r1, lsl #3
    vst1.8          {d18}, [r0], r1
    vst1.8          {d16}, [r0], r1
    vst1.8          {d0},  [r0], r1
    vst1.8          {d2},  [r0], r1
    vst1.8          {d19}, [r0], r1
    vst1.8          {d17}, [r0], r1
    vst1.8          {d1},  [r0], r1
    vst1.8          {d3},  [r0], r1

    bx              lr
.endfunc
x264-snapshot-20120103-2245-stable/common/arm/dct.h
/*****************************************************************************
 * dct.h: arm transform and zigzag
 *****************************************************************************
 * Copyright (C) 2009-2011 x264 project
 *
 * Authors: David Conrad <lessen42@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#ifndef X264_ARM_DCT_H
#define X264_ARM_DCT_H

void x264_dct4x4dc_neon( int16_t d[16] );
void x264_idct4x4dc_neon( int16_t d[16] );

void x264_sub4x4_dct_neon( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_neon( int16_t dct[4][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct_neon( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );

void x264_add4x4_idct_neon( uint8_t *p_dst, int16_t dct[16] );
void x264_add8x8_idct_neon( uint8_t *p_dst, int16_t dct[4][16] );
void x264_add16x16_idct_neon( uint8_t *p_dst, int16_t dct[16][16] );

void x264_add8x8_idct_dc_neon( uint8_t *p_dst, int16_t dct[4] );
void x264_add16x16_idct_dc_neon( uint8_t *p_dst, int16_t dct[16] );
void x264_sub8x8_dct_dc_neon( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 );

void x264_sub8x8_dct8_neon( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct8_neon( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );

void x264_add8x8_idct8_neon( uint8_t *p_dst, int16_t dct[64] );
void x264_add16x16_idct8_neon( uint8_t *p_dst, int16_t dct[4][64] );

void x264_zigzag_scan_4x4_frame_neon( int16_t level[16], int16_t dct[16] );

#endif
x264-snapshot-20120103-2245-stable/common/arm/dct-a.S
/*****************************************************************************
 * dct-a.S: arm transform and zigzag
 *****************************************************************************
 * Copyright (C) 2009-2011 x264 project
 *
 * Authors: David Conrad <lessen42@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "asm.S"

.fpu neon

.section .rodata
.align 4

scan4x4_frame:
.byte    0,1,   8,9,   2,3,   4,5
.byte    2,3,   8,9,  16,17, 10,11
.byte   12,13,  6,7,  14,15, 20,21
.byte   10,11, 12,13,  6,7,  14,15

.text

// sum = a + (b>>shift)   sub = (a>>shift) - b
.macro SUMSUB_SHR shift sum sub a b t0 t1
    vshr.s16    \t0,  \b, #\shift
    vshr.s16    \t1,  \a, #\shift
    vadd.s16    \sum, \a, \t0
    vsub.s16    \sub, \t1, \b
.endm

// sum = (a>>shift) + b   sub = a - (b>>shift)
.macro SUMSUB_SHR2 shift sum sub a b t0 t1
    vshr.s16    \t0,  \a, #\shift
    vshr.s16    \t1,  \b, #\shift
    vadd.s16    \sum, \t0, \b
    vsub.s16    \sub, \a, \t1
.endm

// a += 1.5*ma   b -= 1.5*mb
.macro SUMSUB_15 a b ma mb t0 t1
    vshr.s16    \t0, \ma, #1
    vshr.s16    \t1, \mb, #1
    vadd.s16    \t0, \t0, \ma
    vadd.s16    \t1, \t1, \mb
    vadd.s16    \a,  \a,  \t0
    vsub.s16    \b,  \b,  \t1
.endm


function x264_dct4x4dc_neon
    vld1.64         {d0-d3}, [r0,:128]
    SUMSUB_ABCD     d4, d5, d6, d7, d0, d1, d2, d3
    SUMSUB_ABCD     d0, d2, d3, d1, d4, d6, d5, d7

    vmov.s16        d31, #1
    HADAMARD        1, sumsub, q2, q3, q0, q1
    vtrn.32         d4,  d5
    vadd.s16        d16, d4,  d31
    vtrn.32         d6,  d7
    vadd.s16        d17, d6,  d31
    vrhadd.s16      d0,  d4,  d5
    vhsub.s16       d1,  d16, d5
    vhsub.s16       d2,  d17, d7
    vrhadd.s16      d3,  d6,  d7
    vst1.64         {d0-d3}, [r0,:128]
    bx              lr
.endfunc

function x264_idct4x4dc_neon
    vld1.64         {d0-d3}, [r0,:128]
    SUMSUB_ABCD     d4, d5, d6, d7, d0, d1, d2, d3
    SUMSUB_ABCD     d0, d2, d3, d1, d4, d6, d5, d7

    HADAMARD        1, sumsub, q2, q3, q0, q1
    HADAMARD        2, sumsub, d0, d1, d4, d5
    HADAMARD        2, sumsub, d3, d2, d6, d7
    vst1.64         {d0-d3}, [r0,:128]
    bx              lr
.endfunc


.macro DCT_1D d0 d1 d2 d3  d4 d5 d6 d7
    SUMSUB_AB       \d1, \d6, \d5, \d6
    SUMSUB_AB       \d3, \d7, \d4, \d7
    vadd.s16        \d0, \d3, \d1
    vadd.s16        \d4, \d7, \d7
    vadd.s16        \d5, \d6, \d6
    vsub.s16        \d2, \d3, \d1
    vadd.s16        \d1, \d4, \d6
    vsub.s16        \d3, \d7, \d5
.endm
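
// Expanded, DCT_1D matches the H.264 forward core transform (a sketch):
// with s0 = x0+x3, s1 = x1+x2, d0 = x0-x3, d1 = x1-x2,
//   y0 = s0 + s1        y1 = 2*d0 + d1
//   y2 = s0 - s1        y3 = d0 - 2*d1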

function x264_sub4x4_dct_neon
    mov             r3, #FENC_STRIDE
    mov             ip, #FDEC_STRIDE
    vld1.32         {d0[]}, [r1,:32], r3
    vld1.32         {d1[]}, [r2,:32], ip
    vld1.32         {d2[]}, [r1,:32], r3
    vsubl.u8        q8,  d0,  d1
    vld1.32         {d3[]}, [r2,:32], ip
    vld1.32         {d4[]}, [r1,:32], r3
    vsubl.u8        q9,  d2,  d3
    vld1.32         {d5[]}, [r2,:32], ip
    vld1.32         {d6[]}, [r1,:32], r3
    vsubl.u8        q10, d4,  d5
    vld1.32         {d7[]}, [r2,:32], ip
    vsubl.u8        q11, d6,  d7

    DCT_1D          d0, d1, d2, d3, d16, d18, d20, d22
    TRANSPOSE4x4_16 d0, d1, d2, d3
    DCT_1D          d4, d5, d6, d7, d0, d1, d2, d3
    vst1.64         {d4-d7}, [r0,:128]
    bx              lr
.endfunc

function x264_sub8x4_dct_neon
    vld1.64         {d0}, [r1,:64], r3
    vld1.64         {d1}, [r2,:64], ip
    vsubl.u8        q8,  d0,  d1
    vld1.64         {d2}, [r1,:64], r3
    vld1.64         {d3}, [r2,:64], ip
    vsubl.u8        q9,  d2,  d3
    vld1.64         {d4}, [r1,:64], r3
    vld1.64         {d5}, [r2,:64], ip
    vsubl.u8        q10, d4,  d5
    vld1.64         {d6}, [r1,:64], r3
    vld1.64         {d7}, [r2,:64], ip
    vsubl.u8        q11, d6,  d7

    DCT_1D          q0, q1, q2, q3,  q8, q9, q10, q11
    TRANSPOSE4x4_16 q0, q1, q2, q3

    SUMSUB_AB       q8,  q12, q0,  q3
    SUMSUB_AB       q9,  q10, q1,  q2
    vadd.i16        q13, q12, q12
    vadd.i16        q11, q10, q10
    vadd.i16        d0,  d16, d18
    vadd.i16        d1,  d26, d20
    vsub.i16        d2,  d16, d18
    vsub.i16        d3,  d24, d22
    vst1.64         {d0-d1}, [r0,:128]!
    vadd.i16        d4,  d17, d19
    vadd.i16        d5,  d27, d21
    vst1.64         {d2-d3}, [r0,:128]!
    vsub.i16        d6,  d17, d19
    vsub.i16        d7,  d25, d23
    vst1.64         {d4-d5}, [r0,:128]!
    vst1.64         {d6-d7}, [r0,:128]!
    bx              lr
.endfunc

function x264_sub8x8_dct_neon
    push            {lr}
    mov             r3, #FENC_STRIDE
    mov             ip, #FDEC_STRIDE
    bl              x264_sub8x4_dct_neon
    pop             {lr}
    b               x264_sub8x4_dct_neon
.endfunc

function x264_sub16x16_dct_neon
    push            {lr}
    mov             r3, #FENC_STRIDE
    mov             ip, #FDEC_STRIDE
    bl              x264_sub8x4_dct_neon
    bl              x264_sub8x4_dct_neon
    sub             r1, r1, #8*FENC_STRIDE-8
    sub             r2, r2, #8*FDEC_STRIDE-8
    bl              x264_sub8x4_dct_neon
    bl              x264_sub8x4_dct_neon
    sub             r1, r1, #8
    sub             r2, r2, #8
    bl              x264_sub8x4_dct_neon
    bl              x264_sub8x4_dct_neon
    sub             r1, r1, #8*FENC_STRIDE-8
    sub             r2, r2, #8*FDEC_STRIDE-8
    bl              x264_sub8x4_dct_neon
    pop             {lr}
    b               x264_sub8x4_dct_neon
.endfunc


.macro DCT8_1D type
    SUMSUB_AB       q2,  q1,  q11, q12  // s34/d34
    SUMSUB_AB       q3,  q11, q10, q13  // s25/d25
    SUMSUB_AB       q13, q10, q9,  q14  // s16/d16
    SUMSUB_AB       q14, q8,  q8,  q15  // s07/d07

    SUMSUB_AB       q9,  q2,  q14, q2   // a0/a2
    SUMSUB_AB       q12, q14, q13, q3   // a1/a3

    SUMSUB_AB       q3,  q13, q8,  q1   // a6/a5
    vshr.s16        q0,  q10, #1
    vshr.s16        q15, q11, #1
    vadd.s16        q0,  q0,  q10
    vadd.s16        q15, q15, q11
    vsub.s16        q3,  q3,  q0
    vsub.s16        q13, q13, q15

    SUMSUB_AB       q0,  q15, q10, q11  // a4/a7
    vshr.s16        q10, q8,  #1
    vshr.s16        q11, q1,  #1
    vadd.s16        q10, q10, q8
    vadd.s16        q11, q11, q1
    vadd.s16        q10, q0,  q10
    vadd.s16        q15, q15, q11

    SUMSUB_AB       q8,  q12, q9,  q12
    SUMSUB_SHR   2, q9,  q15, q10, q15,  q0, q1
    SUMSUB_SHR   1, q10, q14, q2,  q14,  q0, q1
    SUMSUB_SHR2  2, q11, q13, q3,  q13,  q0, q1
.endm

function x264_sub8x8_dct8_neon
    mov             r3, #FENC_STRIDE
    mov             ip, #FDEC_STRIDE
    vld1.64         {d16}, [r1,:64], r3
    vld1.64         {d17}, [r2,:64], ip
    vsubl.u8        q8,  d16, d17
    vld1.64         {d18}, [r1,:64], r3
    vld1.64         {d19}, [r2,:64], ip
    vsubl.u8        q9,  d18, d19
    vld1.64         {d20}, [r1,:64], r3
    vld1.64         {d21}, [r2,:64], ip
    vsubl.u8        q10, d20, d21
    vld1.64         {d22}, [r1,:64], r3
    vld1.64         {d23}, [r2,:64], ip
    vsubl.u8        q11, d22, d23
    vld1.64         {d24}, [r1,:64], r3
    vld1.64         {d25}, [r2,:64], ip
    vsubl.u8        q12, d24, d25
    vld1.64         {d26}, [r1,:64], r3
    vld1.64         {d27}, [r2,:64], ip
    vsubl.u8        q13, d26, d27
    vld1.64         {d28}, [r1,:64], r3
    vld1.64         {d29}, [r2,:64], ip
    vsubl.u8        q14, d28, d29
    vld1.64         {d30}, [r1,:64], r3
    vld1.64         {d31}, [r2,:64], ip
    vsubl.u8        q15, d30, d31

    DCT8_1D row
    vswp            d17, d24    // 8, 12
    vswp            d21, d28    // 10,14
    vtrn.32         q8,  q10
    vtrn.32         q12, q14

    vswp            d19, d26    // 9, 13
    vswp            d23, d30    // 11,15
    vtrn.32         q9,  q11
    vtrn.32         q13, q15

    vtrn.16         q10, q11
    vtrn.16         q12, q13
    vtrn.16         q8,  q9
    vtrn.16         q14, q15
    DCT8_1D col

    vst1.64         {d16-d19}, [r0,:128]!
    vst1.64         {d20-d23}, [r0,:128]!
    vst1.64         {d24-d27}, [r0,:128]!
    vst1.64         {d28-d31}, [r0,:128]!
    bx              lr
.endfunc

function x264_sub16x16_dct8_neon
    push            {lr}
    bl              x264_sub8x8_dct8_neon
    sub             r1,  r1,  #FENC_STRIDE*8 - 8
    sub             r2,  r2,  #FDEC_STRIDE*8 - 8
    bl              x264_sub8x8_dct8_neon
    sub             r1,  r1,  #8
    sub             r2,  r2,  #8
    bl              x264_sub8x8_dct8_neon
    pop             {lr}
    sub             r1,  r1,  #FENC_STRIDE*8 - 8
    sub             r2,  r2,  #FDEC_STRIDE*8 - 8
    b               x264_sub8x8_dct8_neon
.endfunc


// First part of IDCT (minus the final SUMSUB_AB)
.macro IDCT_1D d4 d5 d6 d7 d0 d1 d2 d3
    SUMSUB_AB       \d4, \d5, \d0, \d2
    vshr.s16        \d7, \d1, #1
    vshr.s16        \d6, \d3, #1
    vsub.s16        \d7, \d7, \d3
    vadd.s16        \d6, \d6, \d1
.endm
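
// Expanded, this is the standard H.264 4x4 inverse-transform butterfly
// (a sketch, not taken from the C source): for one row/column x0..x3,
//   e0 = x0 + x2           e1 = x0 - x2
//   e2 = (x1>>1) - x3      e3 = x1 + (x3>>1)
// and the SUMSUB_AB following each IDCT_1D invocation produces
//   y0 = e0+e3, y1 = e1+e2, y2 = e1-e2, y3 = e0-e3
// (the register ordering differs slightly and is fixed up by the
// surrounding transposes).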

function x264_add4x4_idct_neon
    mov             r2, #FDEC_STRIDE
    vld1.64         {d0-d3}, [r1,:128]

    IDCT_1D         d4, d5, d6, d7, d0, d1, d2, d3
    vld1.32         {d30[0]}, [r0,:32], r2
    SUMSUB_AB       q0, q1, q2, q3

    TRANSPOSE4x4_16 d0, d1, d3, d2

    IDCT_1D         d4, d5, d6, d7, d0, d1, d3, d2
    vld1.32         {d30[1]}, [r0,:32], r2
    SUMSUB_AB       q0, q1, q2, q3

    vrshr.s16       q0, q0, #6
    vld1.32         {d31[1]}, [r0,:32], r2
    vrshr.s16       q1, q1, #6
    vld1.32         {d31[0]}, [r0,:32], r2

    sub             r0, r0, r2, lsl #2
    vaddw.u8        q0, q0, d30
    vaddw.u8        q1, q1, d31
    vqmovun.s16     d0, q0
    vqmovun.s16     d2, q1

    vst1.32         {d0[0]}, [r0,:32], r2
    vst1.32         {d0[1]}, [r0,:32], r2
    vst1.32         {d2[1]}, [r0,:32], r2
    vst1.32         {d2[0]}, [r0,:32], r2
    bx              lr
.endfunc

function x264_add8x4_idct_neon
    vld1.64         {d0-d3}, [r1,:128]!
    IDCT_1D         d16, d18, d20, d22, d0, d1, d2, d3
    vld1.64         {d4-d7}, [r1,:128]!
    IDCT_1D         d17, d19, d21, d23, d4, d5, d6, d7
    SUMSUB_AB       q0,  q3,  q8,  q10
    SUMSUB_AB       q1,  q2,  q9,  q11

    TRANSPOSE4x4_16 q0,  q1,  q2,  q3

    IDCT_1D         q8,  q9,  q10, q11, q0, q1, q2, q3
    SUMSUB_AB       q0,  q3,  q8,  q10
    SUMSUB_AB       q1,  q2,  q9,  q11

    vrshr.s16       q0,  q0,  #6
    vld1.32         {d28}, [r0,:64], r2
    vrshr.s16       q1,  q1,  #6
    vld1.32         {d29}, [r0,:64], r2
    vrshr.s16       q2,  q2,  #6
    vld1.32         {d30}, [r0,:64], r2
    vrshr.s16       q3,  q3,  #6
    vld1.32         {d31}, [r0,:64], r2

    sub             r0,  r0,  r2,  lsl #2
    vaddw.u8        q0,  q0,  d28
    vaddw.u8        q1,  q1,  d29
    vaddw.u8        q2,  q2,  d30
    vaddw.u8        q3,  q3,  d31

    vqmovun.s16     d0,  q0
    vqmovun.s16     d1,  q1
    vst1.32         {d0}, [r0,:64], r2
    vqmovun.s16     d2,  q2
    vst1.32         {d1}, [r0,:64], r2
    vqmovun.s16     d3,  q3
    vst1.32         {d2}, [r0,:64], r2
    vst1.32         {d3}, [r0,:64], r2
    bx              lr
.endfunc

function x264_add8x8_idct_neon
    mov             r2, #FDEC_STRIDE
    mov             ip, lr
    bl              x264_add8x4_idct_neon
    mov             lr, ip
    b               x264_add8x4_idct_neon
.endfunc

function x264_add16x16_idct_neon
    mov             r2, #FDEC_STRIDE
    mov             ip, lr
    bl              x264_add8x4_idct_neon
    bl              x264_add8x4_idct_neon
    sub             r0, r0, #8*FDEC_STRIDE-8
    bl              x264_add8x4_idct_neon
    bl              x264_add8x4_idct_neon
    sub             r0, r0, #8
    bl              x264_add8x4_idct_neon
    bl              x264_add8x4_idct_neon
    sub             r0, r0, #8*FDEC_STRIDE-8
    bl              x264_add8x4_idct_neon
    mov             lr, ip
    b               x264_add8x4_idct_neon
.endfunc


.macro IDCT8_1D type
.ifc \type, col
    vswp            d21, d28
.endif
    SUMSUB_AB       q0,  q1,  q8,  q12              // a0/a2
.ifc \type, row
    vld1.64         {d28-d31}, [r1,:128]!
.else
    vswp            d19, d26
.endif
    SUMSUB_SHR   1, q2,  q3,  q10, q14,  q8, q12    // a6/a4
.ifc \type, col
    vswp            d23, d30
.endif
    SUMSUB_AB       q8,  q10, q13, q11
    SUMSUB_15       q8,  q10, q9,  q15,  q12, q14   // a7/a1
    SUMSUB_AB       q14, q15, q15, q9
    SUMSUB_15       q15, q14, q13, q11,  q12, q9    // a5/a3

    SUMSUB_SHR   2, q13, q14, q14, q15,  q11, q9    // b3/b5
    SUMSUB_SHR2  2, q12, q15, q8,  q10,  q11, q9    // b1/b7

    SUMSUB_AB       q10, q2,  q0,  q2               // b0/b6
    SUMSUB_AB       q11, q3,  q1,  q3               // b2/b4

    SUMSUB_AB       q8,  q15, q10, q15
    SUMSUB_AB       q9,  q14, q11, q14
    SUMSUB_AB       q10, q13, q3,  q13
.ifc \type, row
    vtrn.16         q8,  q9
.endif
    SUMSUB_AB       q11, q12, q2,  q12
.endm

function x264_add8x8_idct8_neon
    mov             r2,  #FDEC_STRIDE
    vld1.64         {d16-d19}, [r1,:128]!
    vld1.64         {d20-d23}, [r1,:128]!
    vld1.64         {d24-d27}, [r1,:128]!

    IDCT8_1D row
    vtrn.16         q10, q11
    vtrn.16         q12, q13
    vtrn.16         q14, q15
    vtrn.32         q8,  q10
    vtrn.32         q9,  q11
    vtrn.32         q12, q14
    vtrn.32         q13, q15
    vswp            d17, d24
    IDCT8_1D col

    vld1.64         {d0}, [r0,:64], r2
    vrshr.s16       q8,  q8,  #6
    vld1.64         {d1}, [r0,:64], r2
    vrshr.s16       q9,  q9,  #6
    vld1.64         {d2}, [r0,:64], r2
    vrshr.s16       q10, q10, #6
    vld1.64         {d3}, [r0,:64], r2
    vrshr.s16       q11, q11, #6
    vld1.64         {d4}, [r0,:64], r2
    vrshr.s16       q12, q12, #6
    vld1.64         {d5}, [r0,:64], r2
    vrshr.s16       q13, q13, #6
    vld1.64         {d6}, [r0,:64], r2
    vrshr.s16       q14, q14, #6
    vld1.64         {d7}, [r0,:64], r2
    vrshr.s16       q15, q15, #6
    sub             r0,  r0,  r2,  lsl #3

    vaddw.u8        q8,  q8,  d0
    vaddw.u8        q9,  q9,  d1
    vaddw.u8        q10, q10, d2
    vqmovun.s16     d0,  q8
    vqmovun.s16     d1,  q9
    vqmovun.s16     d2,  q10
    vaddw.u8        q11, q11, d3
    vst1.64         {d0}, [r0,:64], r2
    vaddw.u8        q12, q12, d4
    vst1.64         {d1}, [r0,:64], r2
    vaddw.u8        q13, q13, d5
    vst1.64         {d2}, [r0,:64], r2
    vqmovun.s16     d3,  q11
    vqmovun.s16     d4,  q12
    vaddw.u8        q14, q14, d6
    vaddw.u8        q15, q15, d7
    vst1.64         {d3}, [r0,:64], r2
    vqmovun.s16     d5,  q13
    vst1.64         {d4}, [r0,:64], r2
    vqmovun.s16     d6,  q14
    vqmovun.s16     d7,  q15
    vst1.64         {d5}, [r0,:64], r2
    vst1.64         {d6}, [r0,:64], r2
    vst1.64         {d7}, [r0,:64], r2
    bx              lr
.endfunc

function x264_add16x16_idct8_neon
    mov             ip,  lr
    bl              x264_add8x8_idct8_neon
    sub             r0,  r0,  #8*FDEC_STRIDE-8
    bl              x264_add8x8_idct8_neon
    sub             r0,  r0,  #8
    bl              x264_add8x8_idct8_neon
    sub             r0,  r0,  #8*FDEC_STRIDE-8
    mov             lr,  ip
    b               x264_add8x8_idct8_neon
.endfunc


function x264_add8x8_idct_dc_neon
    mov             r2,  #FDEC_STRIDE
    vld1.64         {d16}, [r1,:64]
    vrshr.s16       d16, d16, #6
    vld1.64         {d0}, [r0,:64], r2
    vmov.i16        q15, #0
    vld1.64         {d1}, [r0,:64], r2
    vld1.64         {d2}, [r0,:64], r2
    vdup.16         d20, d16[0]
    vld1.64         {d3}, [r0,:64], r2
    vdup.16         d21, d16[1]
    vld1.64         {d4}, [r0,:64], r2
    vdup.16         d22, d16[2]
    vld1.64         {d5}, [r0,:64], r2
    vdup.16         d23, d16[3]
    vld1.64         {d6}, [r0,:64], r2
    vsub.s16        q12, q15, q10
    vld1.64         {d7}, [r0,:64], r2
    vsub.s16        q13, q15, q11

    sub             r0,  r0,  #8*FDEC_STRIDE

    vqmovun.s16     d20, q10
    vqmovun.s16     d22, q11
    vqmovun.s16     d24, q12
    vqmovun.s16     d26, q13

    vmov            d21, d20
    vqadd.u8        q0,  q0,  q10
    vmov            d23, d22
    vqadd.u8        q1,  q1,  q10
    vmov            d25, d24
    vqadd.u8        q2,  q2,  q11
    vmov            d27, d26
    vqadd.u8        q3,  q3,  q11
    vqsub.u8        q0,  q0,  q12
    vqsub.u8        q1,  q1,  q12
    vqsub.u8        q2,  q2,  q13

    vst1.64         {d0}, [r0,:64], r2
    vqsub.u8        q3,  q3,  q13
    vst1.64         {d1}, [r0,:64], r2
    vst1.64         {d2}, [r0,:64], r2
    vst1.64         {d3}, [r0,:64], r2
    vst1.64         {d4}, [r0,:64], r2
    vst1.64         {d5}, [r0,:64], r2
    vst1.64         {d6}, [r0,:64], r2
    vst1.64         {d7}, [r0,:64], r2
    bx              lr
.endfunc

.macro ADD16x4_IDCT_DC dc
    vld1.64         {d16-d17}, [r0,:128], r3
    vld1.64         {d18-d19}, [r0,:128], r3
    vdup.16         d4,  \dc[0]
    vdup.16         d5,  \dc[1]
    vld1.64         {d20-d21}, [r0,:128], r3
    vdup.16         d6,  \dc[2]
    vdup.16         d7,  \dc[3]
    vld1.64         {d22-d23}, [r0,:128], r3
    vsub.s16        q12, q15, q2
    vsub.s16        q13, q15, q3

    vqmovun.s16     d4,  q2
    vqmovun.s16     d5,  q3
    vqmovun.s16     d6,  q12
    vqmovun.s16     d7,  q13

    vqadd.u8        q8,  q8,  q2
    vqadd.u8        q9,  q9,  q2
    vqadd.u8        q10, q10, q2
    vqadd.u8        q11, q11, q2

    vqsub.u8        q8,  q8,  q3
    vqsub.u8        q9,  q9,  q3
    vqsub.u8        q10, q10, q3
    vst1.64         {d16-d17}, [r2,:128], r3
    vqsub.u8        q11, q11, q3
    vst1.64         {d18-d19}, [r2,:128], r3
    vst1.64         {d20-d21}, [r2,:128], r3
    vst1.64         {d22-d23}, [r2,:128], r3
.endm

function x264_add16x16_idct_dc_neon
    mov             r2,  r0
    mov             r3,  #FDEC_STRIDE
    vmov.i16        q15, #0

    vld1.64         {d0-d3}, [r1,:64]
    vrshr.s16       q0, #6
    vrshr.s16       q1, #6

    ADD16x4_IDCT_DC d0
    ADD16x4_IDCT_DC d1
    ADD16x4_IDCT_DC d2
    ADD16x4_IDCT_DC d3
    bx              lr
.endfunc

function x264_sub8x8_dct_dc_neon
    mov             r3,  #FENC_STRIDE
    mov             ip,  #FDEC_STRIDE
    vld1.64         {d16}, [r1,:64], r3
    vld1.64         {d17}, [r2,:64], ip
    vsubl.u8        q8,  d16, d17
    vld1.64         {d18}, [r1,:64], r3
    vld1.64         {d19}, [r2,:64], ip
    vsubl.u8        q9,  d18, d19
    vld1.64         {d20}, [r1,:64], r3
    vld1.64         {d21}, [r2,:64], ip
    vsubl.u8        q10, d20, d21
    vld1.64         {d22}, [r1,:64], r3
    vadd.s16        q0,  q8,  q9
    vld1.64         {d23}, [r2,:64], ip
    vsubl.u8        q11, d22, d23
    vld1.64         {d24}, [r1,:64], r3
    vadd.s16        q0,  q0,  q10
    vld1.64         {d25}, [r2,:64], ip
    vsubl.u8        q12, d24, d25
    vld1.64         {d26}, [r1,:64], r3
    vadd.s16        q0,  q0,  q11
    vld1.64         {d27}, [r2,:64], ip
    vsubl.u8        q13, d26, d27
    vld1.64         {d28}, [r1,:64], r3
    vld1.64         {d29}, [r2,:64], ip
    vsubl.u8        q14, d28, d29
    vld1.64         {d30}, [r1,:64], r3
    vadd.s16        q1,  q12, q13
    vld1.64         {d31}, [r2,:64], ip
    vsubl.u8        q15, d30, d31
    vadd.s16        q1,  q1,  q14

    vadd.s16        d4,  d0,  d1
    vadd.s16        q1,  q1,  q15
    vsub.s16        d5,  d0,  d1
    vadd.s16        d6,  d2,  d3
    vsub.s16        d7,  d2,  d3
    vadd.s16        q0,  q2,  q3
    vsub.s16        q1,  q2,  q3

    vpadd.s16       d0,  d0,  d2
    vpadd.s16       d1,  d1,  d3
    vpadd.s16       d0,  d0,  d1
    vst1.64         {d0}, [r0,:64]
    bx              lr
.endfunc


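// permute 16 int16 coefficients into frame zigzag order: scan4x4_frame
// holds byte indices (biased per source-register window) that vtbl.8
// uses to gather the reordered bytes.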
function x264_zigzag_scan_4x4_frame_neon
    movrel      r2, scan4x4_frame
    vld1.64     {d0-d3},   [r1,:128]
    vld1.64     {d16-d19}, [r2,:128]
    vtbl.8      d4, {d0-d1}, d16
    vtbl.8      d5, {d1-d3}, d17
    vtbl.8      d6, {d0-d2}, d18
    vtbl.8      d7, {d2-d3}, d19
    vst1.64     {d4-d7},   [r0,:128]
    bx          lr
.endfunc

x264-snapshot-20120103-2245-stable/common/arm/cpu-a.S
/*****************************************************************************
 * cpu-a.S: arm cpu detection
 *****************************************************************************
 * Copyright (C) 2009-2011 x264 project
 *
 * Authors: David Conrad <lessen42@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "asm.S"

.fpu neon
.align

// done in gas because .fpu neon overrides the refusal to assemble
// instructions the selected -march/-mcpu doesn't support
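// (callers must be prepared to catch SIGILL, since this instruction
// faults on CPUs without NEON)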
function x264_cpu_neon_test
    vadd.i16    q0, q0, q0
    bx          lr
.endfunc

// return: 0 on success
//         1 if counters were already enabled
//         9 if lo-res counters were already enabled
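// PMNC bit 0 = master counter enable, bit 3 = 1/64 cycle count
// resolution; CNTENS bit 31 enables the cycle counter itself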
function x264_cpu_enable_armv7_counter
    mrc         p15, 0, r2, c9, c12, 0      // read PMNC
    ands        r0, r2, #1
    andne       r0, r2, #9

    orr         r2, r2, #1                  // enable counters
    bic         r2, r2, #8                  // full resolution
    mcreq       p15, 0, r2, c9, c12, 0      // write PMNC
    mov         r2, #1 << 31                // enable cycle counter
    mcr         p15, 0, r2, c9, c12, 1      // write CNTENS
    bx          lr
.endfunc

function x264_cpu_disable_armv7_counter
    mrc         p15, 0, r0, c9, c12, 0      // read PMNC
    bic         r0, r0, #1                  // disable counters
    mcr         p15, 0, r0, c9, c12, 0      // write PMNC
    bx          lr
.endfunc


.macro READ_TIME r
    mrc         p15, 0, \r, c9, c13, 0
.endm

// return: 0 if neon -> arm transfers take more than 10 cycles
//         nonzero otherwise
function x264_cpu_fast_neon_mrc_test
    // check for user access to performance counters
    mrc         p15, 0, r0, c9, c14, 0
    cmp         r0, #0
    bxeq        lr

    push        {r4-r6,lr}
    bl          x264_cpu_enable_armv7_counter
    ands        r1, r0, #8
    mov         r3, #0
    mov         ip, #4
    mov         r6, #4
    moveq       r5, #1
    movne       r5, #64

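    // measure 8 mrc transfers per inner iteration; run 64x more
    // iterations when the counter only ticks every 64 cycles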
average_loop:
    mov         r4, r5
    READ_TIME   r1
1:  subs        r4, r4, #1
.rept 8
    vmov.u32    lr, d0[0]
    add         lr, lr, lr
.endr
    bgt         1b
    READ_TIME   r2

    subs        r6, r6, #1
    sub         r2, r2, r1
    cmpgt       r2, #30 << 3    // assume context switch if it took over 30 cycles
    addle       r3, r3, r2
    subles      ip, ip, #1
    bgt         average_loop

    // disable counters if we enabled them
    ands        r0, r0, #1
    bleq        x264_cpu_disable_armv7_counter

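    // each accepted run counted ~8x the cost of one transfer (the 1/64
    // counter mode and the 64x iteration count cancel out), so after 4
    // runs r3 is ~32x that cost and >>5 yields cycles per transfer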
    lsr         r0, r3, #5
    cmp         r0, #10
    movgt       r0, #0
    pop         {r4-r6,pc}
.endfunc

x264-snapshot-20120103-2245-stable/common/arm/asm.S
/*****************************************************************************
 * asm.S: arm utility macros
 *****************************************************************************
 * Copyright (C) 2008-2011 x264 project
 *
 * Authors: Mans Rullgard <mans@mansr.com>
 *          David Conrad <lessen42@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "config.h"

#ifdef PREFIX
#   define EXTERN_ASM _
#else
#   define EXTERN_ASM
#endif

#ifdef __ELF__
#   define ELF
#else
#   define ELF @
#endif

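// eabi attributes 24/25 declare whether the code needs / preserves
// 8-byte stack alignment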
        .macro require8, val=1
ELF     .eabi_attribute 24, \val
        .endm

        .macro preserve8, val=1
ELF     .eabi_attribute 25, \val
        .endm

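// declare an exported function: global (with optional PREFIX
// underscore), hidden visibility and %function type on ELF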
        .macro function name
        .global EXTERN_ASM\name
EXTERN_ASM\name:
ELF     .hidden \name
ELF     .type   \name, %function
        .func   \name
\name:
        .endm

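// load the address of a symbol: movw/movt pair on ARMv6T2+ non-PIC
// builds, literal-pool load otherwise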
        .macro movrel rd, val
#if HAVE_ARMV6T2 && !defined(PIC)
        movw            \rd, #:lower16:\val
        movt            \rd, #:upper16:\val
#else
        ldr             \rd, =\val
#endif
        .endm

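// load an arbitrary constant; constants need no PIC handling, and the
// movt is skipped when \val fits in 16 bits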
.macro movconst rd, val
#if HAVE_ARMV6T2
    movw        \rd, #:lower16:\val
.if \val >> 16
    movt        \rd, #:upper16:\val
.endif
#else
    ldr         \rd, =\val
#endif
.endm

#define FENC_STRIDE 16
#define FDEC_STRIDE 32

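// horizontal add: widening pairwise adds reduce the u16 lanes of \a
// (summed with \b first, if supplied) into \dest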
.macro HORIZ_ADD dest, a, b
.ifnb \b
    vadd.u16    \a, \a, \b
.endif
    vpaddl.u16  \a, \a
    vpaddl.u32  \dest, \a
.endm

.macro SUMSUB_AB sum, diff, a, b
    vadd.s16    \sum,  \a, \b
    vsub.s16    \diff, \a, \b
.endm

.macro SUMSUB_ABCD s1, d1, s2, d2, a, b, c, d
    SUMSUB_AB   \s1, \d1, \a, \b
    SUMSUB_AB   \s2, \d2, \c, \d
.endm

.macro ABS2 a b
    vabs.s16 \a, \a
    vabs.s16 \b, \b
.endm

// dist = distance in elements (0 for vertical pass, 1/2 for horizontal passes)
// op = sumsub/amax (sum and diff / maximum of absolutes)
// d1/2 = destination registers
// s1/2 = source registers
.macro HADAMARD dist, op, d1, d2, s1, s2
.if \dist == 1
    vtrn.16     \s1, \s2
.else
    vtrn.32     \s1, \s2
.endif
.ifc \op, sumsub
    SUMSUB_AB   \d1, \d2, \s1, \s2
.else
    vabs.s16    \s1, \s1
    vabs.s16    \s2, \s2
    vmax.s16    \d1, \s1, \s2
.endif
.endm

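// transpose an 8x8 byte matrix held one row per d register, via three
// rounds of vtrn at 32-, 16- and 8-bit granularity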
.macro TRANSPOSE8x8 r0 r1 r2 r3 r4 r5 r6 r7
    vtrn.32         \r0, \r4
    vtrn.32         \r1, \r5
    vtrn.32         \r2, \r6
    vtrn.32         \r3, \r7
    vtrn.16         \r0, \r2
    vtrn.16         \r1, \r3
    vtrn.16         \r4, \r6
    vtrn.16         \r5, \r7
    vtrn.8          \r0, \r1
    vtrn.8          \r2, \r3
    vtrn.8          \r4, \r5
    vtrn.8          \r6, \r7
.endm

.macro TRANSPOSE4x4 r0 r1 r2 r3
    vtrn.16         \r0, \r2
    vtrn.16         \r1, \r3
    vtrn.8          \r0, \r1
    vtrn.8          \r2, \r3
.endm

.macro TRANSPOSE4x4_16  d0 d1 d2 d3
    vtrn.32     \d0, \d2
    vtrn.32     \d1, \d3
    vtrn.16     \d0, \d1
    vtrn.16     \d2, \d3
.endm

x264-snapshot-20120103-2245-stable/Makefile
# Makefile

include config.mak

all: default

SRCS = common/mc.c common/predict.c common/pixel.c common/macroblock.c \
       common/frame.c common/dct.c common/cpu.c common/cabac.c \
       common/common.c common/osdep.c common/rectangle.c \
       common/set.c common/quant.c common/deblock.c common/vlc.c \
       common/mvpred.c common/bitstream.c \
       encoder/analyse.c encoder/me.c encoder/ratecontrol.c \
       encoder/set.c encoder/macroblock.c encoder/cabac.c \
       encoder/cavlc.c encoder/encoder.c encoder/lookahead.c

SRCCLI = x264.c input/input.c input/timecode.c input/raw.c input/y4m.c \
         output/raw.c output/matroska.c output/matroska_ebml.c \
         output/flv.c output/flv_bytestream.c filters/filters.c \
         filters/video/video.c filters/video/source.c filters/video/internal.c \
         filters/video/resize.c filters/video/cache.c filters/video/fix_vfr_pts.c \
         filters/video/select_every.c filters/video/crop.c filters/video/depth.c

SRCSO =

OBJCHK = tools/checkasm.o

CONFIG := $(shell cat config.h)
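# config.h is read as one big string so the findstring checks below can
# test for "#define HAVE_X 1" lines as plain substrings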

# GPL-only files
ifneq ($(findstring HAVE_GPL 1, $(CONFIG)),)
SRCCLI +=
endif

# Optional module sources
ifneq ($(findstring HAVE_AVS 1, $(CONFIG)),)
SRCCLI += input/avs.c
endif

ifneq ($(findstring HAVE_THREAD 1, $(CONFIG)),)
SRCCLI += input/thread.c
SRCS   += common/threadpool.c
endif

ifneq ($(findstring HAVE_WIN32THREAD 1, $(CONFIG)),)
SRCS += common/win32thread.c
endif

ifneq ($(findstring HAVE_LAVF 1, $(CONFIG)),)
SRCCLI += input/lavf.c
endif

ifneq ($(findstring HAVE_FFMS 1, $(CONFIG)),)
SRCCLI += input/ffms.c
endif

ifneq ($(findstring HAVE_GPAC 1, $(CONFIG)),)
SRCCLI += output/mp4.c
endif

# Visualization sources
ifneq ($(findstring HAVE_VISUALIZE 1, $(CONFIG)),)
SRCS   += common/visualize.c common/display-x11.c
endif

# MMX/SSE optims
ifneq ($(AS),)
X86SRC0 = const-a.asm cabac-a.asm dct-a.asm deblock-a.asm mc-a.asm \
          mc-a2.asm pixel-a.asm predict-a.asm quant-a.asm \
          cpu-a.asm dct-32.asm bitstream-a.asm
ifneq ($(findstring HIGH_BIT_DEPTH, $(CONFIG)),)
X86SRC0 += sad16-a.asm
else
X86SRC0 += sad-a.asm
endif
X86SRC = $(X86SRC0:%=common/x86/%)

ifeq ($(ARCH),X86)
ARCH_X86 = yes
ASMSRC   = $(X86SRC) common/x86/pixel-32.asm
endif

ifeq ($(ARCH),X86_64)
ARCH_X86 = yes
ASMSRC   = $(X86SRC:-32.asm=-64.asm)
ASFLAGS += -DARCH_X86_64
endif

ifdef ARCH_X86
ASFLAGS += -Icommon/x86/
SRCS   += common/x86/mc-c.c common/x86/predict-c.c
OBJASM  = $(ASMSRC:%.asm=%.o)
$(OBJASM): common/x86/x86inc.asm common/x86/x86util.asm
OBJCHK += tools/checkasm-a.o
endif
endif

# AltiVec optims
ifeq ($(ARCH),PPC)
ifneq ($(AS),)
SRCS += common/ppc/mc.c common/ppc/pixel.c common/ppc/dct.c \
        common/ppc/quant.c common/ppc/deblock.c \
        common/ppc/predict.c
endif
endif

# NEON optims
ifeq ($(ARCH),ARM)
ifneq ($(AS),)
ASMSRC += common/arm/cpu-a.S common/arm/pixel-a.S common/arm/mc-a.S \
          common/arm/dct-a.S common/arm/quant-a.S common/arm/deblock-a.S \
          common/arm/predict-a.S
SRCS   += common/arm/mc-c.c common/arm/predict-c.c
OBJASM  = $(ASMSRC:%.S=%.o)
endif
endif

# VIS optims
ifeq ($(ARCH),UltraSPARC)
ifeq ($(findstring HIGH_BIT_DEPTH, $(CONFIG)),)
ASMSRC += common/sparc/pixel.asm
OBJASM  = $(ASMSRC:%.asm=%.o)
endif
endif

ifneq ($(HAVE_GETOPT_LONG),1)
SRCCLI += extras/getopt.c
endif

ifneq ($(SONAME),)
ifeq ($(SYS),WINDOWS)
SRCSO += x264dll.c
endif
endif

OBJS = $(SRCS:%.c=%.o)
OBJCLI = $(SRCCLI:%.c=%.o)
OBJSO = $(SRCSO:%.c=%.o)
DEP  = depend

.PHONY: all default fprofiled clean distclean install uninstall lib-static lib-shared cli install-lib-dev install-lib-static install-lib-shared install-cli

default: $(DEP)

cli: x264$(EXE)
lib-static: $(LIBX264)
lib-shared: $(SONAME)

$(LIBX264): .depend $(OBJS) $(OBJASM)
	rm -f $(LIBX264)
	$(AR)$@ $(OBJS) $(OBJASM)
	$(if $(RANLIB), $(RANLIB) $@)

$(SONAME): .depend $(OBJS) $(OBJASM) $(OBJSO)
	$(LD)$@ $(OBJS) $(OBJASM) $(OBJSO) $(SOFLAGS) $(LDFLAGS)

ifneq ($(EXE),)
.PHONY: x264 checkasm
x264: x264$(EXE)
checkasm: checkasm$(EXE)
endif

x264$(EXE): .depend $(OBJCLI) $(CLI_LIBX264)
	$(LD)$@ $(OBJCLI) $(CLI_LIBX264) $(LDFLAGSCLI) $(LDFLAGS)

checkasm$(EXE): .depend $(OBJCHK) $(LIBX264)
	$(LD)$@ $(OBJCHK) $(LIBX264) $(LDFLAGS)

$(OBJS) $(OBJASM) $(OBJSO) $(OBJCLI) $(OBJCHK): .depend

%.o: %.asm
	$(AS) $(ASFLAGS) -o $@ $<
	-@ $(if $(STRIP), $(STRIP) -x $@) # delete local/anonymous symbols, so they don't show up in oprofile

%.o: %.S
	$(AS) $(ASFLAGS) -o $@ $<
	-@ $(if $(STRIP), $(STRIP) -x $@) # delete local/anonymous symbols, so they don't show up in oprofile

.depend: config.mak
	@rm -f .depend
	@$(foreach SRC, $(SRCS) $(SRCCLI) $(SRCSO), $(CC) $(CFLAGS) $(SRC) $(DEPMT) $(SRC:%.c=%.o) $(DEPMM) 1>> .depend;)

config.mak:
	./configure

depend: .depend
ifneq ($(wildcard .depend),)
include .depend
endif

SRC2 = $(SRCS) $(SRCCLI)
# These should cover most of the important codepaths
OPT0 = --crf 30 -b1 -m1 -r1 --me dia --no-cabac --direct temporal --ssim --no-weightb
OPT1 = --crf 16 -b2 -m3 -r3 --me hex --no-8x8dct --direct spatial --no-dct-decimate -t0  --slice-max-mbs 50
OPT2 = --crf 26 -b4 -m5 -r2 --me hex --cqm jvt --nr 100 --psnr --no-mixed-refs --b-adapt 2 --slice-max-size 1500
OPT3 = --crf 18 -b3 -m9 -r5 --me umh -t1 -A all --b-pyramid normal --direct auto --no-fast-pskip --no-mbtree
OPT4 = --crf 22 -b3 -m7 -r4 --me esa -t2 -A all --psy-rd 1.0:1.0 --slices 4
OPT5 = --frames 50 --crf 24 -b3 -m10 -r3 --me tesa -t2
OPT6 = --frames 50 -q0 -m9 -r2 --me hex -Aall
OPT7 = --frames 50 -q0 -m2 -r1 --me hex --no-cabac

ifeq (,$(VIDS))
fprofiled:
	@echo 'usage: make fprofiled VIDS="infile1 infile2 ..."'
	@echo 'where infiles are anything that x264 understands,'
	@echo 'i.e. YUV with resolution in the filename, y4m, or avisynth.'
else
fprofiled:
	$(MAKE) clean
	$(MAKE) x264$(EXE) CFLAGS="$(CFLAGS) $(PROF_GEN_CC)" LDFLAGS="$(LDFLAGS) $(PROF_GEN_LD)"
	$(foreach V, $(VIDS), $(foreach I, 0 1 2 3 4 5 6 7, ./x264$(EXE) $(OPT$I) --threads 1 $(V) -o $(DEVNULL) ;))
	rm -f $(SRC2:%.c=%.o)
	$(MAKE) CFLAGS="$(CFLAGS) $(PROF_USE_CC)" LDFLAGS="$(LDFLAGS) $(PROF_USE_LD)"
	rm -f $(SRC2:%.c=%.gcda) $(SRC2:%.c=%.gcno) *.dyn pgopti.dpi pgopti.dpi.lock
endif

clean:
	rm -f $(OBJS) $(OBJASM) $(OBJCLI) $(OBJSO) $(SONAME) *.a *.lib *.exp *.pdb x264 x264.exe .depend TAGS
	rm -f checkasm checkasm.exe $(OBJCHK)
	rm -f $(SRC2:%.c=%.gcda) $(SRC2:%.c=%.gcno) *.dyn pgopti.dpi pgopti.dpi.lock

distclean: clean
	rm -f config.mak x264_config.h config.h config.log x264.pc x264.def

install-cli: cli
	install -d $(DESTDIR)$(bindir)
	install x264$(EXE) $(DESTDIR)$(bindir)

install-lib-dev:
	install -d $(DESTDIR)$(includedir)
	install -d $(DESTDIR)$(libdir)
	install -d $(DESTDIR)$(libdir)/pkgconfig
	install -m 644 x264.h $(DESTDIR)$(includedir)
	install -m 644 x264_config.h $(DESTDIR)$(includedir)
	install -m 644 x264.pc $(DESTDIR)$(libdir)/pkgconfig

install-lib-static: lib-static install-lib-dev
	install -m 644 $(LIBX264) $(DESTDIR)$(libdir)
	$(if $(RANLIB), $(RANLIB) $(DESTDIR)$(libdir)/$(LIBX264))

install-lib-shared: lib-shared install-lib-dev
ifneq ($(IMPLIBNAME),)
	install -d $(DESTDIR)$(bindir)
	install -m 755 $(SONAME) $(DESTDIR)$(bindir)
	install -m 644 $(IMPLIBNAME) $(DESTDIR)$(libdir)
else ifneq ($(SONAME),)
	ln -f -s $(SONAME) $(DESTDIR)$(libdir)/libx264.$(SOSUFFIX)
	install -m 755 $(SONAME) $(DESTDIR)$(libdir)
endif

uninstall:
	rm -f $(DESTDIR)$(includedir)/x264.h $(DESTDIR)$(includedir)/x264_config.h $(DESTDIR)$(libdir)/libx264.a
	rm -f $(DESTDIR)$(bindir)/x264$(EXE) $(DESTDIR)$(libdir)/pkgconfig/x264.pc
ifneq ($(IMPLIBNAME),)
	rm -f $(DESTDIR)$(bindir)/$(SONAME) $(DESTDIR)$(libdir)/$(IMPLIBNAME)
else ifneq ($(SONAME),)
	rm -f $(DESTDIR)$(libdir)/$(SONAME) $(DESTDIR)$(libdir)/libx264.$(SOSUFFIX)
endif

etags: TAGS

TAGS:
	etags $(SRCS)

x264-snapshot-20120103-2245-stable/COPYING
                    GNU GENERAL PUBLIC LICENSE
		       Version 2, June 1991

 Copyright (C) 1989, 1991 Free Software Foundation, Inc.
     59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 Everyone is permitted to copy and distribute verbatim copies
 of this license document, but changing it is not allowed.

			    Preamble

  The licenses for most software are designed to take away your
freedom to share and change it.  By contrast, the GNU General Public
License is intended to guarantee your freedom to share and change free
software--to make sure the software is free for all its users.  This
General Public License applies to most of the Free Software
Foundation's software and to any other program whose authors commit to
using it.  (Some other Free Software Foundation software is covered by
the GNU Library General Public License instead.)  You can apply it to
your programs, too.

  When we speak of free software, we are referring to freedom, not
price.  Our General Public Licenses are designed to make sure that you
have the freedom to distribute copies of free software (and charge for
this service if you wish), that you receive source code or can get it
if you want it, that you can change the software or use pieces of it
in new free programs; and that you know you can do these things.

  To protect your rights, we need to make restrictions that forbid
anyone to deny you these rights or to ask you to surrender the rights.
These restrictions translate to certain responsibilities for you if you
distribute copies of the software, or if you modify it.

  For example, if you distribute copies of such a program, whether
gratis or for a fee, you must give the recipients all the rights that
you have.  You must make sure that they, too, receive or can get the
source code.  And you must show them these terms so they know their
rights.

  We protect your rights with two steps: (1) copyright the software, and
(2) offer you this license which gives you legal permission to copy,
distribute and/or modify the software.

  Also, for each author's protection and ours, we want to make certain
that everyone understands that there is no warranty for this free
software.  If the software is modified by someone else and passed on, we
want its recipients to know that what they have is not the original, so
that any problems introduced by others will not reflect on the original
authors' reputations.

  Finally, any free program is threatened constantly by software
patents.  We wish to avoid the danger that redistributors of a free
program will individually obtain patent licenses, in effect making the
program proprietary.  To prevent this, we have made it clear that any
patent must be licensed for everyone's free use or not licensed at all.

  The precise terms and conditions for copying, distribution and
modification follow.

		    GNU GENERAL PUBLIC LICENSE
   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION

  0. This License applies to any program or other work which contains
a notice placed by the copyright holder saying it may be distributed
under the terms of this General Public License.  The "Program", below,
refers to any such program or work, and a "work based on the Program"
means either the Program or any derivative work under copyright law:
that is to say, a work containing the Program or a portion of it,
either verbatim or with modifications and/or translated into another
language.  (Hereinafter, translation is included without limitation in
the term "modification".)  Each licensee is addressed as "you".

Activities other than copying, distribution and modification are not
covered by this License; they are outside its scope.  The act of
running the Program is not restricted, and the output from the Program
is covered only if its contents constitute a work based on the
Program (independent of having been made by running the Program).
Whether that is true depends on what the Program does.

  1. You may copy and distribute verbatim copies of the Program's
source code as you receive it, in any medium, provided that you
conspicuously and appropriately publish on each copy an appropriate
copyright notice and disclaimer of warranty; keep intact all the
notices that refer to this License and to the absence of any warranty;
and give any other recipients of the Program a copy of this License
along with the Program.

You may charge a fee for the physical act of transferring a copy, and
you may at your option offer warranty protection in exchange for a fee.

  2. You may modify your copy or copies of the Program or any portion
of it, thus forming a work based on the Program, and copy and
distribute such modifications or work under the terms of Section 1
above, provided that you also meet all of these conditions:

    a) You must cause the modified files to carry prominent notices
    stating that you changed the files and the date of any change.

    b) You must cause any work that you distribute or publish, that in
    whole or in part contains or is derived from the Program or any
    part thereof, to be licensed as a whole at no charge to all third
    parties under the terms of this License.

    c) If the modified program normally reads commands interactively
    when run, you must cause it, when started running for such
    interactive use in the most ordinary way, to print or display an
    announcement including an appropriate copyright notice and a
    notice that there is no warranty (or else, saying that you provide
    a warranty) and that users may redistribute the program under
    these conditions, and telling the user how to view a copy of this
    License.  (Exception: if the Program itself is interactive but
    does not normally print such an announcement, your work based on
    the Program is not required to print an announcement.)

These requirements apply to the modified work as a whole.  If
identifiable sections of that work are not derived from the Program,
and can be reasonably considered independent and separate works in
themselves, then this License, and its terms, do not apply to those
sections when you distribute them as separate works.  But when you
distribute the same sections as part of a whole which is a work based
on the Program, the distribution of the whole must be on the terms of
this License, whose permissions for other licensees extend to the
entire whole, and thus to each and every part regardless of who wrote it.

Thus, it is not the intent of this section to claim rights or contest
your rights to work written entirely by you; rather, the intent is to
exercise the right to control the distribution of derivative or
collective works based on the Program.

In addition, mere aggregation of another work not based on the Program
with the Program (or with a work based on the Program) on a volume of
a storage or distribution medium does not bring the other work under
the scope of this License.

  3. You may copy and distribute the Program (or a work based on it,
under Section 2) in object code or executable form under the terms of
Sections 1 and 2 above provided that you also do one of the following:

    a) Accompany it with the complete corresponding machine-readable
    source code, which must be distributed under the terms of Sections
    1 and 2 above on a medium customarily used for software interchange; or,

    b) Accompany it with a written offer, valid for at least three
    years, to give any third party, for a charge no more than your
    cost of physically performing source distribution, a complete
    machine-readable copy of the corresponding source code, to be
    distributed under the terms of Sections 1 and 2 above on a medium
    customarily used for software interchange; or,

    c) Accompany it with the information you received as to the offer
    to distribute corresponding source code.  (This alternative is
    allowed only for noncommercial distribution and only if you
    received the program in object code or executable form with such
    an offer, in accord with Subsection b above.)

The source code for a work means the preferred form of the work for
making modifications to it.  For an executable work, complete source
code means all the source code for all modules it contains, plus any
associated interface definition files, plus the scripts used to
control compilation and installation of the executable.  However, as a
special exception, the source code distributed need not include
anything that is normally distributed (in either source or binary
form) with the major components (compiler, kernel, and so on) of the
operating system on which the executable runs, unless that component
itself accompanies the executable.

If distribution of executable or object code is made by offering
access to copy from a designated place, then offering equivalent
access to copy the source code from the same place counts as
distribution of the source code, even though third parties are not
compelled to copy the source along with the object code.

  4. You may not copy, modify, sublicense, or distribute the Program
except as expressly provided under this License.  Any attempt
otherwise to copy, modify, sublicense or distribute the Program is
void, and will automatically terminate your rights under this License.
However, parties who have received copies, or rights, from you under
this License will not have their licenses terminated so long as such
parties remain in full compliance.

  5. You are not required to accept this License, since you have not
signed it.  However, nothing else grants you permission to modify or
distribute the Program or its derivative works.  These actions are
prohibited by law if you do not accept this License.  Therefore, by
modifying or distributing the Program (or any work based on the
Program), you indicate your acceptance of this License to do so, and
all its terms and conditions for copying, distributing or modifying
the Program or works based on it.

  6. Each time you redistribute the Program (or any work based on the
Program), the recipient automatically receives a license from the
original licensor to copy, distribute or modify the Program subject to
these terms and conditions.  You may not impose any further
restrictions on the recipients' exercise of the rights granted herein.
You are not responsible for enforcing compliance by third parties to
this License.

  7. If, as a consequence of a court judgment or allegation of patent
infringement or for any other reason (not limited to patent issues),
conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License.  If you cannot
distribute so as to satisfy simultaneously your obligations under this
License and any other pertinent obligations, then as a consequence you
may not distribute the Program at all.  For example, if a patent
license would not permit royalty-free redistribution of the Program by
all those who receive copies directly or indirectly through you, then
the only way you could satisfy both it and this License would be to
refrain entirely from distribution of the Program.

If any portion of this section is held invalid or unenforceable under
any particular circumstance, the balance of the section is intended to
apply and the section as a whole is intended to apply in other
circumstances.

It is not the purpose of this section to induce you to infringe any
patents or other property right claims or to contest validity of any
such claims; this section has the sole purpose of protecting the
integrity of the free software distribution system, which is
implemented by public license practices.  Many people have made
generous contributions to the wide range of software distributed
through that system in reliance on consistent application of that
system; it is up to the author/donor to decide if he or she is willing
to distribute software through any other system and a licensee cannot
impose that choice.

This section is intended to make thoroughly clear what is believed to
be a consequence of the rest of this License.

  8. If the distribution and/or use of the Program is restricted in
certain countries either by patents or by copyrighted interfaces, the
original copyright holder who places the Program under this License
may add an explicit geographical distribution limitation excluding
those countries, so that distribution is permitted only in or among
countries not thus excluded.  In such case, this License incorporates
the limitation as if written in the body of this License.

  9. The Free Software Foundation may publish revised and/or new versions
of the General Public License from time to time.  Such new versions will
be similar in spirit to the present version, but may differ in detail to
address new problems or concerns.

Each version is given a distinguishing version number.  If the Program
specifies a version number of this License which applies to it and "any
later version", you have the option of following the terms and conditions
either of that version or of any later version published by the Free
Software Foundation.  If the Program does not specify a version number of
this License, you may choose any version ever published by the Free Software
Foundation.

  10. If you wish to incorporate parts of the Program into other free
programs whose distribution conditions are different, write to the author
to ask for permission.  For software which is copyrighted by the Free
Software Foundation, write to the Free Software Foundation; we sometimes
make exceptions for this.  Our decision will be guided by the two goals
of preserving the free status of all derivatives of our free software and
of promoting the sharing and reuse of software generally.

			    NO WARRANTY

  11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
REPAIR OR CORRECTION.

  12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
POSSIBILITY OF SUCH DAMAGES.

		     END OF TERMS AND CONDITIONS

	    How to Apply These Terms to Your New Programs

  If you develop a new program, and you want it to be of the greatest
possible use to the public, the best way to achieve this is to make it
free software which everyone can redistribute and change under these terms.

  To do so, attach the following notices to the program.  It is safest
to attach them to the start of each source file to most effectively
convey the exclusion of warranty; and each file should have at least
the "copyright" line and a pointer to where the full notice is found.

    <one line to give the program's name and a brief idea of what it does.>
    Copyright (C) <year>  <name of author>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA


Also add information on how to contact you by electronic and paper mail.

If the program is interactive, make it output a short notice like this
when it starts in an interactive mode:

    Gnomovision version 69, Copyright (C) year  name of author
    Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
    This is free software, and you are welcome to redistribute it
    under certain conditions; type `show c' for details.

The hypothetical commands `show w' and `show c' should show the appropriate
parts of the General Public License.  Of course, the commands you use may
be called something other than `show w' and `show c'; they could even be
mouse-clicks or menu items--whatever suits your program.

You should also get your employer (if you work as a programmer) or your
school, if any, to sign a "copyright disclaimer" for the program, if
necessary.  Here is a sample; alter the names:

  Yoyodyne, Inc., hereby disclaims all copyright interest in the program
  `Gnomovision' (which makes passes at compilers) written by James Hacker.

  <signature of Ty Coon>, 1 April 1989
  Ty Coon, President of Vice

This General Public License does not permit incorporating your program into
proprietary programs.  If your program is a subroutine library, you may
consider it more useful to permit linking proprietary applications with the
library.  If this is what you want to do, use the GNU Library General
Public License instead of this License.

x264-snapshot-20120103-2245-stable/AUTHORS
# Contributors to x264
# 
# The format of this file was inspired by the Linux kernel CREDITS file.
# Authors are listed alphabetically.
# 
# The fields are: name (N), email (E), web-address (W), CVS account login (C),
# PGP key ID and fingerprint (P), description (D), and snail-mail address (S).

N: Alex Izvorski
E: aizvorski AT gmail DOT com
D: x86 asm (sse2)

N: Alex Wright
E: alexw0885 AT gmail DOT com
D: Motion estimation (subpel and mixed refs)
D: B-RDO

N: bobololo
D: Avisynth input
D: MP4 muxing

N: Christian Heine
E: sennindemokrit AT gmx DOT net
D: x86 asm

N: David Wolstencroft
D: Altivec optimizations

N: Eric Petit
E: eric.petit AT lapsus DOT org
C: titer
D: Altivec asm
D: BeOS and Mac OS X ports
S: France

N: Gabriel Bouvigne
E: bouvigne AT mp3-tech DOT org
D: 2pass VBV

N: Guillaume Poirier
E: gpoirier CHEZ mplayerhq POINT hu
D: Altivec optimizations
S: Brittany, France

N: Henrik Gramner
E: hengar-6 AT student DOT ltu DOT se
D: 4:2:2 chroma subsampling, x86 asm
S: Sweden

N: Jason Garrett-Glaser
E: darkshikari AT gmail DOT com
D: x86 asm, 1pass VBV, adaptive quantization, inline asm
D: various speed optimizations, bugfixes
S: USA

N: Laurent Aimar
E: fenrir AT via.ecp DOT fr
C: fenrir
D: Initial import, former maintainer
D: x86 asm (mmx/mmx2)
S: France

N: Loren Merritt
E: lorenm AT u.washington DOT edu
C: pengvado
D: maintainer
D: All areas of encoder analysis and algorithms
D: Motion estimation, rate control, macroblock & frame decisions, RDO, etc.
D: Multithreading
D: x86 and x86_64 asm (mmx/mmx2/sse2)
S: USA

N: Mans Rullgard
E: mru AT inprovide DOT com
C: mru
D: Rate control
S: Southampton, UK

N: Michael Niedermayer
E: michaelni AT gmx DOT at
D: Rate control

N: Mike Matsnev
E: mike AT po DOT cs DOT msu DOT su
D: Matroska muxing

N: Min Chen
E: chenm001 AT 163 DOT com
C: chenm001
D: Win32/VC 6.0 port
D: gcc asm to nasm conversion
S: China

N: Phil Jensen
E: philj AT csufresno DOT edu
D: SPARC asm

N: Radek Czyz
E: radoslaw AT syskin DOT cjb DOT net
D: Cached motion compensation

N: Tuukka Toivonen
E: tuukkat AT ee DOT oulu DOT fi
D: Visualization


x264-snapshot-20120103-2245-stable/.gitignore
*~
*.a
*.diff
*.orig
*.rej
*.dll*
*.exe
*.def
*.lib
*.pdb
*.mo
*.o
*.patch
*.pc
*.pot
*.so*
*.dylib
.*.swp
.depend
.DS_Store
config.h
config.mak
config.log
x264_config.h
x264
checkasm

*.264
*.h264
*.2pass
*.ffindex
*.avs
*.mkv
*.flv
*.mp4
*.y4m
*.yuv
*.log
*.mbtree
*.temp
*.pyc

.digress_x264
dataDec.txt
log.dec