From 5afc794eed0583af8ac7b4e394b36bd5dea9b164 Mon Sep 17 00:00:00 2001
From: Craig R <megasaxon@gmail.com>
Date: Mon, 6 Jul 2015 11:37:20 +1000
Subject: [PATCH 01/21] Upgrade xxHash to r49

---
 xxhash.c | 1125 ++++++++++++++++++++++++++++++++++++++++--------------
 xxhash.h |  158 +++++---
 2 files changed, 960 insertions(+), 323 deletions(-)

diff --git a/xxhash.c b/xxhash.c
index 164f74d..e6fb8f1 100644
--- a/xxhash.c
+++ b/xxhash.c
@@ -1,342 +1,915 @@
 /*
-   xxHash - Fast Hash algorithm
-   Copyright (C) 2012, Yann Collet.
-   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-
-       * Redistributions of source code must retain the above copyright
-   notice, this list of conditions and the following disclaimer.
-       * Redistributions in binary form must reproduce the above
-   copyright notice, this list of conditions and the following disclaimer
-   in the documentation and/or other materials provided with the
-   distribution.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-	You can contact the author at :
-	- xxHash source repository : http://code.google.com/p/xxhash/
+xxHash - Fast Hash algorithm
+Copyright (C) 2012-2015, Yann Collet
+
+BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+You can contact the author at :
+- xxHash source repository : https://github.com/Cyan4973/xxHash
 */
 
 
+/**************************************
+*  Tuning parameters
+**************************************/
+/* Unaligned memory access is automatically enabled for "common" CPU, such as x86.
+ * For others CPU, the compiler will be more cautious, and insert extra code to ensure aligned access is respected.
+ * If you know your target CPU supports unaligned memory access, you want to force this option manually to improve performance.
+ * You can also enable this parameter if you know your input data will always be aligned (boundaries of 4, for U32).
+ */
+#if defined(__ARM_FEATURE_UNALIGNED) || defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)
+#  define XXH_USE_UNALIGNED_ACCESS 1
+#endif
 
-//**************************************
-// Tuning parameters
-//**************************************
-// FORCE_NATIVE_FORMAT :
-// By default, xxHash library provides endian-independant Hash values.
-// Results are therefore identical for big-endian and little-endian CPU.
-// This comes at a  performance cost for big-endian CPU, since some swapping is required to emulate little-endian format.
-// Should endian-independance be of no importance to your application, you may uncomment the #define below
-// It will improve speed for Big-endian CPU.
-// This option has no impact on Little_Endian CPU.
-//#define FORCE_NATIVE_FORMAT 1
-
+/* XXH_ACCEPT_NULL_INPUT_POINTER :
+ * If the input pointer is a null pointer, xxHash default behavior is to trigger a memory access error, since it is a bad pointer.
+ * When this option is enabled, xxHash output for null input pointers will be the same as a null-length input.
+ * By default, this option is disabled. To enable it, uncomment below define :
+ */
+/* #define XXH_ACCEPT_NULL_INPUT_POINTER 1 */
+
+/* XXH_FORCE_NATIVE_FORMAT :
+ * By default, xxHash library provides endian-independant Hash values, based on little-endian convention.
+ * Results are therefore identical for little-endian and big-endian CPU.
+ * This comes at a performance cost for big-endian CPU, since some swapping is required to emulate little-endian format.
+ * Should endian-independance be of no importance for your application, you may set the #define below to 1.
+ * It will improve speed for Big-endian CPU.
+ * This option has no impact on Little_Endian CPU.
+ */
+#define XXH_FORCE_NATIVE_FORMAT 0
+
+
+/**************************************
+*  Compiler Specific Options
+***************************************/
+#ifdef _MSC_VER    /* Visual Studio */
+#  pragma warning(disable : 4127)      /* disable: C4127: conditional expression is constant */
+#  define FORCE_INLINE static __forceinline
+#else
+#  if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
+#    ifdef __GNUC__
+#      define FORCE_INLINE static inline __attribute__((always_inline))
+#    else
+#      define FORCE_INLINE static inline
+#    endif
+#  else
+#    define FORCE_INLINE static
+#  endif /* __STDC_VERSION__ */
+#endif
 
 
-//**************************************
-// Includes
-//**************************************
-#include <stdlib.h>    // for malloc(), free()
-#include <string.h>    // for memcpy()
+/**************************************
+*  Includes & Memory related functions
+***************************************/
 #include "xxhash.h"
-
-
-
-//**************************************
-// CPU Feature Detection
-//**************************************
-// Little Endian or Big Endian ?
-// You can overwrite the #define below if you know your architecture endianess
-#if defined(FORCE_NATIVE_FORMAT) && (FORCE_NATIVE_FORMAT==1)
-// Force native format. The result will be endian dependant.
-#  define XXH_BIG_ENDIAN 0
-#elif defined (__GLIBC__)
-#  include <endian.h>
-#  if (__BYTE_ORDER == __BIG_ENDIAN)
-#     define XXH_BIG_ENDIAN 1
-#  endif
-#elif (defined(__BIG_ENDIAN__) || defined(__BIG_ENDIAN) || defined(_BIG_ENDIAN)) && !(defined(__LITTLE_ENDIAN__) || defined(__LITTLE_ENDIAN) || defined(_LITTLE_ENDIAN))
-#  define XXH_BIG_ENDIAN 1
-#elif defined(__sparc) || defined(__sparc__) \
-   || defined(__ppc__) || defined(_POWER) || defined(__powerpc__) || defined(_ARCH_PPC) || defined(__PPC__) || defined(__PPC) || defined(PPC) || defined(__powerpc__) || defined(__powerpc) || defined(powerpc) \
-   || defined(__hpux)  || defined(__hppa) \
-   || defined(_MIPSEB) || defined(__s390__)
-#  define XXH_BIG_ENDIAN 1
+/* Modify the local functions below should you wish to use some other memory routines */
+/* for malloc(), free() */
+#include <stdlib.h>
+static void* XXH_malloc(size_t s) { return malloc(s); }
+static void  XXH_free  (void* p)  { free(p); }
+/* for memcpy() */
+#include <string.h>
+static void* XXH_memcpy(void* dest, const void* src, size_t size) { return memcpy(dest,src,size); }
+
+
+/**************************************
+*  Basic Types
+***************************************/
+#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
+# include <stdint.h>
+  typedef uint8_t  BYTE;
+  typedef uint16_t U16;
+  typedef uint32_t U32;
+  typedef  int32_t S32;
+  typedef uint64_t U64;
+#else
+  typedef unsigned char      BYTE;
+  typedef unsigned short     U16;
+  typedef unsigned int       U32;
+  typedef   signed int       S32;
+  typedef unsigned long long U64;
 #endif
 
-#if !defined(XXH_BIG_ENDIAN)
-// Little Endian assumed. PDP Endian and other very rare endian format are unsupported.
-#  define XXH_BIG_ENDIAN 0
-#endif
+static U32 XXH_read32(const void* memPtr)
+{
+    U32 val32;
+    memcpy(&val32, memPtr, 4);
+    return val32;
+}
+
+static U64 XXH_read64(const void* memPtr)
+{
+    U64 val64;
+    memcpy(&val64, memPtr, 8);
+    return val64;
+}
 
 
 
-//**************************************
-// Compiler-specific Options & Functions
-//**************************************
+/******************************************
+*  Compiler-specific Functions and Macros
+******************************************/
 #define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
 
-// Note : under GCC, it may sometimes be faster to enable the (2nd) macro definition, instead of using win32 intrinsic
-#if defined(_WIN32)
+/* Note : although _rotl exists for minGW (GCC under windows), performance seems poor */
+#if defined(_MSC_VER)
 #  define XXH_rotl32(x,r) _rotl(x,r)
+#  define XXH_rotl64(x,r) _rotl64(x,r)
 #else
 #  define XXH_rotl32(x,r) ((x << r) | (x >> (32 - r)))
+#  define XXH_rotl64(x,r) ((x << r) | (x >> (64 - r)))
 #endif
 
-#if defined(_MSC_VER)     // Visual Studio
+#if defined(_MSC_VER)     /* Visual Studio */
 #  define XXH_swap32 _byteswap_ulong
+#  define XXH_swap64 _byteswap_uint64
 #elif GCC_VERSION >= 403
 #  define XXH_swap32 __builtin_bswap32
+#  define XXH_swap64 __builtin_bswap64
 #else
-static inline unsigned int XXH_swap32 (unsigned int x) {
-                        return  ((x << 24) & 0xff000000 ) |
-                                ((x <<  8) & 0x00ff0000 ) |
-                                ((x >>  8) & 0x0000ff00 ) |
-                                ((x >> 24) & 0x000000ff );
-                 }
+static U32 XXH_swap32 (U32 x)
+{
+    return  ((x << 24) & 0xff000000 ) |
+            ((x <<  8) & 0x00ff0000 ) |
+            ((x >>  8) & 0x0000ff00 ) |
+            ((x >> 24) & 0x000000ff );
+}
+static U64 XXH_swap64 (U64 x)
+{
+    return  ((x << 56) & 0xff00000000000000ULL) |
+            ((x << 40) & 0x00ff000000000000ULL) |
+            ((x << 24) & 0x0000ff0000000000ULL) |
+            ((x << 8)  & 0x000000ff00000000ULL) |
+            ((x >> 8)  & 0x00000000ff000000ULL) |
+            ((x >> 24) & 0x0000000000ff0000ULL) |
+            ((x >> 40) & 0x000000000000ff00ULL) |
+            ((x >> 56) & 0x00000000000000ffULL);
+}
 #endif
 
 
+/***************************************
+*  Architecture Macros
+***************************************/
+typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } XXH_endianess;
+#ifndef XXH_CPU_LITTLE_ENDIAN   /* XXH_CPU_LITTLE_ENDIAN can be defined externally, for example using a compiler switch */
+static const int one = 1;
+#   define XXH_CPU_LITTLE_ENDIAN   (*(const char*)(&one))
+#endif
+
+
+/*****************************
+*  Memory reads
+*****************************/
+typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment;
+
+FORCE_INLINE U32 XXH_readLE32_align(const void* ptr, XXH_endianess endian, XXH_alignment align)
+{
+    if (align==XXH_unaligned)
+        return endian==XXH_littleEndian ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr));
+    else
+        return endian==XXH_littleEndian ? *(const U32*)ptr : XXH_swap32(*(const U32*)ptr);
+}
+
+FORCE_INLINE U32 XXH_readLE32(const void* ptr, XXH_endianess endian)
+{
+    return XXH_readLE32_align(ptr, endian, XXH_unaligned);
+}
+
+FORCE_INLINE U64 XXH_readLE64_align(const void* ptr, XXH_endianess endian, XXH_alignment align)
+{
+    if (align==XXH_unaligned)
+        return endian==XXH_littleEndian ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr));
+    else
+        return endian==XXH_littleEndian ? *(const U64*)ptr : XXH_swap64(*(const U64*)ptr);
+}
 
-//**************************************
-// Constants
-//**************************************
+FORCE_INLINE U64 XXH_readLE64(const void* ptr, XXH_endianess endian)
+{
+    return XXH_readLE64_align(ptr, endian, XXH_unaligned);
+}
+
+
+/***************************************
+*  Macros
+***************************************/
+#define XXH_STATIC_ASSERT(c)   { enum { XXH_static_assert = 1/(!!(c)) }; }    /* use only *after* variable declarations */
+
+
+/***************************************
+*  Constants
+***************************************/
 #define PRIME32_1   2654435761U
 #define PRIME32_2   2246822519U
 #define PRIME32_3   3266489917U
 #define PRIME32_4    668265263U
 #define PRIME32_5    374761393U
 
+#define PRIME64_1 11400714785074694791ULL
+#define PRIME64_2 14029467366897019727ULL
+#define PRIME64_3  1609587929392839161ULL
+#define PRIME64_4  9650029242287828579ULL
+#define PRIME64_5  2870177450012600261ULL
 
 
-//**************************************
-// Macros
-//**************************************
-#define XXH_LE32(p)  (XXH_BIG_ENDIAN ? XXH_swap32(*(unsigned int*)(p)) : *(unsigned int*)(p))
+/*****************************
+*  Simple Hash Functions
+*****************************/
+FORCE_INLINE U32 XXH32_endian_align(const void* input, size_t len, U32 seed, XXH_endianess endian, XXH_alignment align)
+{
+    const BYTE* p = (const BYTE*)input;
+    const BYTE* bEnd = p + len;
+    U32 h32;
+#define XXH_get32bits(p) XXH_readLE32_align(p, endian, align)
+
+#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
+    if (p==NULL)
+    {
+        len=0;
+        bEnd=p=(const BYTE*)(size_t)16;
+    }
+#endif
+
+    if (len>=16)
+    {
+        const BYTE* const limit = bEnd - 16;
+        U32 v1 = seed + PRIME32_1 + PRIME32_2;
+        U32 v2 = seed + PRIME32_2;
+        U32 v3 = seed + 0;
+        U32 v4 = seed - PRIME32_1;
+
+        do
+        {
+            v1 += XXH_get32bits(p) * PRIME32_2;
+            v1 = XXH_rotl32(v1, 13);
+            v1 *= PRIME32_1;
+            p+=4;
+            v2 += XXH_get32bits(p) * PRIME32_2;
+            v2 = XXH_rotl32(v2, 13);
+            v2 *= PRIME32_1;
+            p+=4;
+            v3 += XXH_get32bits(p) * PRIME32_2;
+            v3 = XXH_rotl32(v3, 13);
+            v3 *= PRIME32_1;
+            p+=4;
+            v4 += XXH_get32bits(p) * PRIME32_2;
+            v4 = XXH_rotl32(v4, 13);
+            v4 *= PRIME32_1;
+            p+=4;
+        }
+        while (p<=limit);
+
+        h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
+    }
+    else
+    {
+        h32  = seed + PRIME32_5;
+    }
+
+    h32 += (U32) len;
+
+    while (p+4<=bEnd)
+    {
+        h32 += XXH_get32bits(p) * PRIME32_3;
+        h32  = XXH_rotl32(h32, 17) * PRIME32_4 ;
+        p+=4;
+    }
+
+    while (p<bEnd)
+    {
+        h32 += (*p) * PRIME32_5;
+        h32 = XXH_rotl32(h32, 11) * PRIME32_1 ;
+        p++;
+    }
+
+    h32 ^= h32 >> 15;
+    h32 *= PRIME32_2;
+    h32 ^= h32 >> 13;
+    h32 *= PRIME32_3;
+    h32 ^= h32 >> 16;
+
+    return h32;
+}
 
 
+unsigned XXH32 (const void* input, size_t len, unsigned seed)
+{
+#if 0
+    /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
+    XXH32_state_t state;
+    XXH32_reset(&state, seed);
+    XXH32_update(&state, input, len);
+    return XXH32_digest(&state);
+#else
+    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+#  if !defined(XXH_USE_UNALIGNED_ACCESS)
+    if ((((size_t)input) & 3) == 0)   /* Input is 4-bytes aligned, leverage the speed benefit */
+    {
+        if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+            return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned);
+        else
+            return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned);
+    }
+#  endif
 
-//****************************
-// Simple Hash Functions
-//****************************
+    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+        return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned);
+    else
+        return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned);
+#endif
+}
 
-unsigned int XXH32(const void* input, int len, unsigned int seed)
+FORCE_INLINE U64 XXH64_endian_align(const void* input, size_t len, U64 seed, XXH_endianess endian, XXH_alignment align)
+{
+    const BYTE* p = (const BYTE*)input;
+    const BYTE* bEnd = p + len;
+    U64 h64;
+#define XXH_get64bits(p) XXH_readLE64_align(p, endian, align)
+
+#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
+    if (p==NULL)
+    {
+        len=0;
+        bEnd=p=(const BYTE*)(size_t)32;
+    }
+#endif
+
+    if (len>=32)
+    {
+        const BYTE* const limit = bEnd - 32;
+        U64 v1 = seed + PRIME64_1 + PRIME64_2;
+        U64 v2 = seed + PRIME64_2;
+        U64 v3 = seed + 0;
+        U64 v4 = seed - PRIME64_1;
+
+        do
+        {
+            v1 += XXH_get64bits(p) * PRIME64_2;
+            p+=8;
+            v1 = XXH_rotl64(v1, 31);
+            v1 *= PRIME64_1;
+            v2 += XXH_get64bits(p) * PRIME64_2;
+            p+=8;
+            v2 = XXH_rotl64(v2, 31);
+            v2 *= PRIME64_1;
+            v3 += XXH_get64bits(p) * PRIME64_2;
+            p+=8;
+            v3 = XXH_rotl64(v3, 31);
+            v3 *= PRIME64_1;
+            v4 += XXH_get64bits(p) * PRIME64_2;
+            p+=8;
+            v4 = XXH_rotl64(v4, 31);
+            v4 *= PRIME64_1;
+        }
+        while (p<=limit);
+
+        h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
+
+        v1 *= PRIME64_2;
+        v1 = XXH_rotl64(v1, 31);
+        v1 *= PRIME64_1;
+        h64 ^= v1;
+        h64 = h64 * PRIME64_1 + PRIME64_4;
+
+        v2 *= PRIME64_2;
+        v2 = XXH_rotl64(v2, 31);
+        v2 *= PRIME64_1;
+        h64 ^= v2;
+        h64 = h64 * PRIME64_1 + PRIME64_4;
+
+        v3 *= PRIME64_2;
+        v3 = XXH_rotl64(v3, 31);
+        v3 *= PRIME64_1;
+        h64 ^= v3;
+        h64 = h64 * PRIME64_1 + PRIME64_4;
+
+        v4 *= PRIME64_2;
+        v4 = XXH_rotl64(v4, 31);
+        v4 *= PRIME64_1;
+        h64 ^= v4;
+        h64 = h64 * PRIME64_1 + PRIME64_4;
+    }
+    else
+    {
+        h64  = seed + PRIME64_5;
+    }
+
+    h64 += (U64) len;
+
+    while (p+8<=bEnd)
+    {
+        U64 k1 = XXH_get64bits(p);
+        k1 *= PRIME64_2;
+        k1 = XXH_rotl64(k1,31);
+        k1 *= PRIME64_1;
+        h64 ^= k1;
+        h64 = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4;
+        p+=8;
+    }
+
+    if (p+4<=bEnd)
+    {
+        h64 ^= (U64)(XXH_get32bits(p)) * PRIME64_1;
+        h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3;
+        p+=4;
+    }
+
+    while (p<bEnd)
+    {
+        h64 ^= (*p) * PRIME64_5;
+        h64 = XXH_rotl64(h64, 11) * PRIME64_1;
+        p++;
+    }
+
+    h64 ^= h64 >> 33;
+    h64 *= PRIME64_2;
+    h64 ^= h64 >> 29;
+    h64 *= PRIME64_3;
+    h64 ^= h64 >> 32;
+
+    return h64;
+}
+
+
+unsigned long long XXH64 (const void* input, size_t len, unsigned long long seed)
 {
 #if 0
-	// Simple version, good for code maintenance, but unfortunately slow for small inputs
-	void* state = XXH32_init(seed);
-	XXH32_feed(state, input, len);
-	return XXH32_result(state);
+    /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
+    XXH64_state_t state;
+    XXH64_reset(&state, seed);
+    XXH64_update(&state, input, len);
+    return XXH64_digest(&state);
 #else
+    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+#  if !defined(XXH_USE_UNALIGNED_ACCESS)
+    if ((((size_t)input) & 7)==0)   /* Input is aligned, let's leverage the speed advantage */
+    {
+        if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+            return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned);
+        else
+            return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned);
+    }
+#  endif
+
+    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+        return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned);
+    else
+        return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned);
+#endif
+}
 
-	const unsigned char* p = (const unsigned char*)input;
-	const unsigned char* const bEnd = p + len;
-	unsigned int h32;
-
-	if (len>=16)
-	{
-		const unsigned char* const limit = bEnd - 16;
-		unsigned int v1 = seed + PRIME32_1 + PRIME32_2;
-		unsigned int v2 = seed + PRIME32_2;
-		unsigned int v3 = seed + 0;
-		unsigned int v4 = seed - PRIME32_1;
-
-		do
-		{
-			v1 += XXH_LE32(p) * PRIME32_2; v1 = XXH_rotl32(v1, 13); v1 *= PRIME32_1; p+=4;
-			v2 += XXH_LE32(p) * PRIME32_2; v2 = XXH_rotl32(v2, 13); v2 *= PRIME32_1; p+=4;
-			v3 += XXH_LE32(p) * PRIME32_2; v3 = XXH_rotl32(v3, 13); v3 *= PRIME32_1; p+=4;
-			v4 += XXH_LE32(p) * PRIME32_2; v4 = XXH_rotl32(v4, 13); v4 *= PRIME32_1; p+=4;
-		} while (p<=limit) ;
-
-		h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
-	}
-	else
-	{
-		h32  = seed + PRIME32_5;
-	}
-
-	h32 += (unsigned int) len;
-	
-	while (p<=bEnd-4)
-	{
-		h32 += XXH_LE32(p) * PRIME32_3;
-		h32 = XXH_rotl32(h32, 17) * PRIME32_4 ;
-		p+=4;
-	}
-
-	while (p<bEnd)
-	{
-		h32 += (*p) * PRIME32_5;
-		h32 = XXH_rotl32(h32, 11) * PRIME32_1 ;
-		p++;
-	}
-
-	h32 ^= h32 >> 15;
-	h32 *= PRIME32_2;
-	h32 ^= h32 >> 13;
-	h32 *= PRIME32_3;
-	h32 ^= h32 >> 16;
-
-	return h32;
+/****************************************************
+*  Advanced Hash Functions
+****************************************************/
 
+/*** Allocation ***/
+typedef struct
+{
+    U64 total_len;
+    U32 seed;
+    U32 v1;
+    U32 v2;
+    U32 v3;
+    U32 v4;
+    U32 mem32[4];   /* defined as U32 for alignment */
+    U32 memsize;
+} XXH_istate32_t;
+
+typedef struct
+{
+    U64 total_len;
+    U64 seed;
+    U64 v1;
+    U64 v2;
+    U64 v3;
+    U64 v4;
+    U64 mem64[4];   /* defined as U64 for alignment */
+    U32 memsize;
+} XXH_istate64_t;
+
+
+XXH32_state_t* XXH32_createState(void)
+{
+    XXH_STATIC_ASSERT(sizeof(XXH32_state_t) >= sizeof(XXH_istate32_t));   /* A compilation error here means XXH32_state_t is not large enough */
+    return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t));
+}
+XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr)
+{
+    XXH_free(statePtr);
+    return XXH_OK;
+}
+
+XXH64_state_t* XXH64_createState(void)
+{
+    XXH_STATIC_ASSERT(sizeof(XXH64_state_t) >= sizeof(XXH_istate64_t));   /* A compilation error here means XXH64_state_t is not large enough */
+    return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t));
+}
+XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr)
+{
+    XXH_free(statePtr);
+    return XXH_OK;
+}
+
+
+/*** Hash feed ***/
+
+XXH_errorcode XXH32_reset(XXH32_state_t* state_in, U32 seed)
+{
+    XXH_istate32_t* state = (XXH_istate32_t*) state_in;
+    state->seed = seed;
+    state->v1 = seed + PRIME32_1 + PRIME32_2;
+    state->v2 = seed + PRIME32_2;
+    state->v3 = seed + 0;
+    state->v4 = seed - PRIME32_1;
+    state->total_len = 0;
+    state->memsize = 0;
+    return XXH_OK;
+}
+
+XXH_errorcode XXH64_reset(XXH64_state_t* state_in, unsigned long long seed)
+{
+    XXH_istate64_t* state = (XXH_istate64_t*) state_in;
+    state->seed = seed;
+    state->v1 = seed + PRIME64_1 + PRIME64_2;
+    state->v2 = seed + PRIME64_2;
+    state->v3 = seed + 0;
+    state->v4 = seed - PRIME64_1;
+    state->total_len = 0;
+    state->memsize = 0;
+    return XXH_OK;
+}
+
+
+FORCE_INLINE XXH_errorcode XXH32_update_endian (XXH32_state_t* state_in, const void* input, size_t len, XXH_endianess endian)
+{
+    XXH_istate32_t* state = (XXH_istate32_t *) state_in;
+    const BYTE* p = (const BYTE*)input;
+    const BYTE* const bEnd = p + len;
+
+#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
+    if (input==NULL) return XXH_ERROR;
 #endif
+
+    state->total_len += len;
+
+    if (state->memsize + len < 16)   /* fill in tmp buffer */
+    {
+        XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, len);
+        state->memsize += (U32)len;
+        return XXH_OK;
+    }
+
+    if (state->memsize)   /* some data left from previous update */
+    {
+        XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, 16-state->memsize);
+        {
+            const U32* p32 = state->mem32;
+            state->v1 += XXH_readLE32(p32, endian) * PRIME32_2;
+            state->v1 = XXH_rotl32(state->v1, 13);
+            state->v1 *= PRIME32_1;
+            p32++;
+            state->v2 += XXH_readLE32(p32, endian) * PRIME32_2;
+            state->v2 = XXH_rotl32(state->v2, 13);
+            state->v2 *= PRIME32_1;
+            p32++;
+            state->v3 += XXH_readLE32(p32, endian) * PRIME32_2;
+            state->v3 = XXH_rotl32(state->v3, 13);
+            state->v3 *= PRIME32_1;
+            p32++;
+            state->v4 += XXH_readLE32(p32, endian) * PRIME32_2;
+            state->v4 = XXH_rotl32(state->v4, 13);
+            state->v4 *= PRIME32_1;
+            p32++;
+        }
+        p += 16-state->memsize;
+        state->memsize = 0;
+    }
+
+    if (p <= bEnd-16)
+    {
+        const BYTE* const limit = bEnd - 16;
+        U32 v1 = state->v1;
+        U32 v2 = state->v2;
+        U32 v3 = state->v3;
+        U32 v4 = state->v4;
+
+        do
+        {
+            v1 += XXH_readLE32(p, endian) * PRIME32_2;
+            v1 = XXH_rotl32(v1, 13);
+            v1 *= PRIME32_1;
+            p+=4;
+            v2 += XXH_readLE32(p, endian) * PRIME32_2;
+            v2 = XXH_rotl32(v2, 13);
+            v2 *= PRIME32_1;
+            p+=4;
+            v3 += XXH_readLE32(p, endian) * PRIME32_2;
+            v3 = XXH_rotl32(v3, 13);
+            v3 *= PRIME32_1;
+            p+=4;
+            v4 += XXH_readLE32(p, endian) * PRIME32_2;
+            v4 = XXH_rotl32(v4, 13);
+            v4 *= PRIME32_1;
+            p+=4;
+        }
+        while (p<=limit);
+
+        state->v1 = v1;
+        state->v2 = v2;
+        state->v3 = v3;
+        state->v4 = v4;
+    }
+
+    if (p < bEnd)
+    {
+        XXH_memcpy(state->mem32, p, bEnd-p);
+        state->memsize = (int)(bEnd-p);
+    }
+
+    return XXH_OK;
 }
 
+XXH_errorcode XXH32_update (XXH32_state_t* state_in, const void* input, size_t len)
+{
+    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+        return XXH32_update_endian(state_in, input, len, XXH_littleEndian);
+    else
+        return XXH32_update_endian(state_in, input, len, XXH_bigEndian);
+}
 
-//****************************
-// Advanced Hash Functions
-//****************************
 
-struct XXH_state32_t
+
+FORCE_INLINE U32 XXH32_digest_endian (const XXH32_state_t* state_in, XXH_endianess endian)
 {
-	unsigned int seed;
-	unsigned int v1;
-	unsigned int v2;
-	unsigned int v3;
-	unsigned int v4;
-	unsigned long long total_len;
-	char memory[16];
-	int memsize;
-};
-
-
-void* XXH32_init (unsigned int seed)
+    const XXH_istate32_t* state = (const XXH_istate32_t*) state_in;
+    const BYTE * p = (const BYTE*)state->mem32;
+    const BYTE* bEnd = (const BYTE*)(state->mem32) + state->memsize;
+    U32 h32;
+
+    if (state->total_len >= 16)
+    {
+        h32 = XXH_rotl32(state->v1, 1) + XXH_rotl32(state->v2, 7) + XXH_rotl32(state->v3, 12) + XXH_rotl32(state->v4, 18);
+    }
+    else
+    {
+        h32  = state->seed + PRIME32_5;
+    }
+
+    h32 += (U32) state->total_len;
+
+    while (p+4<=bEnd)
+    {
+        h32 += XXH_readLE32(p, endian) * PRIME32_3;
+        h32  = XXH_rotl32(h32, 17) * PRIME32_4;
+        p+=4;
+    }
+
+    while (p<bEnd)
+    {
+        h32 += (*p) * PRIME32_5;
+        h32 = XXH_rotl32(h32, 11) * PRIME32_1;
+        p++;
+    }
+
+    h32 ^= h32 >> 15;
+    h32 *= PRIME32_2;
+    h32 ^= h32 >> 13;
+    h32 *= PRIME32_3;
+    h32 ^= h32 >> 16;
+
+    return h32;
+}
+
+
+U32 XXH32_digest (const XXH32_state_t* state_in)
 {
-	struct XXH_state32_t * state = (struct XXH_state32_t *) malloc ( sizeof(struct XXH_state32_t));
-	state->seed = seed;
-	state->v1 = seed + PRIME32_1 + PRIME32_2;
-	state->v2 = seed + PRIME32_2;
-	state->v3 = seed + 0;
-	state->v4 = seed - PRIME32_1;
-	state->total_len = 0;
-	state->memsize = 0;
-
-	return (void*)state;
+    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+        return XXH32_digest_endian(state_in, XXH_littleEndian);
+    else
+        return XXH32_digest_endian(state_in, XXH_bigEndian);
 }
 
 
-int   XXH32_feed (void* state_in, const void* input, int len)
+FORCE_INLINE XXH_errorcode XXH64_update_endian (XXH64_state_t* state_in, const void* input, size_t len, XXH_endianess endian)
+{
+    XXH_istate64_t * state = (XXH_istate64_t *) state_in;
+    const BYTE* p = (const BYTE*)input;
+    const BYTE* const bEnd = p + len;
+
+#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
+    if (input==NULL) return XXH_ERROR;
+#endif
+
+    state->total_len += len;
+
+    if (state->memsize + len < 32)   /* fill in tmp buffer */
+    {
+        XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, len);
+        state->memsize += (U32)len;
+        return XXH_OK;
+    }
+
+    if (state->memsize)   /* some data left from previous update */
+    {
+        XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, 32-state->memsize);
+        {
+            const U64* p64 = state->mem64;
+            state->v1 += XXH_readLE64(p64, endian) * PRIME64_2;
+            state->v1 = XXH_rotl64(state->v1, 31);
+            state->v1 *= PRIME64_1;
+            p64++;
+            state->v2 += XXH_readLE64(p64, endian) * PRIME64_2;
+            state->v2 = XXH_rotl64(state->v2, 31);
+            state->v2 *= PRIME64_1;
+            p64++;
+            state->v3 += XXH_readLE64(p64, endian) * PRIME64_2;
+            state->v3 = XXH_rotl64(state->v3, 31);
+            state->v3 *= PRIME64_1;
+            p64++;
+            state->v4 += XXH_readLE64(p64, endian) * PRIME64_2;
+            state->v4 = XXH_rotl64(state->v4, 31);
+            state->v4 *= PRIME64_1;
+            p64++;
+        }
+        p += 32-state->memsize;
+        state->memsize = 0;
+    }
+
+    if (p+32 <= bEnd)
+    {
+        const BYTE* const limit = bEnd - 32;
+        U64 v1 = state->v1;
+        U64 v2 = state->v2;
+        U64 v3 = state->v3;
+        U64 v4 = state->v4;
+
+        do
+        {
+            v1 += XXH_readLE64(p, endian) * PRIME64_2;
+            v1 = XXH_rotl64(v1, 31);
+            v1 *= PRIME64_1;
+            p+=8;
+            v2 += XXH_readLE64(p, endian) * PRIME64_2;
+            v2 = XXH_rotl64(v2, 31);
+            v2 *= PRIME64_1;
+            p+=8;
+            v3 += XXH_readLE64(p, endian) * PRIME64_2;
+            v3 = XXH_rotl64(v3, 31);
+            v3 *= PRIME64_1;
+            p+=8;
+            v4 += XXH_readLE64(p, endian) * PRIME64_2;
+            v4 = XXH_rotl64(v4, 31);
+            v4 *= PRIME64_1;
+            p+=8;
+        }
+        while (p<=limit);
+
+        state->v1 = v1;
+        state->v2 = v2;
+        state->v3 = v3;
+        state->v4 = v4;
+    }
+
+    if (p < bEnd)
+    {
+        XXH_memcpy(state->mem64, p, bEnd-p);
+        state->memsize = (int)(bEnd-p);
+    }
+
+    return XXH_OK;
+}
+
+XXH_errorcode XXH64_update (XXH64_state_t* state_in, const void* input, size_t len)
 {
-	struct XXH_state32_t * state = state_in;
-	const unsigned char* p = (const unsigned char*)input;
-	const unsigned char* const bEnd = p + len;
-
-	state->total_len += len;
-	
-	if (state->memsize + len < 16)   // fill in tmp buffer
-	{
-		memcpy(state->memory + state->memsize, input, len);
-		state->memsize +=  len;
-		return 0;
-	}
-
-	if (state->memsize)   // some data left from previous feed
-	{
-		memcpy(state->memory + state->memsize, input, 16-state->memsize);
-		{
-			const unsigned int* p32 = (const unsigned int*)state->memory;
-			state->v1 += XXH_LE32(p32) * PRIME32_2; state->v1 = XXH_rotl32(state->v1, 13); state->v1 *= PRIME32_1; p32++;
-			state->v2 += XXH_LE32(p32) * PRIME32_2; state->v2 = XXH_rotl32(state->v2, 13); state->v2 *= PRIME32_1; p32++; 
-			state->v3 += XXH_LE32(p32) * PRIME32_2; state->v3 = XXH_rotl32(state->v3, 13); state->v3 *= PRIME32_1; p32++;
-			state->v4 += XXH_LE32(p32) * PRIME32_2; state->v4 = XXH_rotl32(state->v4, 13); state->v4 *= PRIME32_1; p32++;
-		}
-		p += 16-state->memsize;
-		state->memsize = 0;
-	}
-
-	{
-		const unsigned char* const limit = bEnd - 16;
-		unsigned int v1 = state->v1;
-		unsigned int v2 = state->v2;
-		unsigned int v3 = state->v3;
-		unsigned int v4 = state->v4;
-
-		while (p<=limit)
-		{
-			v1 += XXH_LE32(p) * PRIME32_2; v1 = XXH_rotl32(v1, 13); v1 *= PRIME32_1; p+=4;
-			v2 += XXH_LE32(p) * PRIME32_2; v2 = XXH_rotl32(v2, 13); v2 *= PRIME32_1; p+=4;
-			v3 += XXH_LE32(p) * PRIME32_2; v3 = XXH_rotl32(v3, 13); v3 *= PRIME32_1; p+=4;
-			v4 += XXH_LE32(p) * PRIME32_2; v4 = XXH_rotl32(v4, 13); v4 *= PRIME32_1; p+=4;
-		}  
-
-		state->v1 = v1;
-		state->v2 = v2;
-		state->v3 = v3;
-		state->v4 = v4;
-	}
-
-	if (p < bEnd)
-	{
-		memcpy(state->memory, p, bEnd-p);
-		state->memsize = bEnd-p;
-	}
-
-	return 0;
+    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+        return XXH64_update_endian(state_in, input, len, XXH_littleEndian);
+    else
+        return XXH64_update_endian(state_in, input, len, XXH_bigEndian);
 }
 
 
-unsigned int XXH32_getIntermediateResult (void* state_in)
+
+FORCE_INLINE U64 XXH64_digest_endian (const XXH64_state_t* state_in, XXH_endianess endian)
 {
-	struct XXH_state32_t * state = state_in;
-	unsigned char * p   = (unsigned char*)state->memory;
-	unsigned char* bEnd = (unsigned char*)state->memory + state->memsize;
-	unsigned int h32;
-
-
-	if (state->total_len >= 16)
-	{
-		h32 = XXH_rotl32(state->v1, 1) + XXH_rotl32(state->v2, 7) + XXH_rotl32(state->v3, 12) + XXH_rotl32(state->v4, 18);
-	}
-	else
-	{
-		h32  = state->seed + PRIME32_5;
-	}
-
-	h32 += (unsigned int) state->total_len;
-	
-	while (p<=bEnd-4)
-	{
-		h32 += XXH_LE32(p) * PRIME32_3;
-		h32 = XXH_rotl32(h32, 17) * PRIME32_4 ;
-		p+=4;
-	}
-
-	while (p<bEnd)
-	{
-		h32 += (*p) * PRIME32_5;
-		h32 = XXH_rotl32(h32, 11) * PRIME32_1 ;
-		p++;
-	}
-
-	h32 ^= h32 >> 15;
-	h32 *= PRIME32_2;
-	h32 ^= h32 >> 13;
-	h32 *= PRIME32_3;
-	h32 ^= h32 >> 16;
-
-	return h32;
+    const XXH_istate64_t * state = (const XXH_istate64_t *) state_in;
+    const BYTE * p = (const BYTE*)state->mem64;
+    const BYTE* bEnd = (const BYTE*)state->mem64 + state->memsize;
+    U64 h64;
+
+    if (state->total_len >= 32)
+    {
+        U64 v1 = state->v1;
+        U64 v2 = state->v2;
+        U64 v3 = state->v3;
+        U64 v4 = state->v4;
+
+        h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
+
+        v1 *= PRIME64_2;
+        v1 = XXH_rotl64(v1, 31);
+        v1 *= PRIME64_1;
+        h64 ^= v1;
+        h64 = h64*PRIME64_1 + PRIME64_4;
+
+        v2 *= PRIME64_2;
+        v2 = XXH_rotl64(v2, 31);
+        v2 *= PRIME64_1;
+        h64 ^= v2;
+        h64 = h64*PRIME64_1 + PRIME64_4;
+
+        v3 *= PRIME64_2;
+        v3 = XXH_rotl64(v3, 31);
+        v3 *= PRIME64_1;
+        h64 ^= v3;
+        h64 = h64*PRIME64_1 + PRIME64_4;
+
+        v4 *= PRIME64_2;
+        v4 = XXH_rotl64(v4, 31);
+        v4 *= PRIME64_1;
+        h64 ^= v4;
+        h64 = h64*PRIME64_1 + PRIME64_4;
+    }
+    else
+    {
+        h64  = state->seed + PRIME64_5;
+    }
+
+    h64 += (U64) state->total_len;
+
+    while (p+8<=bEnd)
+    {
+        U64 k1 = XXH_readLE64(p, endian);
+        k1 *= PRIME64_2;
+        k1 = XXH_rotl64(k1,31);
+        k1 *= PRIME64_1;
+        h64 ^= k1;
+        h64 = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4;
+        p+=8;
+    }
+
+    if (p+4<=bEnd)
+    {
+        h64 ^= (U64)(XXH_readLE32(p, endian)) * PRIME64_1;
+        h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3;
+        p+=4;
+    }
+
+    while (p<bEnd)
+    {
+        h64 ^= (*p) * PRIME64_5;
+        h64 = XXH_rotl64(h64, 11) * PRIME64_1;
+        p++;
+    }
+
+    h64 ^= h64 >> 33;
+    h64 *= PRIME64_2;
+    h64 ^= h64 >> 29;
+    h64 *= PRIME64_3;
+    h64 ^= h64 >> 32;
+
+    return h64;
 }
 
 
-unsigned int XXH32_result (void* state_in)
+unsigned long long XXH64_digest (const XXH64_state_t* state_in)
 {
-    unsigned int h32 = XXH32_getIntermediateResult(state_in);
+    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+        return XXH64_digest_endian(state_in, XXH_littleEndian);
+    else
+        return XXH64_digest_endian(state_in, XXH_bigEndian);
+}
 
-	free(state_in);
 
-	return h32;
-}
\ No newline at end of file
diff --git a/xxhash.h b/xxhash.h
index a11e303..c60aa61 100644
--- a/xxhash.h
+++ b/xxhash.h
@@ -1,20 +1,21 @@
 /*
-   xxHash - Fast Hash algorithm
+   xxHash - Extremely Fast Hash algorithm
    Header File
-   Copyright (C) 2012, Yann Collet.
+   Copyright (C) 2012-2015, Yann Collet.
+
    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
-  
+
        * Redistributions of source code must retain the above copyright
    notice, this list of conditions and the following disclaimer.
        * Redistributions in binary form must reproduce the above
    copyright notice, this list of conditions and the following disclaimer
    in the documentation and/or other materials provided with the
    distribution.
-  
+
    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -27,8 +28,8 @@
    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-	You can contact the author at :
-	- xxHash source repository : http://code.google.com/p/xxhash/
+   You can contact the author at :
+   - xxHash source repository : https://github.com/Cyan4973/xxHash
 */
 
 /* Notice extracted from xxHash homepage :
@@ -52,9 +53,15 @@ CRC32           0.43 GB/s     9
 MD5-32          0.33 GB/s    10       Ronald L. Rivest
 SHA1-32         0.28 GB/s    10
 
-Q.Score is a measure of quality of the hash function. 
-It depends on successfully passing SMHasher test set. 
+Q.Score is a measure of quality of the hash function.
+It depends on successfully passing SMHasher test set.
 10 is a perfect score.
+
+A 64-bits version, named XXH64, is available since r35.
+It offers much better speed, but for 64-bits applications only.
+Name     Speed on 64 bits    Speed on 32 bits
+XXH64       13.8 GB/s            1.9 GB/s
+XXH32        6.8 GB/s            6.0 GB/s
 */
 
 #pragma once
@@ -64,65 +71,122 @@ extern "C" {
 #endif
 
 
-//****************************
-// Simple Hash Functions
-//****************************
+/*****************************
+*  Definitions
+*****************************/
+#include <stddef.h>   /* size_t */
+typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;
+
+
+/*****************************
+*  Namespace Emulation
+*****************************/
+/* Motivations :
+
+If you need to include xxHash into your library,
+but wish to avoid xxHash symbols to be present on your library interface
+in an effort to avoid potential name collision if another library also includes xxHash,
+
+you can use XXH_NAMESPACE, which will automatically prefix any symbol from xxHash
+with the value of XXH_NAMESPACE (so avoid to keep it NULL, and avoid numeric values).
+
+Note that no change is required within the calling program :
+it can still call xxHash functions using their regular name.
+They will be automatically translated by this header.
+*/
+#ifdef XXH_NAMESPACE
+#  define XXH_CAT(A,B) A##B
+#  define XXH_NAME2(A,B) XXH_CAT(A,B)
+#  define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32)
+#  define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64)
+#  define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState)
+#  define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState)
+#  define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState)
+#  define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState)
+#  define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset)
+#  define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset)
+#  define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update)
+#  define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update)
+#  define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest)
+#  define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest)
+#endif
+
+
+/*****************************
+*  Simple Hash Functions
+*****************************/
 
-unsigned int XXH32 (const void* input, int len, unsigned int seed);
+unsigned int       XXH32 (const void* input, size_t length, unsigned seed);
+unsigned long long XXH64 (const void* input, size_t length, unsigned long long seed);
 
 /*
 XXH32() :
-	Calculate the 32-bits hash of "input", of length "len"
-	"seed" can be used to alter the result
-	This function successfully passes all SMHasher tests.
-	Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s
-	Note that "len" is type "int", which means it is limited to 2^31-1.
-	If your data is larger, use the advanced functions below.
+    Calculate the 32-bits hash of sequence "length" bytes stored at memory address "input".
+    The memory between input & input+length must be valid (allocated and read-accessible).
+    "seed" can be used to alter the result predictably.
+    This function successfully passes all SMHasher tests.
+    Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s
+XXH64() :
+    Calculate the 64-bits hash of sequence of length "len" stored at memory address "input".
+    Faster on 64-bits systems. Slower on 32-bits systems.
 */
 
 
 
-//****************************
-// Advanced Hash Functions
-//****************************
-
-void*        XXH32_init   (unsigned int seed);
-int          XXH32_feed   (void* state, const void* input, int len);
-unsigned int XXH32_result (void* state);
+/*****************************
+*  Advanced Hash Functions
+*****************************/
+typedef struct { long long ll[ 6]; } XXH32_state_t;
+typedef struct { long long ll[11]; } XXH64_state_t;
 
 /*
-These functions calculate the xxhash of an input provided in several small packets,
-as opposed to an input provided as a single block.
+These structures allow static allocation of XXH states.
+States must then be initialized using XXHnn_reset() before first use.
 
-You must start with :
-void* XXH32_init()
-The function returns a pointer which holds the state of calculation.
+If you prefer dynamic allocation, please refer to functions below.
+*/
 
-This pointer must be provided as "void* state" parameter for XXH32_feed().
-XXH32_feed() can be called as many times as necessary.
-The function returns an error code, with 0 meaning OK, and all other values meaning there is an error.
-Note that "len" is type "int", which means it is limited to 2^31-1. 
-If your data is larger, it is recommended
-to chunk your data into blocks of size 2^30 (1GB) to avoid any "int" overflow issue.
+XXH32_state_t* XXH32_createState(void);
+XXH_errorcode  XXH32_freeState(XXH32_state_t* statePtr);
 
-Finally, you can end the calculation anytime, by using XXH32_result().
-This function returns the final 32-bits hash.
-You must provide the same "void* state" parameter created by XXH32_init().
+XXH64_state_t* XXH64_createState(void);
+XXH_errorcode  XXH64_freeState(XXH64_state_t* statePtr);
 
-Memory will be freed by XXH32_result().
+/*
+These functions create and release memory for XXH state.
+States must then be initialized using XXHnn_reset() before first use.
 */
 
 
-unsigned int XXH32_getIntermediateResult (void* state);
+XXH_errorcode XXH32_reset  (XXH32_state_t* statePtr, unsigned seed);
+XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length);
+unsigned int  XXH32_digest (const XXH32_state_t* statePtr);
+
+XXH_errorcode      XXH64_reset  (XXH64_state_t* statePtr, unsigned long long seed);
+XXH_errorcode      XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length);
+unsigned long long XXH64_digest (const XXH64_state_t* statePtr);
+
 /*
-This function does the same as XXH32_result(), generating a 32-bit hash,
-but preserve memory context.
-This way, it becomes possible to generate intermediate hashes, and then continue feeding data with XXH32_feed().
-To free memory context, use XXH32_result().
-*/
+These functions calculate the xxHash of an input provided in multiple smaller packets,
+as opposed to an input provided as a single block.
 
+XXH state space must first be allocated, using either static or dynamic method provided above.
+
+Start a new hash by initializing state with a seed, using XXHnn_reset().
+
+Then, feed the hash state by calling XXHnn_update() as many times as necessary.
+Obviously, input must be valid, meaning allocated and read accessible.
+The function returns an error code, with 0 meaning OK, and any other value meaning there is an error.
+
+Finally, you can produce a hash anytime, by using XXHnn_digest().
+This function returns the final nn-bits hash.
+You can nonetheless continue feeding the hash state with more input,
+and therefore get some new hashes, by calling again XXHnn_digest().
+
+When you are done, don't forget to free XXH state space, using typically XXHnn_freeState().
+*/
 
 
 #if defined (__cplusplus)
 }
-#endif
\ No newline at end of file
+#endif

From e84410e6336ed890a184183a150f7673c8b8ac98 Mon Sep 17 00:00:00 2001
From: Craig R <megasaxon@gmail.com>
Date: Mon, 6 Jul 2015 11:37:20 +1000
Subject: [PATCH 02/21] Upgrade xxHash to r40

---
 xxhash.c | 1125 ++++++++++++++++++++++++++++++++++++++++--------------
 xxhash.h |  158 +++++---
 2 files changed, 960 insertions(+), 323 deletions(-)

diff --git a/xxhash.c b/xxhash.c
index 164f74d..e6fb8f1 100644
--- a/xxhash.c
+++ b/xxhash.c
@@ -1,342 +1,915 @@
 /*
-   xxHash - Fast Hash algorithm
-   Copyright (C) 2012, Yann Collet.
-   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-
-       * Redistributions of source code must retain the above copyright
-   notice, this list of conditions and the following disclaimer.
-       * Redistributions in binary form must reproduce the above
-   copyright notice, this list of conditions and the following disclaimer
-   in the documentation and/or other materials provided with the
-   distribution.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-	You can contact the author at :
-	- xxHash source repository : http://code.google.com/p/xxhash/
+xxHash - Fast Hash algorithm
+Copyright (C) 2012-2015, Yann Collet
+
+BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+You can contact the author at :
+- xxHash source repository : https://github.com/Cyan4973/xxHash
 */
 
 
+/**************************************
+*  Tuning parameters
+**************************************/
+/* Unaligned memory access is automatically enabled for "common" CPU, such as x86.
+ * For others CPU, the compiler will be more cautious, and insert extra code to ensure aligned access is respected.
+ * If you know your target CPU supports unaligned memory access, you want to force this option manually to improve performance.
+ * You can also enable this parameter if you know your input data will always be aligned (boundaries of 4, for U32).
+ */
+#if defined(__ARM_FEATURE_UNALIGNED) || defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)
+#  define XXH_USE_UNALIGNED_ACCESS 1
+#endif
 
-//**************************************
-// Tuning parameters
-//**************************************
-// FORCE_NATIVE_FORMAT :
-// By default, xxHash library provides endian-independant Hash values.
-// Results are therefore identical for big-endian and little-endian CPU.
-// This comes at a  performance cost for big-endian CPU, since some swapping is required to emulate little-endian format.
-// Should endian-independance be of no importance to your application, you may uncomment the #define below
-// It will improve speed for Big-endian CPU.
-// This option has no impact on Little_Endian CPU.
-//#define FORCE_NATIVE_FORMAT 1
-
+/* XXH_ACCEPT_NULL_INPUT_POINTER :
+ * If the input pointer is a null pointer, xxHash default behavior is to trigger a memory access error, since it is a bad pointer.
+ * When this option is enabled, xxHash output for null input pointers will be the same as a null-length input.
+ * By default, this option is disabled. To enable it, uncomment below define :
+ */
+/* #define XXH_ACCEPT_NULL_INPUT_POINTER 1 */
+
+/* XXH_FORCE_NATIVE_FORMAT :
+ * By default, xxHash library provides endian-independant Hash values, based on little-endian convention.
+ * Results are therefore identical for little-endian and big-endian CPU.
+ * This comes at a performance cost for big-endian CPU, since some swapping is required to emulate little-endian format.
+ * Should endian-independance be of no importance for your application, you may set the #define below to 1.
+ * It will improve speed for Big-endian CPU.
+ * This option has no impact on Little_Endian CPU.
+ */
+#define XXH_FORCE_NATIVE_FORMAT 0
+
+
+/**************************************
+*  Compiler Specific Options
+***************************************/
+#ifdef _MSC_VER    /* Visual Studio */
+#  pragma warning(disable : 4127)      /* disable: C4127: conditional expression is constant */
+#  define FORCE_INLINE static __forceinline
+#else
+#  if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
+#    ifdef __GNUC__
+#      define FORCE_INLINE static inline __attribute__((always_inline))
+#    else
+#      define FORCE_INLINE static inline
+#    endif
+#  else
+#    define FORCE_INLINE static
+#  endif /* __STDC_VERSION__ */
+#endif
 
 
-//**************************************
-// Includes
-//**************************************
-#include <stdlib.h>    // for malloc(), free()
-#include <string.h>    // for memcpy()
+/**************************************
+*  Includes & Memory related functions
+***************************************/
 #include "xxhash.h"
-
-
-
-//**************************************
-// CPU Feature Detection
-//**************************************
-// Little Endian or Big Endian ?
-// You can overwrite the #define below if you know your architecture endianess
-#if defined(FORCE_NATIVE_FORMAT) && (FORCE_NATIVE_FORMAT==1)
-// Force native format. The result will be endian dependant.
-#  define XXH_BIG_ENDIAN 0
-#elif defined (__GLIBC__)
-#  include <endian.h>
-#  if (__BYTE_ORDER == __BIG_ENDIAN)
-#     define XXH_BIG_ENDIAN 1
-#  endif
-#elif (defined(__BIG_ENDIAN__) || defined(__BIG_ENDIAN) || defined(_BIG_ENDIAN)) && !(defined(__LITTLE_ENDIAN__) || defined(__LITTLE_ENDIAN) || defined(_LITTLE_ENDIAN))
-#  define XXH_BIG_ENDIAN 1
-#elif defined(__sparc) || defined(__sparc__) \
-   || defined(__ppc__) || defined(_POWER) || defined(__powerpc__) || defined(_ARCH_PPC) || defined(__PPC__) || defined(__PPC) || defined(PPC) || defined(__powerpc__) || defined(__powerpc) || defined(powerpc) \
-   || defined(__hpux)  || defined(__hppa) \
-   || defined(_MIPSEB) || defined(__s390__)
-#  define XXH_BIG_ENDIAN 1
+/* Modify the local functions below should you wish to use some other memory routines */
+/* for malloc(), free() */
+#include <stdlib.h>
+static void* XXH_malloc(size_t s) { return malloc(s); }
+static void  XXH_free  (void* p)  { free(p); }
+/* for memcpy() */
+#include <string.h>
+static void* XXH_memcpy(void* dest, const void* src, size_t size) { return memcpy(dest,src,size); }
+
+
+/**************************************
+*  Basic Types
+***************************************/
+#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
+# include <stdint.h>
+  typedef uint8_t  BYTE;
+  typedef uint16_t U16;
+  typedef uint32_t U32;
+  typedef  int32_t S32;
+  typedef uint64_t U64;
+#else
+  typedef unsigned char      BYTE;
+  typedef unsigned short     U16;
+  typedef unsigned int       U32;
+  typedef   signed int       S32;
+  typedef unsigned long long U64;
 #endif
 
-#if !defined(XXH_BIG_ENDIAN)
-// Little Endian assumed. PDP Endian and other very rare endian format are unsupported.
-#  define XXH_BIG_ENDIAN 0
-#endif
+static U32 XXH_read32(const void* memPtr)
+{
+    U32 val32;
+    memcpy(&val32, memPtr, 4);
+    return val32;
+}
+
+static U64 XXH_read64(const void* memPtr)
+{
+    U64 val64;
+    memcpy(&val64, memPtr, 8);
+    return val64;
+}
 
 
 
-//**************************************
-// Compiler-specific Options & Functions
-//**************************************
+/******************************************
+*  Compiler-specific Functions and Macros
+******************************************/
 #define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
 
-// Note : under GCC, it may sometimes be faster to enable the (2nd) macro definition, instead of using win32 intrinsic
-#if defined(_WIN32)
+/* Note : although _rotl exists for minGW (GCC under windows), performance seems poor */
+#if defined(_MSC_VER)
 #  define XXH_rotl32(x,r) _rotl(x,r)
+#  define XXH_rotl64(x,r) _rotl64(x,r)
 #else
 #  define XXH_rotl32(x,r) ((x << r) | (x >> (32 - r)))
+#  define XXH_rotl64(x,r) ((x << r) | (x >> (64 - r)))
 #endif
 
-#if defined(_MSC_VER)     // Visual Studio
+#if defined(_MSC_VER)     /* Visual Studio */
 #  define XXH_swap32 _byteswap_ulong
+#  define XXH_swap64 _byteswap_uint64
 #elif GCC_VERSION >= 403
 #  define XXH_swap32 __builtin_bswap32
+#  define XXH_swap64 __builtin_bswap64
 #else
-static inline unsigned int XXH_swap32 (unsigned int x) {
-                        return  ((x << 24) & 0xff000000 ) |
-                                ((x <<  8) & 0x00ff0000 ) |
-                                ((x >>  8) & 0x0000ff00 ) |
-                                ((x >> 24) & 0x000000ff );
-                 }
+static U32 XXH_swap32 (U32 x)
+{
+    return  ((x << 24) & 0xff000000 ) |
+            ((x <<  8) & 0x00ff0000 ) |
+            ((x >>  8) & 0x0000ff00 ) |
+            ((x >> 24) & 0x000000ff );
+}
+static U64 XXH_swap64 (U64 x)
+{
+    return  ((x << 56) & 0xff00000000000000ULL) |
+            ((x << 40) & 0x00ff000000000000ULL) |
+            ((x << 24) & 0x0000ff0000000000ULL) |
+            ((x << 8)  & 0x000000ff00000000ULL) |
+            ((x >> 8)  & 0x00000000ff000000ULL) |
+            ((x >> 24) & 0x0000000000ff0000ULL) |
+            ((x >> 40) & 0x000000000000ff00ULL) |
+            ((x >> 56) & 0x00000000000000ffULL);
+}
 #endif
 
 
+/***************************************
+*  Architecture Macros
+***************************************/
+typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } XXH_endianess;
+#ifndef XXH_CPU_LITTLE_ENDIAN   /* XXH_CPU_LITTLE_ENDIAN can be defined externally, for example using a compiler switch */
+static const int one = 1;
+#   define XXH_CPU_LITTLE_ENDIAN   (*(const char*)(&one))
+#endif
+
+
+/*****************************
+*  Memory reads
+*****************************/
+typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment;
+
+FORCE_INLINE U32 XXH_readLE32_align(const void* ptr, XXH_endianess endian, XXH_alignment align)
+{
+    if (align==XXH_unaligned)
+        return endian==XXH_littleEndian ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr));
+    else
+        return endian==XXH_littleEndian ? *(const U32*)ptr : XXH_swap32(*(const U32*)ptr);
+}
+
+FORCE_INLINE U32 XXH_readLE32(const void* ptr, XXH_endianess endian)
+{
+    return XXH_readLE32_align(ptr, endian, XXH_unaligned);
+}
+
+FORCE_INLINE U64 XXH_readLE64_align(const void* ptr, XXH_endianess endian, XXH_alignment align)
+{
+    if (align==XXH_unaligned)
+        return endian==XXH_littleEndian ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr));
+    else
+        return endian==XXH_littleEndian ? *(const U64*)ptr : XXH_swap64(*(const U64*)ptr);
+}
 
-//**************************************
-// Constants
-//**************************************
+FORCE_INLINE U64 XXH_readLE64(const void* ptr, XXH_endianess endian)
+{
+    return XXH_readLE64_align(ptr, endian, XXH_unaligned);
+}
+
+
+/***************************************
+*  Macros
+***************************************/
+#define XXH_STATIC_ASSERT(c)   { enum { XXH_static_assert = 1/(!!(c)) }; }    /* use only *after* variable declarations */
+
+
+/***************************************
+*  Constants
+***************************************/
 #define PRIME32_1   2654435761U
 #define PRIME32_2   2246822519U
 #define PRIME32_3   3266489917U
 #define PRIME32_4    668265263U
 #define PRIME32_5    374761393U
 
+#define PRIME64_1 11400714785074694791ULL
+#define PRIME64_2 14029467366897019727ULL
+#define PRIME64_3  1609587929392839161ULL
+#define PRIME64_4  9650029242287828579ULL
+#define PRIME64_5  2870177450012600261ULL
 
 
-//**************************************
-// Macros
-//**************************************
-#define XXH_LE32(p)  (XXH_BIG_ENDIAN ? XXH_swap32(*(unsigned int*)(p)) : *(unsigned int*)(p))
+/*****************************
+*  Simple Hash Functions
+*****************************/
+FORCE_INLINE U32 XXH32_endian_align(const void* input, size_t len, U32 seed, XXH_endianess endian, XXH_alignment align)
+{
+    const BYTE* p = (const BYTE*)input;
+    const BYTE* bEnd = p + len;
+    U32 h32;
+#define XXH_get32bits(p) XXH_readLE32_align(p, endian, align)
+
+#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
+    if (p==NULL)
+    {
+        len=0;
+        bEnd=p=(const BYTE*)(size_t)16;
+    }
+#endif
+
+    if (len>=16)
+    {
+        const BYTE* const limit = bEnd - 16;
+        U32 v1 = seed + PRIME32_1 + PRIME32_2;
+        U32 v2 = seed + PRIME32_2;
+        U32 v3 = seed + 0;
+        U32 v4 = seed - PRIME32_1;
+
+        do
+        {
+            v1 += XXH_get32bits(p) * PRIME32_2;
+            v1 = XXH_rotl32(v1, 13);
+            v1 *= PRIME32_1;
+            p+=4;
+            v2 += XXH_get32bits(p) * PRIME32_2;
+            v2 = XXH_rotl32(v2, 13);
+            v2 *= PRIME32_1;
+            p+=4;
+            v3 += XXH_get32bits(p) * PRIME32_2;
+            v3 = XXH_rotl32(v3, 13);
+            v3 *= PRIME32_1;
+            p+=4;
+            v4 += XXH_get32bits(p) * PRIME32_2;
+            v4 = XXH_rotl32(v4, 13);
+            v4 *= PRIME32_1;
+            p+=4;
+        }
+        while (p<=limit);
+
+        h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
+    }
+    else
+    {
+        h32  = seed + PRIME32_5;
+    }
+
+    h32 += (U32) len;
+
+    while (p+4<=bEnd)
+    {
+        h32 += XXH_get32bits(p) * PRIME32_3;
+        h32  = XXH_rotl32(h32, 17) * PRIME32_4 ;
+        p+=4;
+    }
+
+    while (p<bEnd)
+    {
+        h32 += (*p) * PRIME32_5;
+        h32 = XXH_rotl32(h32, 11) * PRIME32_1 ;
+        p++;
+    }
+
+    h32 ^= h32 >> 15;
+    h32 *= PRIME32_2;
+    h32 ^= h32 >> 13;
+    h32 *= PRIME32_3;
+    h32 ^= h32 >> 16;
+
+    return h32;
+}
 
 
+unsigned XXH32 (const void* input, size_t len, unsigned seed)
+{
+#if 0
+    /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
+    XXH32_state_t state;
+    XXH32_reset(&state, seed);
+    XXH32_update(&state, input, len);
+    return XXH32_digest(&state);
+#else
+    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+#  if !defined(XXH_USE_UNALIGNED_ACCESS)
+    if ((((size_t)input) & 3) == 0)   /* Input is 4-bytes aligned, leverage the speed benefit */
+    {
+        if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+            return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned);
+        else
+            return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned);
+    }
+#  endif
 
-//****************************
-// Simple Hash Functions
-//****************************
+    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+        return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned);
+    else
+        return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned);
+#endif
+}
 
-unsigned int XXH32(const void* input, int len, unsigned int seed)
+FORCE_INLINE U64 XXH64_endian_align(const void* input, size_t len, U64 seed, XXH_endianess endian, XXH_alignment align)
+{
+    const BYTE* p = (const BYTE*)input;
+    const BYTE* bEnd = p + len;
+    U64 h64;
+#define XXH_get64bits(p) XXH_readLE64_align(p, endian, align)
+
+#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
+    if (p==NULL)
+    {
+        len=0;
+        bEnd=p=(const BYTE*)(size_t)32;
+    }
+#endif
+
+    if (len>=32)
+    {
+        const BYTE* const limit = bEnd - 32;
+        U64 v1 = seed + PRIME64_1 + PRIME64_2;
+        U64 v2 = seed + PRIME64_2;
+        U64 v3 = seed + 0;
+        U64 v4 = seed - PRIME64_1;
+
+        do
+        {
+            v1 += XXH_get64bits(p) * PRIME64_2;
+            p+=8;
+            v1 = XXH_rotl64(v1, 31);
+            v1 *= PRIME64_1;
+            v2 += XXH_get64bits(p) * PRIME64_2;
+            p+=8;
+            v2 = XXH_rotl64(v2, 31);
+            v2 *= PRIME64_1;
+            v3 += XXH_get64bits(p) * PRIME64_2;
+            p+=8;
+            v3 = XXH_rotl64(v3, 31);
+            v3 *= PRIME64_1;
+            v4 += XXH_get64bits(p) * PRIME64_2;
+            p+=8;
+            v4 = XXH_rotl64(v4, 31);
+            v4 *= PRIME64_1;
+        }
+        while (p<=limit);
+
+        h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
+
+        v1 *= PRIME64_2;
+        v1 = XXH_rotl64(v1, 31);
+        v1 *= PRIME64_1;
+        h64 ^= v1;
+        h64 = h64 * PRIME64_1 + PRIME64_4;
+
+        v2 *= PRIME64_2;
+        v2 = XXH_rotl64(v2, 31);
+        v2 *= PRIME64_1;
+        h64 ^= v2;
+        h64 = h64 * PRIME64_1 + PRIME64_4;
+
+        v3 *= PRIME64_2;
+        v3 = XXH_rotl64(v3, 31);
+        v3 *= PRIME64_1;
+        h64 ^= v3;
+        h64 = h64 * PRIME64_1 + PRIME64_4;
+
+        v4 *= PRIME64_2;
+        v4 = XXH_rotl64(v4, 31);
+        v4 *= PRIME64_1;
+        h64 ^= v4;
+        h64 = h64 * PRIME64_1 + PRIME64_4;
+    }
+    else
+    {
+        h64  = seed + PRIME64_5;
+    }
+
+    h64 += (U64) len;
+
+    while (p+8<=bEnd)
+    {
+        U64 k1 = XXH_get64bits(p);
+        k1 *= PRIME64_2;
+        k1 = XXH_rotl64(k1,31);
+        k1 *= PRIME64_1;
+        h64 ^= k1;
+        h64 = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4;
+        p+=8;
+    }
+
+    if (p+4<=bEnd)
+    {
+        h64 ^= (U64)(XXH_get32bits(p)) * PRIME64_1;
+        h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3;
+        p+=4;
+    }
+
+    while (p<bEnd)
+    {
+        h64 ^= (*p) * PRIME64_5;
+        h64 = XXH_rotl64(h64, 11) * PRIME64_1;
+        p++;
+    }
+
+    h64 ^= h64 >> 33;
+    h64 *= PRIME64_2;
+    h64 ^= h64 >> 29;
+    h64 *= PRIME64_3;
+    h64 ^= h64 >> 32;
+
+    return h64;
+}
+
+
+unsigned long long XXH64 (const void* input, size_t len, unsigned long long seed)
 {
 #if 0
-	// Simple version, good for code maintenance, but unfortunately slow for small inputs
-	void* state = XXH32_init(seed);
-	XXH32_feed(state, input, len);
-	return XXH32_result(state);
+    /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
+    XXH64_state_t state;
+    XXH64_reset(&state, seed);
+    XXH64_update(&state, input, len);
+    return XXH64_digest(&state);
 #else
+    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+#  if !defined(XXH_USE_UNALIGNED_ACCESS)
+    if ((((size_t)input) & 7)==0)   /* Input is aligned, let's leverage the speed advantage */
+    {
+        if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+            return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned);
+        else
+            return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned);
+    }
+#  endif
+
+    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+        return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned);
+    else
+        return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned);
+#endif
+}
 
-	const unsigned char* p = (const unsigned char*)input;
-	const unsigned char* const bEnd = p + len;
-	unsigned int h32;
-
-	if (len>=16)
-	{
-		const unsigned char* const limit = bEnd - 16;
-		unsigned int v1 = seed + PRIME32_1 + PRIME32_2;
-		unsigned int v2 = seed + PRIME32_2;
-		unsigned int v3 = seed + 0;
-		unsigned int v4 = seed - PRIME32_1;
-
-		do
-		{
-			v1 += XXH_LE32(p) * PRIME32_2; v1 = XXH_rotl32(v1, 13); v1 *= PRIME32_1; p+=4;
-			v2 += XXH_LE32(p) * PRIME32_2; v2 = XXH_rotl32(v2, 13); v2 *= PRIME32_1; p+=4;
-			v3 += XXH_LE32(p) * PRIME32_2; v3 = XXH_rotl32(v3, 13); v3 *= PRIME32_1; p+=4;
-			v4 += XXH_LE32(p) * PRIME32_2; v4 = XXH_rotl32(v4, 13); v4 *= PRIME32_1; p+=4;
-		} while (p<=limit) ;
-
-		h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
-	}
-	else
-	{
-		h32  = seed + PRIME32_5;
-	}
-
-	h32 += (unsigned int) len;
-	
-	while (p<=bEnd-4)
-	{
-		h32 += XXH_LE32(p) * PRIME32_3;
-		h32 = XXH_rotl32(h32, 17) * PRIME32_4 ;
-		p+=4;
-	}
-
-	while (p<bEnd)
-	{
-		h32 += (*p) * PRIME32_5;
-		h32 = XXH_rotl32(h32, 11) * PRIME32_1 ;
-		p++;
-	}
-
-	h32 ^= h32 >> 15;
-	h32 *= PRIME32_2;
-	h32 ^= h32 >> 13;
-	h32 *= PRIME32_3;
-	h32 ^= h32 >> 16;
-
-	return h32;
+/****************************************************
+*  Advanced Hash Functions
+****************************************************/
 
+/*** Allocation ***/
+typedef struct
+{
+    U64 total_len;
+    U32 seed;
+    U32 v1;
+    U32 v2;
+    U32 v3;
+    U32 v4;
+    U32 mem32[4];   /* defined as U32 for alignment */
+    U32 memsize;
+} XXH_istate32_t;
+
+typedef struct
+{
+    U64 total_len;
+    U64 seed;
+    U64 v1;
+    U64 v2;
+    U64 v3;
+    U64 v4;
+    U64 mem64[4];   /* defined as U64 for alignment */
+    U32 memsize;
+} XXH_istate64_t;
+
+
+XXH32_state_t* XXH32_createState(void)
+{
+    XXH_STATIC_ASSERT(sizeof(XXH32_state_t) >= sizeof(XXH_istate32_t));   /* A compilation error here means XXH32_state_t is not large enough */
+    return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t));
+}
+XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr)
+{
+    XXH_free(statePtr);
+    return XXH_OK;
+}
+
+XXH64_state_t* XXH64_createState(void)
+{
+    XXH_STATIC_ASSERT(sizeof(XXH64_state_t) >= sizeof(XXH_istate64_t));   /* A compilation error here means XXH64_state_t is not large enough */
+    return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t));
+}
+XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr)
+{
+    XXH_free(statePtr);
+    return XXH_OK;
+}
+
+
+/*** Hash feed ***/
+
+XXH_errorcode XXH32_reset(XXH32_state_t* state_in, U32 seed)
+{
+    XXH_istate32_t* state = (XXH_istate32_t*) state_in;
+    state->seed = seed;
+    state->v1 = seed + PRIME32_1 + PRIME32_2;
+    state->v2 = seed + PRIME32_2;
+    state->v3 = seed + 0;
+    state->v4 = seed - PRIME32_1;
+    state->total_len = 0;
+    state->memsize = 0;
+    return XXH_OK;
+}
+
+XXH_errorcode XXH64_reset(XXH64_state_t* state_in, unsigned long long seed)
+{
+    XXH_istate64_t* state = (XXH_istate64_t*) state_in;
+    state->seed = seed;
+    state->v1 = seed + PRIME64_1 + PRIME64_2;
+    state->v2 = seed + PRIME64_2;
+    state->v3 = seed + 0;
+    state->v4 = seed - PRIME64_1;
+    state->total_len = 0;
+    state->memsize = 0;
+    return XXH_OK;
+}
+
+
+FORCE_INLINE XXH_errorcode XXH32_update_endian (XXH32_state_t* state_in, const void* input, size_t len, XXH_endianess endian)
+{
+    XXH_istate32_t* state = (XXH_istate32_t *) state_in;
+    const BYTE* p = (const BYTE*)input;
+    const BYTE* const bEnd = p + len;
+
+#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
+    if (input==NULL) return XXH_ERROR;
 #endif
+
+    state->total_len += len;
+
+    if (state->memsize + len < 16)   /* fill in tmp buffer */
+    {
+        XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, len);
+        state->memsize += (U32)len;
+        return XXH_OK;
+    }
+
+    if (state->memsize)   /* some data left from previous update */
+    {
+        XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, 16-state->memsize);
+        {
+            const U32* p32 = state->mem32;
+            state->v1 += XXH_readLE32(p32, endian) * PRIME32_2;
+            state->v1 = XXH_rotl32(state->v1, 13);
+            state->v1 *= PRIME32_1;
+            p32++;
+            state->v2 += XXH_readLE32(p32, endian) * PRIME32_2;
+            state->v2 = XXH_rotl32(state->v2, 13);
+            state->v2 *= PRIME32_1;
+            p32++;
+            state->v3 += XXH_readLE32(p32, endian) * PRIME32_2;
+            state->v3 = XXH_rotl32(state->v3, 13);
+            state->v3 *= PRIME32_1;
+            p32++;
+            state->v4 += XXH_readLE32(p32, endian) * PRIME32_2;
+            state->v4 = XXH_rotl32(state->v4, 13);
+            state->v4 *= PRIME32_1;
+            p32++;
+        }
+        p += 16-state->memsize;
+        state->memsize = 0;
+    }
+
+    if (p <= bEnd-16)
+    {
+        const BYTE* const limit = bEnd - 16;
+        U32 v1 = state->v1;
+        U32 v2 = state->v2;
+        U32 v3 = state->v3;
+        U32 v4 = state->v4;
+
+        do
+        {
+            v1 += XXH_readLE32(p, endian) * PRIME32_2;
+            v1 = XXH_rotl32(v1, 13);
+            v1 *= PRIME32_1;
+            p+=4;
+            v2 += XXH_readLE32(p, endian) * PRIME32_2;
+            v2 = XXH_rotl32(v2, 13);
+            v2 *= PRIME32_1;
+            p+=4;
+            v3 += XXH_readLE32(p, endian) * PRIME32_2;
+            v3 = XXH_rotl32(v3, 13);
+            v3 *= PRIME32_1;
+            p+=4;
+            v4 += XXH_readLE32(p, endian) * PRIME32_2;
+            v4 = XXH_rotl32(v4, 13);
+            v4 *= PRIME32_1;
+            p+=4;
+        }
+        while (p<=limit);
+
+        state->v1 = v1;
+        state->v2 = v2;
+        state->v3 = v3;
+        state->v4 = v4;
+    }
+
+    if (p < bEnd)
+    {
+        XXH_memcpy(state->mem32, p, bEnd-p);
+        state->memsize = (int)(bEnd-p);
+    }
+
+    return XXH_OK;
 }
 
+XXH_errorcode XXH32_update (XXH32_state_t* state_in, const void* input, size_t len)
+{
+    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+        return XXH32_update_endian(state_in, input, len, XXH_littleEndian);
+    else
+        return XXH32_update_endian(state_in, input, len, XXH_bigEndian);
+}
 
-//****************************
-// Advanced Hash Functions
-//****************************
 
-struct XXH_state32_t
+
+FORCE_INLINE U32 XXH32_digest_endian (const XXH32_state_t* state_in, XXH_endianess endian)
 {
-	unsigned int seed;
-	unsigned int v1;
-	unsigned int v2;
-	unsigned int v3;
-	unsigned int v4;
-	unsigned long long total_len;
-	char memory[16];
-	int memsize;
-};
-
-
-void* XXH32_init (unsigned int seed)
+    const XXH_istate32_t* state = (const XXH_istate32_t*) state_in;
+    const BYTE * p = (const BYTE*)state->mem32;
+    const BYTE* bEnd = (const BYTE*)(state->mem32) + state->memsize;
+    U32 h32;
+
+    if (state->total_len >= 16)
+    {
+        h32 = XXH_rotl32(state->v1, 1) + XXH_rotl32(state->v2, 7) + XXH_rotl32(state->v3, 12) + XXH_rotl32(state->v4, 18);
+    }
+    else
+    {
+        h32  = state->seed + PRIME32_5;
+    }
+
+    h32 += (U32) state->total_len;
+
+    while (p+4<=bEnd)
+    {
+        h32 += XXH_readLE32(p, endian) * PRIME32_3;
+        h32  = XXH_rotl32(h32, 17) * PRIME32_4;
+        p+=4;
+    }
+
+    while (p<bEnd)
+    {
+        h32 += (*p) * PRIME32_5;
+        h32 = XXH_rotl32(h32, 11) * PRIME32_1;
+        p++;
+    }
+
+    h32 ^= h32 >> 15;
+    h32 *= PRIME32_2;
+    h32 ^= h32 >> 13;
+    h32 *= PRIME32_3;
+    h32 ^= h32 >> 16;
+
+    return h32;
+}
+
+
+U32 XXH32_digest (const XXH32_state_t* state_in)
 {
-	struct XXH_state32_t * state = (struct XXH_state32_t *) malloc ( sizeof(struct XXH_state32_t));
-	state->seed = seed;
-	state->v1 = seed + PRIME32_1 + PRIME32_2;
-	state->v2 = seed + PRIME32_2;
-	state->v3 = seed + 0;
-	state->v4 = seed - PRIME32_1;
-	state->total_len = 0;
-	state->memsize = 0;
-
-	return (void*)state;
+    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+        return XXH32_digest_endian(state_in, XXH_littleEndian);
+    else
+        return XXH32_digest_endian(state_in, XXH_bigEndian);
 }
 
 
-int   XXH32_feed (void* state_in, const void* input, int len)
+FORCE_INLINE XXH_errorcode XXH64_update_endian (XXH64_state_t* state_in, const void* input, size_t len, XXH_endianess endian)
+{
+    XXH_istate64_t * state = (XXH_istate64_t *) state_in;
+    const BYTE* p = (const BYTE*)input;
+    const BYTE* const bEnd = p + len;
+
+#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
+    if (input==NULL) return XXH_ERROR;
+#endif
+
+    state->total_len += len;
+
+    if (state->memsize + len < 32)   /* fill in tmp buffer */
+    {
+        XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, len);
+        state->memsize += (U32)len;
+        return XXH_OK;
+    }
+
+    if (state->memsize)   /* some data left from previous update */
+    {
+        XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, 32-state->memsize);
+        {
+            const U64* p64 = state->mem64;
+            state->v1 += XXH_readLE64(p64, endian) * PRIME64_2;
+            state->v1 = XXH_rotl64(state->v1, 31);
+            state->v1 *= PRIME64_1;
+            p64++;
+            state->v2 += XXH_readLE64(p64, endian) * PRIME64_2;
+            state->v2 = XXH_rotl64(state->v2, 31);
+            state->v2 *= PRIME64_1;
+            p64++;
+            state->v3 += XXH_readLE64(p64, endian) * PRIME64_2;
+            state->v3 = XXH_rotl64(state->v3, 31);
+            state->v3 *= PRIME64_1;
+            p64++;
+            state->v4 += XXH_readLE64(p64, endian) * PRIME64_2;
+            state->v4 = XXH_rotl64(state->v4, 31);
+            state->v4 *= PRIME64_1;
+            p64++;
+        }
+        p += 32-state->memsize;
+        state->memsize = 0;
+    }
+
+    if (p+32 <= bEnd)
+    {
+        const BYTE* const limit = bEnd - 32;
+        U64 v1 = state->v1;
+        U64 v2 = state->v2;
+        U64 v3 = state->v3;
+        U64 v4 = state->v4;
+
+        do
+        {
+            v1 += XXH_readLE64(p, endian) * PRIME64_2;
+            v1 = XXH_rotl64(v1, 31);
+            v1 *= PRIME64_1;
+            p+=8;
+            v2 += XXH_readLE64(p, endian) * PRIME64_2;
+            v2 = XXH_rotl64(v2, 31);
+            v2 *= PRIME64_1;
+            p+=8;
+            v3 += XXH_readLE64(p, endian) * PRIME64_2;
+            v3 = XXH_rotl64(v3, 31);
+            v3 *= PRIME64_1;
+            p+=8;
+            v4 += XXH_readLE64(p, endian) * PRIME64_2;
+            v4 = XXH_rotl64(v4, 31);
+            v4 *= PRIME64_1;
+            p+=8;
+        }
+        while (p<=limit);
+
+        state->v1 = v1;
+        state->v2 = v2;
+        state->v3 = v3;
+        state->v4 = v4;
+    }
+
+    if (p < bEnd)
+    {
+        XXH_memcpy(state->mem64, p, bEnd-p);
+        state->memsize = (int)(bEnd-p);
+    }
+
+    return XXH_OK;
+}
+
+XXH_errorcode XXH64_update (XXH64_state_t* state_in, const void* input, size_t len)
 {
-	struct XXH_state32_t * state = state_in;
-	const unsigned char* p = (const unsigned char*)input;
-	const unsigned char* const bEnd = p + len;
-
-	state->total_len += len;
-	
-	if (state->memsize + len < 16)   // fill in tmp buffer
-	{
-		memcpy(state->memory + state->memsize, input, len);
-		state->memsize +=  len;
-		return 0;
-	}
-
-	if (state->memsize)   // some data left from previous feed
-	{
-		memcpy(state->memory + state->memsize, input, 16-state->memsize);
-		{
-			const unsigned int* p32 = (const unsigned int*)state->memory;
-			state->v1 += XXH_LE32(p32) * PRIME32_2; state->v1 = XXH_rotl32(state->v1, 13); state->v1 *= PRIME32_1; p32++;
-			state->v2 += XXH_LE32(p32) * PRIME32_2; state->v2 = XXH_rotl32(state->v2, 13); state->v2 *= PRIME32_1; p32++; 
-			state->v3 += XXH_LE32(p32) * PRIME32_2; state->v3 = XXH_rotl32(state->v3, 13); state->v3 *= PRIME32_1; p32++;
-			state->v4 += XXH_LE32(p32) * PRIME32_2; state->v4 = XXH_rotl32(state->v4, 13); state->v4 *= PRIME32_1; p32++;
-		}
-		p += 16-state->memsize;
-		state->memsize = 0;
-	}
-
-	{
-		const unsigned char* const limit = bEnd - 16;
-		unsigned int v1 = state->v1;
-		unsigned int v2 = state->v2;
-		unsigned int v3 = state->v3;
-		unsigned int v4 = state->v4;
-
-		while (p<=limit)
-		{
-			v1 += XXH_LE32(p) * PRIME32_2; v1 = XXH_rotl32(v1, 13); v1 *= PRIME32_1; p+=4;
-			v2 += XXH_LE32(p) * PRIME32_2; v2 = XXH_rotl32(v2, 13); v2 *= PRIME32_1; p+=4;
-			v3 += XXH_LE32(p) * PRIME32_2; v3 = XXH_rotl32(v3, 13); v3 *= PRIME32_1; p+=4;
-			v4 += XXH_LE32(p) * PRIME32_2; v4 = XXH_rotl32(v4, 13); v4 *= PRIME32_1; p+=4;
-		}  
-
-		state->v1 = v1;
-		state->v2 = v2;
-		state->v3 = v3;
-		state->v4 = v4;
-	}
-
-	if (p < bEnd)
-	{
-		memcpy(state->memory, p, bEnd-p);
-		state->memsize = bEnd-p;
-	}
-
-	return 0;
+    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+        return XXH64_update_endian(state_in, input, len, XXH_littleEndian);
+    else
+        return XXH64_update_endian(state_in, input, len, XXH_bigEndian);
 }
 
 
-unsigned int XXH32_getIntermediateResult (void* state_in)
+
+FORCE_INLINE U64 XXH64_digest_endian (const XXH64_state_t* state_in, XXH_endianess endian)
 {
-	struct XXH_state32_t * state = state_in;
-	unsigned char * p   = (unsigned char*)state->memory;
-	unsigned char* bEnd = (unsigned char*)state->memory + state->memsize;
-	unsigned int h32;
-
-
-	if (state->total_len >= 16)
-	{
-		h32 = XXH_rotl32(state->v1, 1) + XXH_rotl32(state->v2, 7) + XXH_rotl32(state->v3, 12) + XXH_rotl32(state->v4, 18);
-	}
-	else
-	{
-		h32  = state->seed + PRIME32_5;
-	}
-
-	h32 += (unsigned int) state->total_len;
-	
-	while (p<=bEnd-4)
-	{
-		h32 += XXH_LE32(p) * PRIME32_3;
-		h32 = XXH_rotl32(h32, 17) * PRIME32_4 ;
-		p+=4;
-	}
-
-	while (p<bEnd)
-	{
-		h32 += (*p) * PRIME32_5;
-		h32 = XXH_rotl32(h32, 11) * PRIME32_1 ;
-		p++;
-	}
-
-	h32 ^= h32 >> 15;
-	h32 *= PRIME32_2;
-	h32 ^= h32 >> 13;
-	h32 *= PRIME32_3;
-	h32 ^= h32 >> 16;
-
-	return h32;
+    const XXH_istate64_t * state = (const XXH_istate64_t *) state_in;
+    const BYTE * p = (const BYTE*)state->mem64;
+    const BYTE* bEnd = (const BYTE*)state->mem64 + state->memsize;
+    U64 h64;
+
+    if (state->total_len >= 32)
+    {
+        U64 v1 = state->v1;
+        U64 v2 = state->v2;
+        U64 v3 = state->v3;
+        U64 v4 = state->v4;
+
+        h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
+
+        v1 *= PRIME64_2;
+        v1 = XXH_rotl64(v1, 31);
+        v1 *= PRIME64_1;
+        h64 ^= v1;
+        h64 = h64*PRIME64_1 + PRIME64_4;
+
+        v2 *= PRIME64_2;
+        v2 = XXH_rotl64(v2, 31);
+        v2 *= PRIME64_1;
+        h64 ^= v2;
+        h64 = h64*PRIME64_1 + PRIME64_4;
+
+        v3 *= PRIME64_2;
+        v3 = XXH_rotl64(v3, 31);
+        v3 *= PRIME64_1;
+        h64 ^= v3;
+        h64 = h64*PRIME64_1 + PRIME64_4;
+
+        v4 *= PRIME64_2;
+        v4 = XXH_rotl64(v4, 31);
+        v4 *= PRIME64_1;
+        h64 ^= v4;
+        h64 = h64*PRIME64_1 + PRIME64_4;
+    }
+    else
+    {
+        h64  = state->seed + PRIME64_5;
+    }
+
+    h64 += (U64) state->total_len;
+
+    while (p+8<=bEnd)
+    {
+        U64 k1 = XXH_readLE64(p, endian);
+        k1 *= PRIME64_2;
+        k1 = XXH_rotl64(k1,31);
+        k1 *= PRIME64_1;
+        h64 ^= k1;
+        h64 = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4;
+        p+=8;
+    }
+
+    if (p+4<=bEnd)
+    {
+        h64 ^= (U64)(XXH_readLE32(p, endian)) * PRIME64_1;
+        h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3;
+        p+=4;
+    }
+
+    while (p<bEnd)
+    {
+        h64 ^= (*p) * PRIME64_5;
+        h64 = XXH_rotl64(h64, 11) * PRIME64_1;
+        p++;
+    }
+
+    h64 ^= h64 >> 33;
+    h64 *= PRIME64_2;
+    h64 ^= h64 >> 29;
+    h64 *= PRIME64_3;
+    h64 ^= h64 >> 32;
+
+    return h64;
 }
 
 
-unsigned int XXH32_result (void* state_in)
+unsigned long long XXH64_digest (const XXH64_state_t* state_in)
 {
-    unsigned int h32 = XXH32_getIntermediateResult(state_in);
+    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+        return XXH64_digest_endian(state_in, XXH_littleEndian);
+    else
+        return XXH64_digest_endian(state_in, XXH_bigEndian);
+}
 
-	free(state_in);
 
-	return h32;
-}
\ No newline at end of file
diff --git a/xxhash.h b/xxhash.h
index a11e303..c60aa61 100644
--- a/xxhash.h
+++ b/xxhash.h
@@ -1,20 +1,21 @@
 /*
-   xxHash - Fast Hash algorithm
+   xxHash - Extremely Fast Hash algorithm
    Header File
-   Copyright (C) 2012, Yann Collet.
+   Copyright (C) 2012-2015, Yann Collet.
+
    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
-  
+
        * Redistributions of source code must retain the above copyright
    notice, this list of conditions and the following disclaimer.
        * Redistributions in binary form must reproduce the above
    copyright notice, this list of conditions and the following disclaimer
    in the documentation and/or other materials provided with the
    distribution.
-  
+
    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -27,8 +28,8 @@
    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-	You can contact the author at :
-	- xxHash source repository : http://code.google.com/p/xxhash/
+   You can contact the author at :
+   - xxHash source repository : https://github.com/Cyan4973/xxHash
 */
 
 /* Notice extracted from xxHash homepage :
@@ -52,9 +53,15 @@ CRC32           0.43 GB/s     9
 MD5-32          0.33 GB/s    10       Ronald L. Rivest
 SHA1-32         0.28 GB/s    10
 
-Q.Score is a measure of quality of the hash function. 
-It depends on successfully passing SMHasher test set. 
+Q.Score is a measure of quality of the hash function.
+It depends on successfully passing SMHasher test set.
 10 is a perfect score.
+
+A 64-bits version, named XXH64, is available since r35.
+It offers much better speed, but for 64-bits applications only.
+Name     Speed on 64 bits    Speed on 32 bits
+XXH64       13.8 GB/s            1.9 GB/s
+XXH32        6.8 GB/s            6.0 GB/s
 */
 
 #pragma once
@@ -64,65 +71,122 @@ extern "C" {
 #endif
 
 
-//****************************
-// Simple Hash Functions
-//****************************
+/*****************************
+*  Definitions
+*****************************/
+#include <stddef.h>   /* size_t */
+typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;
+
+
+/*****************************
+*  Namespace Emulation
+*****************************/
+/* Motivations :
+
+If you need to include xxHash into your library,
+but wish to avoid xxHash symbols to be present on your library interface
+in an effort to avoid potential name collision if another library also includes xxHash,
+
+you can use XXH_NAMESPACE, which will automatically prefix any symbol from xxHash
+with the value of XXH_NAMESPACE (so avoid to keep it NULL, and avoid numeric values).
+
+Note that no change is required within the calling program :
+it can still call xxHash functions using their regular name.
+They will be automatically translated by this header.
+*/
+#ifdef XXH_NAMESPACE
+#  define XXH_CAT(A,B) A##B
+#  define XXH_NAME2(A,B) XXH_CAT(A,B)
+#  define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32)
+#  define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64)
+#  define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState)
+#  define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState)
+#  define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState)
+#  define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState)
+#  define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset)
+#  define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset)
+#  define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update)
+#  define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update)
+#  define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest)
+#  define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest)
+#endif
+
+
+/*****************************
+*  Simple Hash Functions
+*****************************/
 
-unsigned int XXH32 (const void* input, int len, unsigned int seed);
+unsigned int       XXH32 (const void* input, size_t length, unsigned seed);
+unsigned long long XXH64 (const void* input, size_t length, unsigned long long seed);
 
 /*
 XXH32() :
-	Calculate the 32-bits hash of "input", of length "len"
-	"seed" can be used to alter the result
-	This function successfully passes all SMHasher tests.
-	Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s
-	Note that "len" is type "int", which means it is limited to 2^31-1.
-	If your data is larger, use the advanced functions below.
+    Calculate the 32-bits hash of sequence "length" bytes stored at memory address "input".
+    The memory between input & input+length must be valid (allocated and read-accessible).
+    "seed" can be used to alter the result predictably.
+    This function successfully passes all SMHasher tests.
+    Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s
+XXH64() :
+    Calculate the 64-bits hash of sequence of length "len" stored at memory address "input".
+    Faster on 64-bits systems. Slower on 32-bits systems.
 */
 
 
 
-//****************************
-// Advanced Hash Functions
-//****************************
-
-void*        XXH32_init   (unsigned int seed);
-int          XXH32_feed   (void* state, const void* input, int len);
-unsigned int XXH32_result (void* state);
+/*****************************
+*  Advanced Hash Functions
+*****************************/
+typedef struct { long long ll[ 6]; } XXH32_state_t;
+typedef struct { long long ll[11]; } XXH64_state_t;
 
 /*
-These functions calculate the xxhash of an input provided in several small packets,
-as opposed to an input provided as a single block.
+These structures allow static allocation of XXH states.
+States must then be initialized using XXHnn_reset() before first use.
 
-You must start with :
-void* XXH32_init()
-The function returns a pointer which holds the state of calculation.
+If you prefer dynamic allocation, please refer to functions below.
+*/
 
-This pointer must be provided as "void* state" parameter for XXH32_feed().
-XXH32_feed() can be called as many times as necessary.
-The function returns an error code, with 0 meaning OK, and all other values meaning there is an error.
-Note that "len" is type "int", which means it is limited to 2^31-1. 
-If your data is larger, it is recommended
-to chunk your data into blocks of size 2^30 (1GB) to avoid any "int" overflow issue.
+XXH32_state_t* XXH32_createState(void);
+XXH_errorcode  XXH32_freeState(XXH32_state_t* statePtr);
 
-Finally, you can end the calculation anytime, by using XXH32_result().
-This function returns the final 32-bits hash.
-You must provide the same "void* state" parameter created by XXH32_init().
+XXH64_state_t* XXH64_createState(void);
+XXH_errorcode  XXH64_freeState(XXH64_state_t* statePtr);
 
-Memory will be freed by XXH32_result().
+/*
+These functions create and release memory for XXH state.
+States must then be initialized using XXHnn_reset() before first use.
 */
 
 
-unsigned int XXH32_getIntermediateResult (void* state);
+XXH_errorcode XXH32_reset  (XXH32_state_t* statePtr, unsigned seed);
+XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length);
+unsigned int  XXH32_digest (const XXH32_state_t* statePtr);
+
+XXH_errorcode      XXH64_reset  (XXH64_state_t* statePtr, unsigned long long seed);
+XXH_errorcode      XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length);
+unsigned long long XXH64_digest (const XXH64_state_t* statePtr);
+
 /*
-This function does the same as XXH32_result(), generating a 32-bit hash,
-but preserve memory context.
-This way, it becomes possible to generate intermediate hashes, and then continue feeding data with XXH32_feed().
-To free memory context, use XXH32_result().
-*/
+These functions calculate the xxHash of an input provided in multiple smaller packets,
+as opposed to an input provided as a single block.
 
+XXH state space must first be allocated, using either static or dynamic method provided above.
+
+Start a new hash by initializing state with a seed, using XXHnn_reset().
+
+Then, feed the hash state by calling XXHnn_update() as many times as necessary.
+Obviously, input must be valid, meaning allocated and read accessible.
+The function returns an error code, with 0 meaning OK, and any other value meaning there is an error.
+
+Finally, you can produce a hash anytime, by using XXHnn_digest().
+This function returns the final nn-bits hash.
+You can nonetheless continue feeding the hash state with more input,
+and therefore get some new hashes, by calling again XXHnn_digest().
+
+When you are done, don't forget to free XXH state space, using typically XXHnn_freeState().
+*/
 
 
 #if defined (__cplusplus)
 }
-#endif
\ No newline at end of file
+#endif

From 2fe5a368b652edae1253177c641407479553b148 Mon Sep 17 00:00:00 2001
From: Craig R <megasaxon@gmail.com>
Date: Mon, 6 Jul 2015 11:49:20 +1000
Subject: [PATCH 03/21] Add xxhash64 support

---
 php_xxhash.c | 24 +++++++++++++++++++++++-
 php_xxhash.h |  3 ++-
 2 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/php_xxhash.c b/php_xxhash.c
index fc73bf7..0a8ae76 100644
--- a/php_xxhash.c
+++ b/php_xxhash.c
@@ -38,7 +38,7 @@ PHP_MINFO_FUNCTION(xxhash)
 	php_info_print_table_start();
 	php_info_print_table_header(2, "xxhash support", "enabled");
 	php_info_print_table_row(2, "extension version", PHP_XXHASH_VERSION);
-	php_info_print_table_row(2, "xxhash release", "http://code.google.com/p/xxhash/source/detail?r=6");
+	php_info_print_table_row(2, "xxhash release", "r40");
 	php_info_print_table_end();
 }
 
@@ -62,8 +62,30 @@ PHP_FUNCTION(xxhash32)
 	RETURN_LONG((long)sum);
 }
 
+PHP_FUNCTION(xxhash64)
+{
+	char *arg1 = NULL;
+	char *ret1 = NULL;
+	int arg1_len;
+	unsigned long long sum;
+
+	/* parse the parameters */
+	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s", &arg1, &arg1_len) == FAILURE || arg1_len < 1)
+	{
+		RETURN_NULL();
+	}
+
+	/* compute the checksum */
+	sum = XXH64(arg1, arg1_len, 0);
+
+	/* return the checksum */
+	RETURN_LONG((long long)sum);
+}
+
+
 zend_function_entry xxhash_functions[] = {
 	PHP_FE(xxhash32, NULL)
+	PHP_FE(xxhash64, NULL)
 	{NULL, NULL, NULL}
 };
 
diff --git a/php_xxhash.h b/php_xxhash.h
index 00b2266..b28946f 100644
--- a/php_xxhash.h
+++ b/php_xxhash.h
@@ -1,7 +1,7 @@
 #ifndef PHP_XXHASH_H
 #define PHP_XXHASH_H
 
-#define PHP_XXHASH_VERSION "1.0.1"
+#define PHP_XXHASH_VERSION "1.1.0"
 
 extern zend_module_entry xxhash_module_entry;
 #define phpext_xxhash_ptr &xxhash_module_entry
@@ -23,6 +23,7 @@ PHP_RSHUTDOWN_FUNCTION(xxhash);
 PHP_MINFO_FUNCTION(xxhash);
 
 PHP_FUNCTION(xxhash32);
+PHP_FUNCTION(xxhash64);
 
 #endif /* PHP_XXHASH_H */
 

From e8b8c48b08df25fa01242270003ad08d6dfe396a Mon Sep 17 00:00:00 2001
From: Craig R <megasaxon@gmail.com>
Date: Mon, 6 Jul 2015 12:51:06 +1000
Subject: [PATCH 04/21] Update project readme to be more useful

---
 README.md | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 66314ca..6ce0ef9 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,12 @@
-# php-xxhash
+PHP-xxHash
+==========
 
-PHP extension to add support for the [xxhash](http://code.google.com/p/xxhash/) fast hashing algorithm.  _xxhash_ is designed to be fast enough to use in real-time streaming applications.
+A PHP extension to add support for the [xxHash](https://github.com/Cyan4973/xxHash) fast hashing algorithm.
 
-## How To Install
+[xxHash](https://github.com/Cyan4973/xxHash) is an Extremely fast Hash algorithm, running at RAM speed limits. 
+It successfully completes the [SMHasher](http://code.google.com/p/smhasher/wiki/SMHasher) test suite which evaluates collision, dispersion and randomness qualities of hash functions.
+
+## Installation Instructions
 
 ```
    phpize
@@ -10,17 +14,24 @@ PHP extension to add support for the [xxhash](http://code.google.com/p/xxhash/)
    make
    sudo make install
 ```
+Don't forget to load the extension in via php.ini or the like.
 
-## How To Use
+## Usage Instructions
 
-This extension adds one new PHP function:
+Upon installation and enabling the extension within php.ini the following two new functions will be available to you:
 
 ```
-    int xxhash32(string $data);
+int xxhash32(string $data);
+int xxhash64(string $data);
 ```
 
-It will checksum the string, and return the checksum.
+In both cases an integer will be returned, representing the hash of the $data input.
+
+
+## Credits
+* Original implementation of [php-xxhash](https://github.com/stuartherbert/php-xxhash) by Stuart Herbert.
+
 
-## License
+## Licence
 
 BSD 2-clause license.
\ No newline at end of file

From 8a9ef42799f6defc2d53d03e69118516337e1e03 Mon Sep 17 00:00:00 2001
From: Craig R <megasaxon@gmail.com>
Date: Mon, 6 Jul 2015 12:54:18 +1000
Subject: [PATCH 05/21] Add xxHash to credits

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 6ce0ef9..7c05a6d 100644
--- a/README.md
+++ b/README.md
@@ -30,7 +30,7 @@ In both cases an integer will be returned, representing the hash of the $data in
 
 ## Credits
 * Original implementation of [php-xxhash](https://github.com/stuartherbert/php-xxhash) by Stuart Herbert.
-
+* [xxHash](https://github.com/Cyan4973/xxHash) by Yann Collet.
 
 ## Licence
 

From dc75984e2f7bfe1ce6afbae25aff1b3270fa3ce1 Mon Sep 17 00:00:00 2001
From: Craig R <megasaxon@gmail.com>
Date: Mon, 6 Jul 2015 14:11:45 +1000
Subject: [PATCH 06/21] Have the two xxHash functions return a digest rather
 than an integer.

---
 README.md    |  6 +++---
 php_xxhash.c | 15 +++++++++++----
 2 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 7c05a6d..77afa7d 100644
--- a/README.md
+++ b/README.md
@@ -21,11 +21,11 @@ Don't forget to load the extension in via php.ini or the like.
 Upon installation and enabling the extension within php.ini the following two new functions will be available to you:
 
 ```
-int xxhash32(string $data);
-int xxhash64(string $data);
+string xxhash32(string $data);
+string xxhash64(string $data);
 ```
 
-In both cases an integer will be returned, representing the hash of the $data input.
+In both cases a string will be returned, representing the digest (hash) of the $data input.
 
 
 ## Credits
diff --git a/php_xxhash.c b/php_xxhash.c
index 0a8ae76..b642c8d 100644
--- a/php_xxhash.c
+++ b/php_xxhash.c
@@ -45,7 +45,7 @@ PHP_MINFO_FUNCTION(xxhash)
 PHP_FUNCTION(xxhash32)
 {
 	char *arg1 = NULL;
-	char *ret1 = NULL;
+	char ret1[128];
 	int arg1_len;
 	unsigned int sum;
 
@@ -58,16 +58,20 @@ PHP_FUNCTION(xxhash32)
 	/* compute the checksum */
 	sum = XXH32(arg1, arg1_len, 0);
 
+	/* convert to hash */
+	sprintf(ret1, "%08x", sum);
+
 	/* return the checksum */
-	RETURN_LONG((long)sum);
+	RETURN_STRING(ret1, 1);
 }
 
 PHP_FUNCTION(xxhash64)
 {
 	char *arg1 = NULL;
-	char *ret1 = NULL;
+	char ret1[128];
 	int arg1_len;
 	unsigned long long sum;
+	typedef unsigned int U32;
 
 	/* parse the parameters */
 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s", &arg1, &arg1_len) == FAILURE || arg1_len < 1)
@@ -78,8 +82,11 @@ PHP_FUNCTION(xxhash64)
 	/* compute the checksum */
 	sum = XXH64(arg1, arg1_len, 0);
 
+	/* convert to hash */
+	sprintf(ret1, "%08x%08x", (U32)(sum >> 32), (U32)sum);
+
 	/* return the checksum */
-	RETURN_LONG((long long)sum);
+	RETURN_STRING(ret1, 1);
 }
 
 

From ea9b9e28020d1d22bf2b4e75db3e74a772982337 Mon Sep 17 00:00:00 2001
From: Craig R <megasaxon@gmail.com>
Date: Fri, 10 Jul 2015 15:27:47 +1000
Subject: [PATCH 07/21] Working PHP7 version

---
 php_xxhash.c | 93 ++++++++++++++++++++--------------------------------
 php_xxhash.h | 30 +++++++++--------
 2 files changed, 51 insertions(+), 72 deletions(-)

diff --git a/php_xxhash.c b/php_xxhash.c
index b642c8d..6d9c5fa 100644
--- a/php_xxhash.c
+++ b/php_xxhash.c
@@ -10,29 +10,12 @@
 #include "xxhash.c"
 
 #ifdef COMPILE_DL_XXHASH
-ZEND_GET_MODULE(xxhash)
+	ZEND_GET_MODULE(xxhash);
+	#ifdef ZTS
+		ZEND_TSRMLS_CACHE_DEFINE();
+	#endif
 #endif
 
-PHP_MINIT_FUNCTION(xxhash)
-{
-	return SUCCESS;
-}
-
-PHP_MSHUTDOWN_FUNCTION(xxhash)
-{
-	return SUCCESS;
-}
-
-PHP_RINIT_FUNCTION(xxhash)
-{
-	return SUCCESS;
-}
-
-PHP_RSHUTDOWN_FUNCTION(xxhash)
-{
-	return SUCCESS;
-}
-
 PHP_MINFO_FUNCTION(xxhash)
 {
 	php_info_print_table_start();
@@ -44,68 +27,62 @@ PHP_MINFO_FUNCTION(xxhash)
 
 PHP_FUNCTION(xxhash32)
 {
-	char *arg1 = NULL;
-	char ret1[128];
-	int arg1_len;
+    char *arg = NULL;
+    size_t arg_len, len;
+    zend_string *strg;
 	unsigned int sum;
 
-	/* parse the parameters */
-	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s", &arg1, &arg1_len) == FAILURE || arg1_len < 1)
-	{
-		RETURN_NULL();
-	}
+    if (zend_parse_parameters(ZEND_NUM_ARGS(), "s", &arg, &arg_len) == FAILURE || arg_len < 1) {
+        return;
+    }
 
-	/* compute the checksum */
-	sum = XXH32(arg1, arg1_len, 0);
+	// compute the checksum
+	sum = XXH32(arg, arg_len, 0);
 
-	/* convert to hash */
-	sprintf(ret1, "%08x", sum);
+	//convert to a hex string
+	strg = strpprintf(0, "%08x", sum);
 
-	/* return the checksum */
-	RETURN_STRING(ret1, 1);
+	// return the checksum
+	RETURN_STR(strg);
 }
 
 PHP_FUNCTION(xxhash64)
 {
-	char *arg1 = NULL;
-	char ret1[128];
-	int arg1_len;
+	char *arg = NULL;
+    size_t arg_len, len;
+    zend_string *strg;
 	unsigned long long sum;
-	typedef unsigned int U32;
+	//typedef unsigned int U32;
 
-	/* parse the parameters */
-	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s", &arg1, &arg1_len) == FAILURE || arg1_len < 1)
-	{
-		RETURN_NULL();
-	}
+    if (zend_parse_parameters(ZEND_NUM_ARGS(), "s", &arg, &arg_len) == FAILURE || arg_len < 1) {
+        return;
+    }
 
-	/* compute the checksum */
-	sum = XXH64(arg1, arg1_len, 0);
+	// compute the checksum
+	sum = XXH64(arg, arg_len, 0);
 
-	/* convert to hash */
-	sprintf(ret1, "%08x%08x", (U32)(sum >> 32), (U32)sum);
+	//convert to a hex string
+	strg = strpprintf(0, "%08x%08x", (U32)(sum >> 32), (U32)sum);
 
-	/* return the checksum */
-	RETURN_STRING(ret1, 1);
+	// return the checksum
+	RETURN_STR(strg);
 }
 
-
-zend_function_entry xxhash_functions[] = {
-	PHP_FE(xxhash32, NULL)
-	PHP_FE(xxhash64, NULL)
-	{NULL, NULL, NULL}
+const zend_function_entry xxhash_functions[] = {
+	ZEND_FE(xxhash32, NULL)
+	ZEND_FE(xxhash64, NULL)
+	PHP_FE_END
 };
 
 zend_module_entry xxhash_module_entry = {
 	STANDARD_MODULE_HEADER,
 	"xxhash",
 	xxhash_functions,
-	PHP_MINIT(xxhash),
-	PHP_MSHUTDOWN(xxhash),
+	NULL,
+	NULL,
 	NULL,
 	NULL,
 	PHP_MINFO(xxhash),
 	PHP_XXHASH_VERSION,
 	STANDARD_MODULE_PROPERTIES
 };
-
diff --git a/php_xxhash.h b/php_xxhash.h
index b28946f..91c95a2 100644
--- a/php_xxhash.h
+++ b/php_xxhash.h
@@ -1,29 +1,31 @@
 #ifndef PHP_XXHASH_H
 #define PHP_XXHASH_H
 
-#define PHP_XXHASH_VERSION "1.1.0"
-
 extern zend_module_entry xxhash_module_entry;
 #define phpext_xxhash_ptr &xxhash_module_entry
 
-#if defined(PHP_WIN32) && defined(XXHASH_EXPORTS)
-#define PHP_XXHASH_API __declspec(dllexport)
+#define PHP_XXHASH_VERSION "2.0"
+
+#ifdef PHP_WIN32
+#   define PHP_XXHASH_API __declspec(dllexport)
+#elif defined(__GNUC__) && __GNUC__ >= 4
+#   define PHP_XXHASH_API __attribute__ ((visibility("default")))
 #else
-#define PHP_XXHASH_API PHPAPI
+#   define PHP_XXHASH_API
 #endif
 
 #ifdef ZTS
 #include "TSRM.h"
 #endif
 
-PHP_MINIT_FUNCTION(xxhash);
-PHP_MSHUTDOWN_FUNCTION(xxhash);
-PHP_RINIT_FUNCTION(xxhash);
-PHP_RSHUTDOWN_FUNCTION(xxhash);
-PHP_MINFO_FUNCTION(xxhash);
 
-PHP_FUNCTION(xxhash32);
-PHP_FUNCTION(xxhash64);
-
-#endif /* PHP_XXHASH_H */
+#ifdef ZTS
+#define XXHASH_G(v) ZEND_TSRMG(xxhash_globals_id, zend_xxhash_globals *, v)
+#ifdef COMPILE_DL_XXHASH
+ZEND_TSRMLS_CACHE_EXTERN();
+#endif
+#else
+#define XXHASH_G(v) (xxhash_globals.v)
+#endif
 
+#endif  /* PHP_XXHASH_H */
\ No newline at end of file

From faf30477ed24a69f210305e0a9f318f75c4a5140 Mon Sep 17 00:00:00 2001
From: Craig R <megasaxon@gmail.com>
Date: Mon, 13 Jul 2015 16:24:10 +1000
Subject: [PATCH 08/21] Cleanup

---
 php_xxhash.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/php_xxhash.c b/php_xxhash.c
index 6d9c5fa..e7a8c08 100644
--- a/php_xxhash.c
+++ b/php_xxhash.c
@@ -52,7 +52,6 @@ PHP_FUNCTION(xxhash64)
     size_t arg_len, len;
     zend_string *strg;
 	unsigned long long sum;
-	//typedef unsigned int U32;
 
     if (zend_parse_parameters(ZEND_NUM_ARGS(), "s", &arg, &arg_len) == FAILURE || arg_len < 1) {
         return;

From e7975b8a4cbdae6e38f1fd2f44e497eb4d911bcb Mon Sep 17 00:00:00 2001
From: Craig R <megasaxon@gmail.com>
Date: Mon, 13 Jul 2015 16:27:36 +1000
Subject: [PATCH 09/21] Update readme to include information regarding PHP7

---
 README.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/README.md b/README.md
index 77afa7d..425a3b1 100644
--- a/README.md
+++ b/README.md
@@ -6,6 +6,10 @@ A PHP extension to add support for the [xxHash](https://github.com/Cyan4973/xxHa
 [xxHash](https://github.com/Cyan4973/xxHash) is an Extremely fast Hash algorithm, running at RAM speed limits. 
 It successfully completes the [SMHasher](http://code.google.com/p/smhasher/wiki/SMHasher) test suite which evaluates collision, dispersion and randomness qualities of hash functions.
 
+## PHP7 Compatability
+
+Please note at the moment the master branch should be used for PHP 5.x and develop branch should be used for PHP 7.x
+
 ## Installation Instructions
 
 ```

From 10ab8cac9253cbd92468a88c2fdf37ebc66a3378 Mon Sep 17 00:00:00 2001
From: Nir Heimann <nir@gobeepit.com>
Date: Sun, 20 Sep 2015 17:33:41 +0300
Subject: [PATCH 10/21] Updating the base library to the latest version

* Adding support for the 64bit version of the xxhash library
---
 README.md    |   14 +-
 php_xxhash.c |   44 ++
 php_xxhash.h |    2 +
 xxhash.c     | 1166 ++++++++++++++++++++++++++++++++++++++------------
 xxhash.h     |  158 +++++--
 5 files changed, 1061 insertions(+), 323 deletions(-)

diff --git a/README.md b/README.md
index 66314ca..6375612 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,7 @@
 # php-xxhash
 
-PHP extension to add support for the [xxhash](http://code.google.com/p/xxhash/) fast hashing algorithm.  _xxhash_ is designed to be fast enough to use in real-time streaming applications.
+PHP extension to add support for the [xxhash - r42](https://github.com/Cyan4973/xxHash) extremely fast hashing algorithm.  _xxhash_ is designed to be fast enough to use in real-time streaming applications.
+
 
 ## How To Install
 
@@ -13,13 +14,20 @@ PHP extension to add support for the [xxhash](http://code.google.com/p/xxhash/)
 
 ## How To Use
 
-This extension adds one new PHP function:
+This extension adds three new PHP functions:
 
 ```
+    // 32 bit version (all values are positive)
     int xxhash32(string $data);
+    
+    // 64 bit version (can return negative values since PHP doesn't support unsigned long values)
+    long xxhash64(string $data);
+    
+    // 64 bit version (all values are positive but returned as strings)
+    string xxhash64Unsigned(string $data);
 ```
 
-It will checksum the string, and return the checksum.
+They will checksum the string, and return the checksum.
 
 ## License
 
diff --git a/php_xxhash.c b/php_xxhash.c
index fc73bf7..1525981 100644
--- a/php_xxhash.c
+++ b/php_xxhash.c
@@ -62,8 +62,52 @@ PHP_FUNCTION(xxhash32)
 	RETURN_LONG((long)sum);
 }
 
+PHP_FUNCTION(xxhash64)
+{
+	char *arg1 = NULL;
+	char *ret1 = NULL;
+	int arg1_len;
+	unsigned long long sum;
+
+	/* parse the parameters */
+	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s", &arg1, &arg1_len) == FAILURE || arg1_len < 1)
+	{
+		RETURN_NULL();
+	}
+
+	/* compute the checksum */
+	sum = XXH64(arg1, arg1_len, 0);
+
+	/* return the checksum */
+	/* Negative values can be returned since we cannot return  unsigned long to php */
+	RETURN_LONG(sum); 
+}
+
+PHP_FUNCTION(xxhash64Unsigned)
+{
+	char *arg1 = NULL;
+	char *ret1 = NULL;
+	int arg1_len;
+	unsigned long long sum;
+
+	/* parse the parameters */
+	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s", &arg1, &arg1_len) == FAILURE || arg1_len < 1)
+	{
+		RETURN_NULL();
+	}
+
+	/* compute the checksum */
+	sum = XXH64(arg1, arg1_len, 0);
+	
+	//Since php doesn't have unsinged long values, the value will be returned as string
+	char numberAsAString[20];
+	sprintf(numberAsAString, "%llu",sum);
+	RETURN_STRING(numberAsAString, 1);
+}
 zend_function_entry xxhash_functions[] = {
 	PHP_FE(xxhash32, NULL)
+	PHP_FE(xxhash64, NULL)
+	PHP_FE(xxhash64Unsigned, NULL)
 	{NULL, NULL, NULL}
 };
 
diff --git a/php_xxhash.h b/php_xxhash.h
index 00b2266..c71241d 100644
--- a/php_xxhash.h
+++ b/php_xxhash.h
@@ -23,6 +23,8 @@ PHP_RSHUTDOWN_FUNCTION(xxhash);
 PHP_MINFO_FUNCTION(xxhash);
 
 PHP_FUNCTION(xxhash32);
+PHP_FUNCTION(xxhash64);
+PHP_FUNCTION(xxhash64Unsigned);
 
 #endif /* PHP_XXHASH_H */
 
diff --git a/xxhash.c b/xxhash.c
index 164f74d..511d994 100644
--- a/xxhash.c
+++ b/xxhash.c
@@ -1,342 +1,962 @@
 /*
-   xxHash - Fast Hash algorithm
-   Copyright (C) 2012, Yann Collet.
-   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-
-       * Redistributions of source code must retain the above copyright
-   notice, this list of conditions and the following disclaimer.
-       * Redistributions in binary form must reproduce the above
-   copyright notice, this list of conditions and the following disclaimer
-   in the documentation and/or other materials provided with the
-   distribution.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-	You can contact the author at :
-	- xxHash source repository : http://code.google.com/p/xxhash/
+xxHash - Fast Hash algorithm
+Copyright (C) 2012-2015, Yann Collet
+
+BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+You can contact the author at :
+- xxHash source repository : https://github.com/Cyan4973/xxHash
 */
 
 
+/**************************************
+*  Tuning parameters
+**************************************/
+/* XXH_FORCE_MEMORY_ACCESS
+ * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable.
+ * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal.
+ * The below switch allow to select different access method for improved performance.
+ * Method 0 (default) : use `memcpy()`. Safe and portable.
+ * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable).
+ *            This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`.
+ * Method 2 : direct access. This method is portable but violate C standard.
+ *            It can generate buggy code on targets which generate assembly depending on alignment.
+ *            But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6)
+ * See http://stackoverflow.com/a/32095106/646947 for details.
+ * Prefer these methods in priority order (0 > 1 > 2)
+ */
+#ifndef XXH_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
+#  if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) )
+#    define XXH_FORCE_MEMORY_ACCESS 2
+#  elif defined(__INTEL_COMPILER) || \
+  (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7S__) ))
+#    define XXH_FORCE_MEMORY_ACCESS 1
+#  endif
+#endif
 
-//**************************************
-// Tuning parameters
-//**************************************
-// FORCE_NATIVE_FORMAT :
-// By default, xxHash library provides endian-independant Hash values.
-// Results are therefore identical for big-endian and little-endian CPU.
-// This comes at a  performance cost for big-endian CPU, since some swapping is required to emulate little-endian format.
-// Should endian-independance be of no importance to your application, you may uncomment the #define below
-// It will improve speed for Big-endian CPU.
-// This option has no impact on Little_Endian CPU.
-//#define FORCE_NATIVE_FORMAT 1
+/* XXH_ACCEPT_NULL_INPUT_POINTER :
+ * If the input pointer is a null pointer, xxHash default behavior is to trigger a memory access error, since it is a bad pointer.
+ * When this option is enabled, xxHash output for null input pointers will be the same as a null-length input.
+ * By default, this option is disabled. To enable it, uncomment below define :
+ */
+/* #define XXH_ACCEPT_NULL_INPUT_POINTER 1 */
+
+/* XXH_FORCE_NATIVE_FORMAT :
+ * By default, xxHash library provides endian-independant Hash values, based on little-endian convention.
+ * Results are therefore identical for little-endian and big-endian CPU.
+ * This comes at a performance cost for big-endian CPU, since some swapping is required to emulate little-endian format.
+ * Should endian-independance be of no importance for your application, you may set the #define below to 1,
+ * to improve speed for Big-endian CPU.
+ * This option has no impact on Little_Endian CPU.
+ */
+#define XXH_FORCE_NATIVE_FORMAT 0
+
+/* XXH_USELESS_ALIGN_BRANCH :
+ * This is a minor performance trick, only useful with lots of very small keys.
+ * It means : don't make a test between aligned/unaligned, because performance will be the same.
+ * It saves one initial branch per hash.
+ */
+#if defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)
+#  define XXH_USELESS_ALIGN_BRANCH 1
+#endif
 
 
+/**************************************
+*  Compiler Specific Options
+***************************************/
+#ifdef _MSC_VER    /* Visual Studio */
+#  pragma warning(disable : 4127)      /* disable: C4127: conditional expression is constant */
+#  define FORCE_INLINE static __forceinline
+#else
+#  if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
+#    ifdef __GNUC__
+#      define FORCE_INLINE static inline __attribute__((always_inline))
+#    else
+#      define FORCE_INLINE static inline
+#    endif
+#  else
+#    define FORCE_INLINE static
+#  endif /* __STDC_VERSION__ */
+#endif
+
 
-//**************************************
-// Includes
-//**************************************
-#include <stdlib.h>    // for malloc(), free()
-#include <string.h>    // for memcpy()
+/**************************************
+*  Includes & Memory related functions
+***************************************/
 #include "xxhash.h"
+/* Modify the local functions below should you wish to use some other memory routines */
+/* for malloc(), free() */
+#include <stdlib.h>
+static void* XXH_malloc(size_t s) { return malloc(s); }
+static void  XXH_free  (void* p)  { free(p); }
+/* for memcpy() */
+#include <string.h>
+static void* XXH_memcpy(void* dest, const void* src, size_t size) { return memcpy(dest,src,size); }
+
+
+/**************************************
+*  Basic Types
+***************************************/
+#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
+# include <stdint.h>
+  typedef uint8_t  BYTE;
+  typedef uint16_t U16;
+  typedef uint32_t U32;
+  typedef  int32_t S32;
+  typedef uint64_t U64;
+#else
+  typedef unsigned char      BYTE;
+  typedef unsigned short     U16;
+  typedef unsigned int       U32;
+  typedef   signed int       S32;
+  typedef unsigned long long U64;
+#endif
 
 
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))
 
-//**************************************
-// CPU Feature Detection
-//**************************************
-// Little Endian or Big Endian ?
-// You can overwrite the #define below if you know your architecture endianess
-#if defined(FORCE_NATIVE_FORMAT) && (FORCE_NATIVE_FORMAT==1)
-// Force native format. The result will be endian dependant.
-#  define XXH_BIG_ENDIAN 0
-#elif defined (__GLIBC__)
-#  include <endian.h>
-#  if (__BYTE_ORDER == __BIG_ENDIAN)
-#     define XXH_BIG_ENDIAN 1
-#  endif
-#elif (defined(__BIG_ENDIAN__) || defined(__BIG_ENDIAN) || defined(_BIG_ENDIAN)) && !(defined(__LITTLE_ENDIAN__) || defined(__LITTLE_ENDIAN) || defined(_LITTLE_ENDIAN))
-#  define XXH_BIG_ENDIAN 1
-#elif defined(__sparc) || defined(__sparc__) \
-   || defined(__ppc__) || defined(_POWER) || defined(__powerpc__) || defined(_ARCH_PPC) || defined(__PPC__) || defined(__PPC) || defined(PPC) || defined(__powerpc__) || defined(__powerpc) || defined(powerpc) \
-   || defined(__hpux)  || defined(__hppa) \
-   || defined(_MIPSEB) || defined(__s390__)
-#  define XXH_BIG_ENDIAN 1
-#endif
+/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */
+static U32 XXH_read32(const void* memPtr) { return *(const U32*) memPtr; }
+static U64 XXH_read64(const void* memPtr) { return *(const U64*) memPtr; }
 
-#if !defined(XXH_BIG_ENDIAN)
-// Little Endian assumed. PDP Endian and other very rare endian format are unsupported.
-#  define XXH_BIG_ENDIAN 0
-#endif
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
 
+/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
+/* currently only defined for gcc and icc */
+typedef union { U32 u32; U64 u64; } __attribute__((packed)) unalign;
+
+static U32 XXH_read32(const void* ptr) { return ((const unalign*)ptr)->u32; }
+static U64 XXH_read64(const void* ptr) { return ((const unalign*)ptr)->u64; }
+
+#else
 
+/* portable and safe solution. Generally efficient.
+ * see : http://stackoverflow.com/a/32095106/646947
+ */
 
-//**************************************
-// Compiler-specific Options & Functions
-//**************************************
+static U32 XXH_read32(const void* memPtr)
+{
+    U32 val;
+    memcpy(&val, memPtr, sizeof(val));
+    return val;
+}
+
+static U64 XXH_read64(const void* memPtr)
+{
+    U64 val;
+    memcpy(&val, memPtr, sizeof(val));
+    return val;
+}
+
+#endif // XXH_FORCE_DIRECT_MEMORY_ACCESS
+
+
+/******************************************
+*  Compiler-specific Functions and Macros
+******************************************/
 #define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
 
-// Note : under GCC, it may sometimes be faster to enable the (2nd) macro definition, instead of using win32 intrinsic
-#if defined(_WIN32)
+/* Note : although _rotl exists for minGW (GCC under windows), performance seems poor */
+#if defined(_MSC_VER)
 #  define XXH_rotl32(x,r) _rotl(x,r)
+#  define XXH_rotl64(x,r) _rotl64(x,r)
 #else
 #  define XXH_rotl32(x,r) ((x << r) | (x >> (32 - r)))
+#  define XXH_rotl64(x,r) ((x << r) | (x >> (64 - r)))
 #endif
 
-#if defined(_MSC_VER)     // Visual Studio
+#if defined(_MSC_VER)     /* Visual Studio */
 #  define XXH_swap32 _byteswap_ulong
+#  define XXH_swap64 _byteswap_uint64
 #elif GCC_VERSION >= 403
 #  define XXH_swap32 __builtin_bswap32
+#  define XXH_swap64 __builtin_bswap64
 #else
-static inline unsigned int XXH_swap32 (unsigned int x) {
-                        return  ((x << 24) & 0xff000000 ) |
-                                ((x <<  8) & 0x00ff0000 ) |
-                                ((x >>  8) & 0x0000ff00 ) |
-                                ((x >> 24) & 0x000000ff );
-                 }
+static U32 XXH_swap32 (U32 x)
+{
+    return  ((x << 24) & 0xff000000 ) |
+            ((x <<  8) & 0x00ff0000 ) |
+            ((x >>  8) & 0x0000ff00 ) |
+            ((x >> 24) & 0x000000ff );
+}
+static U64 XXH_swap64 (U64 x)
+{
+    return  ((x << 56) & 0xff00000000000000ULL) |
+            ((x << 40) & 0x00ff000000000000ULL) |
+            ((x << 24) & 0x0000ff0000000000ULL) |
+            ((x << 8)  & 0x000000ff00000000ULL) |
+            ((x >> 8)  & 0x00000000ff000000ULL) |
+            ((x >> 24) & 0x0000000000ff0000ULL) |
+            ((x >> 40) & 0x000000000000ff00ULL) |
+            ((x >> 56) & 0x00000000000000ffULL);
+}
+#endif
+
+
+/***************************************
+*  Architecture Macros
+***************************************/
+typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } XXH_endianess;
+
+/* XXH_CPU_LITTLE_ENDIAN can be defined externally, for example one the compiler command line */
+#ifndef XXH_CPU_LITTLE_ENDIAN
+    static const int one = 1;
+#   define XXH_CPU_LITTLE_ENDIAN   (*(const char*)(&one))
 #endif
 
 
+/*****************************
+*  Memory reads
+*****************************/
+typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment;
+
+FORCE_INLINE U32 XXH_readLE32_align(const void* ptr, XXH_endianess endian, XXH_alignment align)
+{
+    if (align==XXH_unaligned)
+        return endian==XXH_littleEndian ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr));
+    else
+        return endian==XXH_littleEndian ? *(const U32*)ptr : XXH_swap32(*(const U32*)ptr);
+}
+
+FORCE_INLINE U32 XXH_readLE32(const void* ptr, XXH_endianess endian)
+{
+    return XXH_readLE32_align(ptr, endian, XXH_unaligned);
+}
+
+FORCE_INLINE U64 XXH_readLE64_align(const void* ptr, XXH_endianess endian, XXH_alignment align)
+{
+    if (align==XXH_unaligned)
+        return endian==XXH_littleEndian ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr));
+    else
+        return endian==XXH_littleEndian ? *(const U64*)ptr : XXH_swap64(*(const U64*)ptr);
+}
 
-//**************************************
-// Constants
-//**************************************
+FORCE_INLINE U64 XXH_readLE64(const void* ptr, XXH_endianess endian)
+{
+    return XXH_readLE64_align(ptr, endian, XXH_unaligned);
+}
+
+
+/***************************************
+*  Macros
+***************************************/
+#define XXH_STATIC_ASSERT(c)   { enum { XXH_static_assert = 1/(!!(c)) }; }    /* use only *after* variable declarations */
+
+
+/***************************************
+*  Constants
+***************************************/
 #define PRIME32_1   2654435761U
 #define PRIME32_2   2246822519U
 #define PRIME32_3   3266489917U
 #define PRIME32_4    668265263U
 #define PRIME32_5    374761393U
 
+#define PRIME64_1 11400714785074694791ULL
+#define PRIME64_2 14029467366897019727ULL
+#define PRIME64_3  1609587929392839161ULL
+#define PRIME64_4  9650029242287828579ULL
+#define PRIME64_5  2870177450012600261ULL
 
 
-//**************************************
-// Macros
-//**************************************
-#define XXH_LE32(p)  (XXH_BIG_ENDIAN ? XXH_swap32(*(unsigned int*)(p)) : *(unsigned int*)(p))
+/*****************************
+*  Simple Hash Functions
+*****************************/
+FORCE_INLINE U32 XXH32_endian_align(const void* input, size_t len, U32 seed, XXH_endianess endian, XXH_alignment align)
+{
+    const BYTE* p = (const BYTE*)input;
+    const BYTE* bEnd = p + len;
+    U32 h32;
+#define XXH_get32bits(p) XXH_readLE32_align(p, endian, align)
+
+#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
+    if (p==NULL)
+    {
+        len=0;
+        bEnd=p=(const BYTE*)(size_t)16;
+    }
+#endif
 
+    if (len>=16)
+    {
+        const BYTE* const limit = bEnd - 16;
+        U32 v1 = seed + PRIME32_1 + PRIME32_2;
+        U32 v2 = seed + PRIME32_2;
+        U32 v3 = seed + 0;
+        U32 v4 = seed - PRIME32_1;
+
+        do
+        {
+            v1 += XXH_get32bits(p) * PRIME32_2;
+            v1 = XXH_rotl32(v1, 13);
+            v1 *= PRIME32_1;
+            p+=4;
+            v2 += XXH_get32bits(p) * PRIME32_2;
+            v2 = XXH_rotl32(v2, 13);
+            v2 *= PRIME32_1;
+            p+=4;
+            v3 += XXH_get32bits(p) * PRIME32_2;
+            v3 = XXH_rotl32(v3, 13);
+            v3 *= PRIME32_1;
+            p+=4;
+            v4 += XXH_get32bits(p) * PRIME32_2;
+            v4 = XXH_rotl32(v4, 13);
+            v4 *= PRIME32_1;
+            p+=4;
+        }
+        while (p<=limit);
+
+        h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
+    }
+    else
+    {
+        h32  = seed + PRIME32_5;
+    }
+
+    h32 += (U32) len;
+
+    while (p+4<=bEnd)
+    {
+        h32 += XXH_get32bits(p) * PRIME32_3;
+        h32  = XXH_rotl32(h32, 17) * PRIME32_4 ;
+        p+=4;
+    }
+
+    while (p<bEnd)
+    {
+        h32 += (*p) * PRIME32_5;
+        h32 = XXH_rotl32(h32, 11) * PRIME32_1 ;
+        p++;
+    }
+
+    h32 ^= h32 >> 15;
+    h32 *= PRIME32_2;
+    h32 ^= h32 >> 13;
+    h32 *= PRIME32_3;
+    h32 ^= h32 >> 16;
+
+    return h32;
+}
 
 
-//****************************
-// Simple Hash Functions
-//****************************
+unsigned int XXH32 (const void* input, size_t len, unsigned int seed)
+{
+#if 0
+    /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
+    XXH32_state_t state;
+    XXH32_reset(&state, seed);
+    XXH32_update(&state, input, len);
+    return XXH32_digest(&state);
+#else
+    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+#  if !defined(XXH_USELESS_ALIGN_BRANCH)
+    if ((((size_t)input) & 3) == 0)   /* Input is 4-bytes aligned, leverage the speed benefit */
+    {
+        if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+            return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned);
+        else
+            return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned);
+    }
+#  endif
 
-unsigned int XXH32(const void* input, int len, unsigned int seed)
+    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+        return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned);
+    else
+        return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned);
+#endif
+}
+
+FORCE_INLINE U64 XXH64_endian_align(const void* input, size_t len, U64 seed, XXH_endianess endian, XXH_alignment align)
+{
+    const BYTE* p = (const BYTE*)input;
+    const BYTE* bEnd = p + len;
+    U64 h64;
+#define XXH_get64bits(p) XXH_readLE64_align(p, endian, align)
+
+#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
+    if (p==NULL)
+    {
+        len=0;
+        bEnd=p=(const BYTE*)(size_t)32;
+    }
+#endif
+
+    if (len>=32)
+    {
+        const BYTE* const limit = bEnd - 32;
+        U64 v1 = seed + PRIME64_1 + PRIME64_2;
+        U64 v2 = seed + PRIME64_2;
+        U64 v3 = seed + 0;
+        U64 v4 = seed - PRIME64_1;
+
+        do
+        {
+            v1 += XXH_get64bits(p) * PRIME64_2;
+            p+=8;
+            v1 = XXH_rotl64(v1, 31);
+            v1 *= PRIME64_1;
+            v2 += XXH_get64bits(p) * PRIME64_2;
+            p+=8;
+            v2 = XXH_rotl64(v2, 31);
+            v2 *= PRIME64_1;
+            v3 += XXH_get64bits(p) * PRIME64_2;
+            p+=8;
+            v3 = XXH_rotl64(v3, 31);
+            v3 *= PRIME64_1;
+            v4 += XXH_get64bits(p) * PRIME64_2;
+            p+=8;
+            v4 = XXH_rotl64(v4, 31);
+            v4 *= PRIME64_1;
+        }
+        while (p<=limit);
+
+        h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
+
+        v1 *= PRIME64_2;
+        v1 = XXH_rotl64(v1, 31);
+        v1 *= PRIME64_1;
+        h64 ^= v1;
+        h64 = h64 * PRIME64_1 + PRIME64_4;
+
+        v2 *= PRIME64_2;
+        v2 = XXH_rotl64(v2, 31);
+        v2 *= PRIME64_1;
+        h64 ^= v2;
+        h64 = h64 * PRIME64_1 + PRIME64_4;
+
+        v3 *= PRIME64_2;
+        v3 = XXH_rotl64(v3, 31);
+        v3 *= PRIME64_1;
+        h64 ^= v3;
+        h64 = h64 * PRIME64_1 + PRIME64_4;
+
+        v4 *= PRIME64_2;
+        v4 = XXH_rotl64(v4, 31);
+        v4 *= PRIME64_1;
+        h64 ^= v4;
+        h64 = h64 * PRIME64_1 + PRIME64_4;
+    }
+    else
+    {
+        h64  = seed + PRIME64_5;
+    }
+
+    h64 += (U64) len;
+
+    while (p+8<=bEnd)
+    {
+        U64 k1 = XXH_get64bits(p);
+        k1 *= PRIME64_2;
+        k1 = XXH_rotl64(k1,31);
+        k1 *= PRIME64_1;
+        h64 ^= k1;
+        h64 = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4;
+        p+=8;
+    }
+
+    if (p+4<=bEnd)
+    {
+        h64 ^= (U64)(XXH_get32bits(p)) * PRIME64_1;
+        h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3;
+        p+=4;
+    }
+
+    while (p<bEnd)
+    {
+        h64 ^= (*p) * PRIME64_5;
+        h64 = XXH_rotl64(h64, 11) * PRIME64_1;
+        p++;
+    }
+
+    h64 ^= h64 >> 33;
+    h64 *= PRIME64_2;
+    h64 ^= h64 >> 29;
+    h64 *= PRIME64_3;
+    h64 ^= h64 >> 32;
+
+    return h64;
+}
+
+
+unsigned long long XXH64 (const void* input, size_t len, unsigned long long seed)
 {
 #if 0
-	// Simple version, good for code maintenance, but unfortunately slow for small inputs
-	void* state = XXH32_init(seed);
-	XXH32_feed(state, input, len);
-	return XXH32_result(state);
+    /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
+    XXH64_state_t state;
+    XXH64_reset(&state, seed);
+    XXH64_update(&state, input, len);
+    return XXH64_digest(&state);
 #else
+    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+#  if !defined(XXH_USELESS_ALIGN_BRANCH)
+    if ((((size_t)input) & 7)==0)   /* Input is aligned, let's leverage the speed advantage */
+    {
+        if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+            return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned);
+        else
+            return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned);
+    }
+#  endif
+
+    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+        return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned);
+    else
+        return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned);
+#endif
+}
 
-	const unsigned char* p = (const unsigned char*)input;
-	const unsigned char* const bEnd = p + len;
-	unsigned int h32;
-
-	if (len>=16)
-	{
-		const unsigned char* const limit = bEnd - 16;
-		unsigned int v1 = seed + PRIME32_1 + PRIME32_2;
-		unsigned int v2 = seed + PRIME32_2;
-		unsigned int v3 = seed + 0;
-		unsigned int v4 = seed - PRIME32_1;
-
-		do
-		{
-			v1 += XXH_LE32(p) * PRIME32_2; v1 = XXH_rotl32(v1, 13); v1 *= PRIME32_1; p+=4;
-			v2 += XXH_LE32(p) * PRIME32_2; v2 = XXH_rotl32(v2, 13); v2 *= PRIME32_1; p+=4;
-			v3 += XXH_LE32(p) * PRIME32_2; v3 = XXH_rotl32(v3, 13); v3 *= PRIME32_1; p+=4;
-			v4 += XXH_LE32(p) * PRIME32_2; v4 = XXH_rotl32(v4, 13); v4 *= PRIME32_1; p+=4;
-		} while (p<=limit) ;
-
-		h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
-	}
-	else
-	{
-		h32  = seed + PRIME32_5;
-	}
-
-	h32 += (unsigned int) len;
-	
-	while (p<=bEnd-4)
-	{
-		h32 += XXH_LE32(p) * PRIME32_3;
-		h32 = XXH_rotl32(h32, 17) * PRIME32_4 ;
-		p+=4;
-	}
-
-	while (p<bEnd)
-	{
-		h32 += (*p) * PRIME32_5;
-		h32 = XXH_rotl32(h32, 11) * PRIME32_1 ;
-		p++;
-	}
-
-	h32 ^= h32 >> 15;
-	h32 *= PRIME32_2;
-	h32 ^= h32 >> 13;
-	h32 *= PRIME32_3;
-	h32 ^= h32 >> 16;
-
-	return h32;
+/****************************************************
+*  Advanced Hash Functions
+****************************************************/
 
+/*** Allocation ***/
+typedef struct
+{
+    U64 total_len;
+    U32 seed;
+    U32 v1;
+    U32 v2;
+    U32 v3;
+    U32 v4;
+    U32 mem32[4];   /* defined as U32 for alignment */
+    U32 memsize;
+} XXH_istate32_t;
+
+typedef struct
+{
+    U64 total_len;
+    U64 seed;
+    U64 v1;
+    U64 v2;
+    U64 v3;
+    U64 v4;
+    U64 mem64[4];   /* defined as U64 for alignment */
+    U32 memsize;
+} XXH_istate64_t;
+
+
+XXH32_state_t* XXH32_createState(void)
+{
+    XXH_STATIC_ASSERT(sizeof(XXH32_state_t) >= sizeof(XXH_istate32_t));   /* A compilation error here means XXH32_state_t is not large enough */
+    return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t));
+}
+XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr)
+{
+    XXH_free(statePtr);
+    return XXH_OK;
+}
+
+XXH64_state_t* XXH64_createState(void)
+{
+    XXH_STATIC_ASSERT(sizeof(XXH64_state_t) >= sizeof(XXH_istate64_t));   /* A compilation error here means XXH64_state_t is not large enough */
+    return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t));
+}
+XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr)
+{
+    XXH_free(statePtr);
+    return XXH_OK;
+}
+
+
+/*** Hash feed ***/
+
+XXH_errorcode XXH32_reset(XXH32_state_t* state_in, unsigned int seed)
+{
+    XXH_istate32_t* state = (XXH_istate32_t*) state_in;
+    state->seed = seed;
+    state->v1 = seed + PRIME32_1 + PRIME32_2;
+    state->v2 = seed + PRIME32_2;
+    state->v3 = seed + 0;
+    state->v4 = seed - PRIME32_1;
+    state->total_len = 0;
+    state->memsize = 0;
+    return XXH_OK;
+}
+
+XXH_errorcode XXH64_reset(XXH64_state_t* state_in, unsigned long long seed)
+{
+    XXH_istate64_t* state = (XXH_istate64_t*) state_in;
+    state->seed = seed;
+    state->v1 = seed + PRIME64_1 + PRIME64_2;
+    state->v2 = seed + PRIME64_2;
+    state->v3 = seed + 0;
+    state->v4 = seed - PRIME64_1;
+    state->total_len = 0;
+    state->memsize = 0;
+    return XXH_OK;
+}
+
+
+FORCE_INLINE XXH_errorcode XXH32_update_endian (XXH32_state_t* state_in, const void* input, size_t len, XXH_endianess endian)
+{
+    XXH_istate32_t* state = (XXH_istate32_t *) state_in;
+    const BYTE* p = (const BYTE*)input;
+    const BYTE* const bEnd = p + len;
+
+#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
+    if (input==NULL) return XXH_ERROR;
 #endif
+
+    state->total_len += len;
+
+    if (state->memsize + len < 16)   /* fill in tmp buffer */
+    {
+        XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, len);
+        state->memsize += (U32)len;
+        return XXH_OK;
+    }
+
+    if (state->memsize)   /* some data left from previous update */
+    {
+        XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, 16-state->memsize);
+        {
+            const U32* p32 = state->mem32;
+            state->v1 += XXH_readLE32(p32, endian) * PRIME32_2;
+            state->v1 = XXH_rotl32(state->v1, 13);
+            state->v1 *= PRIME32_1;
+            p32++;
+            state->v2 += XXH_readLE32(p32, endian) * PRIME32_2;
+            state->v2 = XXH_rotl32(state->v2, 13);
+            state->v2 *= PRIME32_1;
+            p32++;
+            state->v3 += XXH_readLE32(p32, endian) * PRIME32_2;
+            state->v3 = XXH_rotl32(state->v3, 13);
+            state->v3 *= PRIME32_1;
+            p32++;
+            state->v4 += XXH_readLE32(p32, endian) * PRIME32_2;
+            state->v4 = XXH_rotl32(state->v4, 13);
+            state->v4 *= PRIME32_1;
+            p32++;
+        }
+        p += 16-state->memsize;
+        state->memsize = 0;
+    }
+
+    if (p <= bEnd-16)
+    {
+        const BYTE* const limit = bEnd - 16;
+        U32 v1 = state->v1;
+        U32 v2 = state->v2;
+        U32 v3 = state->v3;
+        U32 v4 = state->v4;
+
+        do
+        {
+            v1 += XXH_readLE32(p, endian) * PRIME32_2;
+            v1 = XXH_rotl32(v1, 13);
+            v1 *= PRIME32_1;
+            p+=4;
+            v2 += XXH_readLE32(p, endian) * PRIME32_2;
+            v2 = XXH_rotl32(v2, 13);
+            v2 *= PRIME32_1;
+            p+=4;
+            v3 += XXH_readLE32(p, endian) * PRIME32_2;
+            v3 = XXH_rotl32(v3, 13);
+            v3 *= PRIME32_1;
+            p+=4;
+            v4 += XXH_readLE32(p, endian) * PRIME32_2;
+            v4 = XXH_rotl32(v4, 13);
+            v4 *= PRIME32_1;
+            p+=4;
+        }
+        while (p<=limit);
+
+        state->v1 = v1;
+        state->v2 = v2;
+        state->v3 = v3;
+        state->v4 = v4;
+    }
+
+    if (p < bEnd)
+    {
+        XXH_memcpy(state->mem32, p, bEnd-p);
+        state->memsize = (int)(bEnd-p);
+    }
+
+    return XXH_OK;
+}
+
+XXH_errorcode XXH32_update (XXH32_state_t* state_in, const void* input, size_t len)
+{
+    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+        return XXH32_update_endian(state_in, input, len, XXH_littleEndian);
+    else
+        return XXH32_update_endian(state_in, input, len, XXH_bigEndian);
 }
 
 
-//****************************
-// Advanced Hash Functions
-//****************************
 
-struct XXH_state32_t
+FORCE_INLINE U32 XXH32_digest_endian (const XXH32_state_t* state_in, XXH_endianess endian)
 {
-	unsigned int seed;
-	unsigned int v1;
-	unsigned int v2;
-	unsigned int v3;
-	unsigned int v4;
-	unsigned long long total_len;
-	char memory[16];
-	int memsize;
-};
-
-
-void* XXH32_init (unsigned int seed)
+    const XXH_istate32_t* state = (const XXH_istate32_t*) state_in;
+    const BYTE * p = (const BYTE*)state->mem32;
+    const BYTE* bEnd = (const BYTE*)(state->mem32) + state->memsize;
+    U32 h32;
+
+    if (state->total_len >= 16)
+    {
+        h32 = XXH_rotl32(state->v1, 1) + XXH_rotl32(state->v2, 7) + XXH_rotl32(state->v3, 12) + XXH_rotl32(state->v4, 18);
+    }
+    else
+    {
+        h32  = state->seed + PRIME32_5;
+    }
+
+    h32 += (U32) state->total_len;
+
+    while (p+4<=bEnd)
+    {
+        h32 += XXH_readLE32(p, endian) * PRIME32_3;
+        h32  = XXH_rotl32(h32, 17) * PRIME32_4;
+        p+=4;
+    }
+
+    while (p<bEnd)
+    {
+        h32 += (*p) * PRIME32_5;
+        h32 = XXH_rotl32(h32, 11) * PRIME32_1;
+        p++;
+    }
+
+    h32 ^= h32 >> 15;
+    h32 *= PRIME32_2;
+    h32 ^= h32 >> 13;
+    h32 *= PRIME32_3;
+    h32 ^= h32 >> 16;
+
+    return h32;
+}
+
+
+unsigned int XXH32_digest (const XXH32_state_t* state_in)
 {
-	struct XXH_state32_t * state = (struct XXH_state32_t *) malloc ( sizeof(struct XXH_state32_t));
-	state->seed = seed;
-	state->v1 = seed + PRIME32_1 + PRIME32_2;
-	state->v2 = seed + PRIME32_2;
-	state->v3 = seed + 0;
-	state->v4 = seed - PRIME32_1;
-	state->total_len = 0;
-	state->memsize = 0;
-
-	return (void*)state;
+    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+        return XXH32_digest_endian(state_in, XXH_littleEndian);
+    else
+        return XXH32_digest_endian(state_in, XXH_bigEndian);
 }
 
 
-int   XXH32_feed (void* state_in, const void* input, int len)
+FORCE_INLINE XXH_errorcode XXH64_update_endian (XXH64_state_t* state_in, const void* input, size_t len, XXH_endianess endian)
 {
-	struct XXH_state32_t * state = state_in;
-	const unsigned char* p = (const unsigned char*)input;
-	const unsigned char* const bEnd = p + len;
-
-	state->total_len += len;
-	
-	if (state->memsize + len < 16)   // fill in tmp buffer
-	{
-		memcpy(state->memory + state->memsize, input, len);
-		state->memsize +=  len;
-		return 0;
-	}
-
-	if (state->memsize)   // some data left from previous feed
-	{
-		memcpy(state->memory + state->memsize, input, 16-state->memsize);
-		{
-			const unsigned int* p32 = (const unsigned int*)state->memory;
-			state->v1 += XXH_LE32(p32) * PRIME32_2; state->v1 = XXH_rotl32(state->v1, 13); state->v1 *= PRIME32_1; p32++;
-			state->v2 += XXH_LE32(p32) * PRIME32_2; state->v2 = XXH_rotl32(state->v2, 13); state->v2 *= PRIME32_1; p32++; 
-			state->v3 += XXH_LE32(p32) * PRIME32_2; state->v3 = XXH_rotl32(state->v3, 13); state->v3 *= PRIME32_1; p32++;
-			state->v4 += XXH_LE32(p32) * PRIME32_2; state->v4 = XXH_rotl32(state->v4, 13); state->v4 *= PRIME32_1; p32++;
-		}
-		p += 16-state->memsize;
-		state->memsize = 0;
-	}
-
-	{
-		const unsigned char* const limit = bEnd - 16;
-		unsigned int v1 = state->v1;
-		unsigned int v2 = state->v2;
-		unsigned int v3 = state->v3;
-		unsigned int v4 = state->v4;
-
-		while (p<=limit)
-		{
-			v1 += XXH_LE32(p) * PRIME32_2; v1 = XXH_rotl32(v1, 13); v1 *= PRIME32_1; p+=4;
-			v2 += XXH_LE32(p) * PRIME32_2; v2 = XXH_rotl32(v2, 13); v2 *= PRIME32_1; p+=4;
-			v3 += XXH_LE32(p) * PRIME32_2; v3 = XXH_rotl32(v3, 13); v3 *= PRIME32_1; p+=4;
-			v4 += XXH_LE32(p) * PRIME32_2; v4 = XXH_rotl32(v4, 13); v4 *= PRIME32_1; p+=4;
-		}  
-
-		state->v1 = v1;
-		state->v2 = v2;
-		state->v3 = v3;
-		state->v4 = v4;
-	}
-
-	if (p < bEnd)
-	{
-		memcpy(state->memory, p, bEnd-p);
-		state->memsize = bEnd-p;
-	}
-
-	return 0;
+    XXH_istate64_t * state = (XXH_istate64_t *) state_in;
+    const BYTE* p = (const BYTE*)input;
+    const BYTE* const bEnd = p + len;
+
+#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
+    if (input==NULL) return XXH_ERROR;
+#endif
+
+    state->total_len += len;
+
+    if (state->memsize + len < 32)   /* fill in tmp buffer */
+    {
+        XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, len);
+        state->memsize += (U32)len;
+        return XXH_OK;
+    }
+
+    if (state->memsize)   /* some data left from previous update */
+    {
+        XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, 32-state->memsize);
+        {
+            const U64* p64 = state->mem64;
+            state->v1 += XXH_readLE64(p64, endian) * PRIME64_2;
+            state->v1 = XXH_rotl64(state->v1, 31);
+            state->v1 *= PRIME64_1;
+            p64++;
+            state->v2 += XXH_readLE64(p64, endian) * PRIME64_2;
+            state->v2 = XXH_rotl64(state->v2, 31);
+            state->v2 *= PRIME64_1;
+            p64++;
+            state->v3 += XXH_readLE64(p64, endian) * PRIME64_2;
+            state->v3 = XXH_rotl64(state->v3, 31);
+            state->v3 *= PRIME64_1;
+            p64++;
+            state->v4 += XXH_readLE64(p64, endian) * PRIME64_2;
+            state->v4 = XXH_rotl64(state->v4, 31);
+            state->v4 *= PRIME64_1;
+            p64++;
+        }
+        p += 32-state->memsize;
+        state->memsize = 0;
+    }
+
+    if (p+32 <= bEnd)
+    {
+        const BYTE* const limit = bEnd - 32;
+        U64 v1 = state->v1;
+        U64 v2 = state->v2;
+        U64 v3 = state->v3;
+        U64 v4 = state->v4;
+
+        do
+        {
+            v1 += XXH_readLE64(p, endian) * PRIME64_2;
+            v1 = XXH_rotl64(v1, 31);
+            v1 *= PRIME64_1;
+            p+=8;
+            v2 += XXH_readLE64(p, endian) * PRIME64_2;
+            v2 = XXH_rotl64(v2, 31);
+            v2 *= PRIME64_1;
+            p+=8;
+            v3 += XXH_readLE64(p, endian) * PRIME64_2;
+            v3 = XXH_rotl64(v3, 31);
+            v3 *= PRIME64_1;
+            p+=8;
+            v4 += XXH_readLE64(p, endian) * PRIME64_2;
+            v4 = XXH_rotl64(v4, 31);
+            v4 *= PRIME64_1;
+            p+=8;
+        }
+        while (p<=limit);
+
+        state->v1 = v1;
+        state->v2 = v2;
+        state->v3 = v3;
+        state->v4 = v4;
+    }
+
+    if (p < bEnd)
+    {
+        XXH_memcpy(state->mem64, p, bEnd-p);
+        state->memsize = (int)(bEnd-p);
+    }
+
+    return XXH_OK;
 }
 
+XXH_errorcode XXH64_update (XXH64_state_t* state_in, const void* input, size_t len)
+{
+    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
 
-unsigned int XXH32_getIntermediateResult (void* state_in)
+    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+        return XXH64_update_endian(state_in, input, len, XXH_littleEndian);
+    else
+        return XXH64_update_endian(state_in, input, len, XXH_bigEndian);
+}
+
+
+
+FORCE_INLINE U64 XXH64_digest_endian (const XXH64_state_t* state_in, XXH_endianess endian)
 {
-	struct XXH_state32_t * state = state_in;
-	unsigned char * p   = (unsigned char*)state->memory;
-	unsigned char* bEnd = (unsigned char*)state->memory + state->memsize;
-	unsigned int h32;
-
-
-	if (state->total_len >= 16)
-	{
-		h32 = XXH_rotl32(state->v1, 1) + XXH_rotl32(state->v2, 7) + XXH_rotl32(state->v3, 12) + XXH_rotl32(state->v4, 18);
-	}
-	else
-	{
-		h32  = state->seed + PRIME32_5;
-	}
-
-	h32 += (unsigned int) state->total_len;
-	
-	while (p<=bEnd-4)
-	{
-		h32 += XXH_LE32(p) * PRIME32_3;
-		h32 = XXH_rotl32(h32, 17) * PRIME32_4 ;
-		p+=4;
-	}
-
-	while (p<bEnd)
-	{
-		h32 += (*p) * PRIME32_5;
-		h32 = XXH_rotl32(h32, 11) * PRIME32_1 ;
-		p++;
-	}
-
-	h32 ^= h32 >> 15;
-	h32 *= PRIME32_2;
-	h32 ^= h32 >> 13;
-	h32 *= PRIME32_3;
-	h32 ^= h32 >> 16;
-
-	return h32;
+    const XXH_istate64_t * state = (const XXH_istate64_t *) state_in;
+    const BYTE * p = (const BYTE*)state->mem64;
+    const BYTE* bEnd = (const BYTE*)state->mem64 + state->memsize;
+    U64 h64;
+
+    if (state->total_len >= 32)
+    {
+        U64 v1 = state->v1;
+        U64 v2 = state->v2;
+        U64 v3 = state->v3;
+        U64 v4 = state->v4;
+
+        h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
+
+        v1 *= PRIME64_2;
+        v1 = XXH_rotl64(v1, 31);
+        v1 *= PRIME64_1;
+        h64 ^= v1;
+        h64 = h64*PRIME64_1 + PRIME64_4;
+
+        v2 *= PRIME64_2;
+        v2 = XXH_rotl64(v2, 31);
+        v2 *= PRIME64_1;
+        h64 ^= v2;
+        h64 = h64*PRIME64_1 + PRIME64_4;
+
+        v3 *= PRIME64_2;
+        v3 = XXH_rotl64(v3, 31);
+        v3 *= PRIME64_1;
+        h64 ^= v3;
+        h64 = h64*PRIME64_1 + PRIME64_4;
+
+        v4 *= PRIME64_2;
+        v4 = XXH_rotl64(v4, 31);
+        v4 *= PRIME64_1;
+        h64 ^= v4;
+        h64 = h64*PRIME64_1 + PRIME64_4;
+    }
+    else
+    {
+        h64  = state->seed + PRIME64_5;
+    }
+
+    h64 += (U64) state->total_len;
+
+    while (p+8<=bEnd)
+    {
+        U64 k1 = XXH_readLE64(p, endian);
+        k1 *= PRIME64_2;
+        k1 = XXH_rotl64(k1,31);
+        k1 *= PRIME64_1;
+        h64 ^= k1;
+        h64 = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4;
+        p+=8;
+    }
+
+    if (p+4<=bEnd)
+    {
+        h64 ^= (U64)(XXH_readLE32(p, endian)) * PRIME64_1;
+        h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3;
+        p+=4;
+    }
+
+    while (p<bEnd)
+    {
+        h64 ^= (*p) * PRIME64_5;
+        h64 = XXH_rotl64(h64, 11) * PRIME64_1;
+        p++;
+    }
+
+    h64 ^= h64 >> 33;
+    h64 *= PRIME64_2;
+    h64 ^= h64 >> 29;
+    h64 *= PRIME64_3;
+    h64 ^= h64 >> 32;
+
+    return h64;
 }
 
 
-unsigned int XXH32_result (void* state_in)
+unsigned long long XXH64_digest (const XXH64_state_t* state_in)
 {
-    unsigned int h32 = XXH32_getIntermediateResult(state_in);
+    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+        return XXH64_digest_endian(state_in, XXH_littleEndian);
+    else
+        return XXH64_digest_endian(state_in, XXH_bigEndian);
+}
 
-	free(state_in);
 
-	return h32;
-}
\ No newline at end of file
diff --git a/xxhash.h b/xxhash.h
index a11e303..c60aa61 100644
--- a/xxhash.h
+++ b/xxhash.h
@@ -1,20 +1,21 @@
 /*
-   xxHash - Fast Hash algorithm
+   xxHash - Extremely Fast Hash algorithm
    Header File
-   Copyright (C) 2012, Yann Collet.
+   Copyright (C) 2012-2015, Yann Collet.
+
    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
-  
+
        * Redistributions of source code must retain the above copyright
    notice, this list of conditions and the following disclaimer.
        * Redistributions in binary form must reproduce the above
    copyright notice, this list of conditions and the following disclaimer
    in the documentation and/or other materials provided with the
    distribution.
-  
+
    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -27,8 +28,8 @@
    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-	You can contact the author at :
-	- xxHash source repository : http://code.google.com/p/xxhash/
+   You can contact the author at :
+   - xxHash source repository : https://github.com/Cyan4973/xxHash
 */
 
 /* Notice extracted from xxHash homepage :
@@ -52,9 +53,15 @@ CRC32           0.43 GB/s     9
 MD5-32          0.33 GB/s    10       Ronald L. Rivest
 SHA1-32         0.28 GB/s    10
 
-Q.Score is a measure of quality of the hash function. 
-It depends on successfully passing SMHasher test set. 
+Q.Score is a measure of quality of the hash function.
+It depends on successfully passing SMHasher test set.
 10 is a perfect score.
+
+A 64-bits version, named XXH64, is available since r35.
+It offers much better speed, but for 64-bits applications only.
+Name     Speed on 64 bits    Speed on 32 bits
+XXH64       13.8 GB/s            1.9 GB/s
+XXH32        6.8 GB/s            6.0 GB/s
 */
 
 #pragma once
@@ -64,65 +71,122 @@ extern "C" {
 #endif
 
 
-//****************************
-// Simple Hash Functions
-//****************************
+/*****************************
+*  Definitions
+*****************************/
+#include <stddef.h>   /* size_t */
+typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;
+
+
+/*****************************
+*  Namespace Emulation
+*****************************/
+/* Motivations :
+
+If you need to include xxHash into your library,
+but wish to avoid xxHash symbols to be present on your library interface
+in an effort to avoid potential name collision if another library also includes xxHash,
+
+you can use XXH_NAMESPACE, which will automatically prefix any symbol from xxHash
+with the value of XXH_NAMESPACE (so avoid to keep it NULL, and avoid numeric values).
+
+Note that no change is required within the calling program :
+it can still call xxHash functions using their regular name.
+They will be automatically translated by this header.
+*/
+#ifdef XXH_NAMESPACE
+#  define XXH_CAT(A,B) A##B
+#  define XXH_NAME2(A,B) XXH_CAT(A,B)
+#  define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32)
+#  define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64)
+#  define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState)
+#  define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState)
+#  define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState)
+#  define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState)
+#  define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset)
+#  define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset)
+#  define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update)
+#  define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update)
+#  define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest)
+#  define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest)
+#endif
+
+
+/*****************************
+*  Simple Hash Functions
+*****************************/
 
-unsigned int XXH32 (const void* input, int len, unsigned int seed);
+unsigned int       XXH32 (const void* input, size_t length, unsigned seed);
+unsigned long long XXH64 (const void* input, size_t length, unsigned long long seed);
 
 /*
 XXH32() :
-	Calculate the 32-bits hash of "input", of length "len"
-	"seed" can be used to alter the result
-	This function successfully passes all SMHasher tests.
-	Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s
-	Note that "len" is type "int", which means it is limited to 2^31-1.
-	If your data is larger, use the advanced functions below.
+    Calculate the 32-bits hash of sequence "length" bytes stored at memory address "input".
+    The memory between input & input+length must be valid (allocated and read-accessible).
+    "seed" can be used to alter the result predictably.
+    This function successfully passes all SMHasher tests.
+    Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s
+XXH64() :
+    Calculate the 64-bits hash of sequence of length "len" stored at memory address "input".
+    Faster on 64-bits systems. Slower on 32-bits systems.
 */
 
 
 
-//****************************
-// Advanced Hash Functions
-//****************************
-
-void*        XXH32_init   (unsigned int seed);
-int          XXH32_feed   (void* state, const void* input, int len);
-unsigned int XXH32_result (void* state);
+/*****************************
+*  Advanced Hash Functions
+*****************************/
+typedef struct { long long ll[ 6]; } XXH32_state_t;
+typedef struct { long long ll[11]; } XXH64_state_t;
 
 /*
-These functions calculate the xxhash of an input provided in several small packets,
-as opposed to an input provided as a single block.
+These structures allow static allocation of XXH states.
+States must then be initialized using XXHnn_reset() before first use.
 
-You must start with :
-void* XXH32_init()
-The function returns a pointer which holds the state of calculation.
+If you prefer dynamic allocation, please refer to functions below.
+*/
 
-This pointer must be provided as "void* state" parameter for XXH32_feed().
-XXH32_feed() can be called as many times as necessary.
-The function returns an error code, with 0 meaning OK, and all other values meaning there is an error.
-Note that "len" is type "int", which means it is limited to 2^31-1. 
-If your data is larger, it is recommended
-to chunk your data into blocks of size 2^30 (1GB) to avoid any "int" overflow issue.
+XXH32_state_t* XXH32_createState(void);
+XXH_errorcode  XXH32_freeState(XXH32_state_t* statePtr);
 
-Finally, you can end the calculation anytime, by using XXH32_result().
-This function returns the final 32-bits hash.
-You must provide the same "void* state" parameter created by XXH32_init().
+XXH64_state_t* XXH64_createState(void);
+XXH_errorcode  XXH64_freeState(XXH64_state_t* statePtr);
 
-Memory will be freed by XXH32_result().
+/*
+These functions create and release memory for XXH state.
+States must then be initialized using XXHnn_reset() before first use.
 */
 
 
-unsigned int XXH32_getIntermediateResult (void* state);
+XXH_errorcode XXH32_reset  (XXH32_state_t* statePtr, unsigned seed);
+XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length);
+unsigned int  XXH32_digest (const XXH32_state_t* statePtr);
+
+XXH_errorcode      XXH64_reset  (XXH64_state_t* statePtr, unsigned long long seed);
+XXH_errorcode      XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length);
+unsigned long long XXH64_digest (const XXH64_state_t* statePtr);
+
 /*
-This function does the same as XXH32_result(), generating a 32-bit hash,
-but preserve memory context.
-This way, it becomes possible to generate intermediate hashes, and then continue feeding data with XXH32_feed().
-To free memory context, use XXH32_result().
-*/
+These functions calculate the xxHash of an input provided in multiple smaller packets,
+as opposed to an input provided as a single block.
 
+XXH state space must first be allocated, using either static or dynamic method provided above.
+
+Start a new hash by initializing state with a seed, using XXHnn_reset().
+
+Then, feed the hash state by calling XXHnn_update() as many times as necessary.
+Obviously, input must be valid, meaning allocated and read accessible.
+The function returns an error code, with 0 meaning OK, and any other value meaning there is an error.
+
+Finally, you can produce a hash anytime, by using XXHnn_digest().
+This function returns the final nn-bits hash.
+You can nonetheless continue feeding the hash state with more input,
+and therefore get some new hashes, by calling again XXHnn_digest().
+
+When you are done, don't forget to free XXH state space, using typically XXHnn_freeState().
+*/
 
 
 #if defined (__cplusplus)
 }
-#endif
\ No newline at end of file
+#endif

From d2ed2c63829df6bd102d94025b9265cce3c65d10 Mon Sep 17 00:00:00 2001
From: Nir Heimann <nir@gobeepit.com>
Date: Mon, 21 Sep 2015 11:51:16 +0300
Subject: [PATCH 11/21] Adding more information to the README.md file

---
 README.md | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/README.md b/README.md
index 6375612..a76362d 100644
--- a/README.md
+++ b/README.md
@@ -5,12 +5,24 @@ PHP extension to add support for the [xxhash - r42](https://github.com/Cyan4973/
 
 ## How To Install
 
+Run
 ```
    phpize
    ./configure --enable-xxhash
    make
    sudo make install
+```  
+
+Add to the php.ini
+```  
+    extension=xxhash.so
+```
+
+Restart apache
+```
+    sudo service apache2 restart  
 ```
+  
 
 ## How To Use
 

From cdf2c7c9a5db92643ddb4eba48376573aa7e6aef Mon Sep 17 00:00:00 2001
From: Jerry Martinez <32611988+jmwebservices@users.noreply.github.com>
Date: Thu, 15 Feb 2018 08:43:13 -0500
Subject: [PATCH 12/21] Remove comments to fix configure.js

Comments prevent buildconf from correctly generating configure.js.
---
 config.w32 | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/config.w32 b/config.w32
index d2a2d46..f62e013 100644
--- a/config.w32
+++ b/config.w32
@@ -1,11 +1,7 @@
 // $Id$
 // vim:ft=javascript
 
-// If your extension references something external, use ARG_WITH
-// ARG_WITH("xxhash", "for xxhash support", "no");
-
-// Otherwise, use ARG_ENABLE
-// ARG_ENABLE("xxhash", "enable xxhash support", "no");
+ARG_ENABLE("xxhash", "enable xxhash support", "no");
 
 if (PHP_XXHASH != "no") {
 	EXTENSION("xxhash", "php_xxhash.c");

From a05c6e64db8bab942f480c80cafe70a95e3b62f5 Mon Sep 17 00:00:00 2001
From: Tawkir Ahmed Fakir <tawkir@eskimi.com>
Date: Fri, 27 Sep 2019 13:55:11 +0200
Subject: [PATCH 13/21] Remove long to string conversion

---
 php_xxhash.c | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/php_xxhash.c b/php_xxhash.c
index e7a8c08..c0f8c78 100644
--- a/php_xxhash.c
+++ b/php_xxhash.c
@@ -38,12 +38,9 @@ PHP_FUNCTION(xxhash32)
 
 	// compute the checksum
 	sum = XXH32(arg, arg_len, 0);
-
-	//convert to a hex string
-	strg = strpprintf(0, "%08x", sum);
-
-	// return the checksum
-	RETURN_STR(strg);
+	
+	/* return the checksum */
+	RETURN_LONG(sum);
 }
 
 PHP_FUNCTION(xxhash64)
@@ -60,11 +57,9 @@ PHP_FUNCTION(xxhash64)
 	// compute the checksum
 	sum = XXH64(arg, arg_len, 0);
 
-	//convert to a hex string
-	strg = strpprintf(0, "%08x%08x", (U32)(sum >> 32), (U32)sum);
-
-	// return the checksum
-	RETURN_STR(strg);
+	/* return the checksum */
+	/* Negative values can be returned since we cannot return  unsigned long to php */
+	RETURN_LONG(sum);
 }
 
 const zend_function_entry xxhash_functions[] = {

From ae1114bf5710dc662abc0f562a4016a95324ff31 Mon Sep 17 00:00:00 2001
From: Tawkir Ahmed Fakir <tawkir@eskimi.com>
Date: Fri, 27 Sep 2019 14:03:54 +0200
Subject: [PATCH 14/21] Update c files

---
 xxhash.c | 1573 ++++++++++++++++++++++++++++++------------------------
 xxhash.h |  508 +++++++++++++++---
 2 files changed, 1318 insertions(+), 763 deletions(-)

diff --git a/xxhash.c b/xxhash.c
index e6fb8f1..12cfd58 100644
--- a/xxhash.c
+++ b/xxhash.c
@@ -1,154 +1,256 @@
 /*
-xxHash - Fast Hash algorithm
-Copyright (C) 2012-2015, Yann Collet
-
-BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-
-* Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-* Redistributions in binary form must reproduce the above
-copyright notice, this list of conditions and the following disclaimer
-in the documentation and/or other materials provided with the
-distribution.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-You can contact the author at :
-- xxHash source repository : https://github.com/Cyan4973/xxHash
+*  xxHash - Fast Hash algorithm
+*  Copyright (C) 2012-2016, Yann Collet
+*
+*  BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+*
+*  Redistribution and use in source and binary forms, with or without
+*  modification, are permitted provided that the following conditions are
+*  met:
+*
+*  * Redistributions of source code must retain the above copyright
+*  notice, this list of conditions and the following disclaimer.
+*  * Redistributions in binary form must reproduce the above
+*  copyright notice, this list of conditions and the following disclaimer
+*  in the documentation and/or other materials provided with the
+*  distribution.
+*
+*  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+*  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+*  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+*  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+*  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+*  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+*  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+*  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+*  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+*  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+*  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*  You can contact the author at :
+*  - xxHash homepage: http://www.xxhash.com
+*  - xxHash source repository : https://github.com/Cyan4973/xxHash
 */
 
 
-/**************************************
+/* *************************************
 *  Tuning parameters
-**************************************/
-/* Unaligned memory access is automatically enabled for "common" CPU, such as x86.
- * For others CPU, the compiler will be more cautious, and insert extra code to ensure aligned access is respected.
- * If you know your target CPU supports unaligned memory access, you want to force this option manually to improve performance.
- * You can also enable this parameter if you know your input data will always be aligned (boundaries of 4, for U32).
+***************************************/
+/*!XXH_FORCE_MEMORY_ACCESS :
+ * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable.
+ * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal.
+ * The below switch allow to select different access method for improved performance.
+ * Method 0 (default) : use `memcpy()`. Safe and portable.
+ * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable).
+ *            This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`.
+ * Method 2 : direct access. This method doesn't depend on compiler but violate C standard.
+ *            It can generate buggy code on targets which do not support unaligned memory accesses.
+ *            But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6)
+ * See http://stackoverflow.com/a/32095106/646947 for details.
+ * Prefer these methods in priority order (0 > 1 > 2)
  */
-#if defined(__ARM_FEATURE_UNALIGNED) || defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)
-#  define XXH_USE_UNALIGNED_ACCESS 1
+#ifndef XXH_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
+#  if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) \
+                        || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) \
+                        || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) )
+#    define XXH_FORCE_MEMORY_ACCESS 2
+#  elif (defined(__INTEL_COMPILER) && !defined(_WIN32)) || \
+  (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) \
+                    || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) \
+                    || defined(__ARM_ARCH_7S__) ))
+#    define XXH_FORCE_MEMORY_ACCESS 1
+#  endif
 #endif
 
-/* XXH_ACCEPT_NULL_INPUT_POINTER :
- * If the input pointer is a null pointer, xxHash default behavior is to trigger a memory access error, since it is a bad pointer.
- * When this option is enabled, xxHash output for null input pointers will be the same as a null-length input.
- * By default, this option is disabled. To enable it, uncomment below define :
+/*!XXH_ACCEPT_NULL_INPUT_POINTER :
+ * If input pointer is NULL, xxHash default behavior is to dereference it, triggering a segfault.
+ * When this macro is enabled, xxHash actively checks input for null pointer.
+ * It it is, result for null input pointers is the same as a null-length input.
  */
-/* #define XXH_ACCEPT_NULL_INPUT_POINTER 1 */
-
-/* XXH_FORCE_NATIVE_FORMAT :
- * By default, xxHash library provides endian-independant Hash values, based on little-endian convention.
- * Results are therefore identical for little-endian and big-endian CPU.
- * This comes at a performance cost for big-endian CPU, since some swapping is required to emulate little-endian format.
- * Should endian-independance be of no importance for your application, you may set the #define below to 1.
- * It will improve speed for Big-endian CPU.
- * This option has no impact on Little_Endian CPU.
+#ifndef XXH_ACCEPT_NULL_INPUT_POINTER   /* can be defined externally */
+#  define XXH_ACCEPT_NULL_INPUT_POINTER 0
+#endif
+
+/*!XXH_FORCE_ALIGN_CHECK :
+ * This is a minor performance trick, only useful with lots of very small keys.
+ * It means : check for aligned/unaligned input.
+ * The check costs one initial branch per hash;
+ * set it to 0 when the input is guaranteed to be aligned,
+ * or when alignment doesn't matter for performance.
  */
-#define XXH_FORCE_NATIVE_FORMAT 0
+#ifndef XXH_FORCE_ALIGN_CHECK /* can be defined externally */
+#  if defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)
+#    define XXH_FORCE_ALIGN_CHECK 0
+#  else
+#    define XXH_FORCE_ALIGN_CHECK 1
+#  endif
+#endif
+
+/*!XXH_REROLL:
+ * Whether to reroll XXH32_finalize, and XXH64_finalize,
+ * instead of using an unrolled jump table/if statement loop.
+ *
+ * This is automatically defined on -Os/-Oz on GCC and Clang. */
+#ifndef XXH_REROLL
+#  if defined(__OPTIMIZE_SIZE__)
+#    define XXH_REROLL 1
+#  else
+#    define XXH_REROLL 0
+#  endif
+#endif
+
+/* *************************************
+*  Includes & Memory related functions
+***************************************/
+/*! Modify the local functions below should you wish to use some other memory routines
+*   for malloc(), free() */
+#include <stdlib.h>
+static void* XXH_malloc(size_t s) { return malloc(s); }
+static void  XXH_free  (void* p)  { free(p); }
+/*! and for memcpy() */
+#include <string.h>
+static void* XXH_memcpy(void* dest, const void* src, size_t size) { return memcpy(dest,src,size); }
+
+#include <limits.h>   /* ULLONG_MAX */
+
+#define XXH_STATIC_LINKING_ONLY
+#include "xxhash.h"
 
 
-/**************************************
+/* *************************************
 *  Compiler Specific Options
 ***************************************/
 #ifdef _MSC_VER    /* Visual Studio */
 #  pragma warning(disable : 4127)      /* disable: C4127: conditional expression is constant */
-#  define FORCE_INLINE static __forceinline
+#  define XXH_FORCE_INLINE static __forceinline
+#  define XXH_NO_INLINE static __declspec(noinline)
 #else
-#  if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
+#  if defined (__cplusplus) || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
 #    ifdef __GNUC__
-#      define FORCE_INLINE static inline __attribute__((always_inline))
+#      define XXH_FORCE_INLINE static inline __attribute__((always_inline))
+#      define XXH_NO_INLINE static __attribute__((noinline))
 #    else
-#      define FORCE_INLINE static inline
+#      define XXH_FORCE_INLINE static inline
+#      define XXH_NO_INLINE static
 #    endif
 #  else
-#    define FORCE_INLINE static
+#    define XXH_FORCE_INLINE static
+#    define XXH_NO_INLINE static
 #  endif /* __STDC_VERSION__ */
 #endif
 
 
-/**************************************
-*  Includes & Memory related functions
+
+/* *************************************
+*  Debug
 ***************************************/
-#include "xxhash.h"
-/* Modify the local functions below should you wish to use some other memory routines */
-/* for malloc(), free() */
-#include <stdlib.h>
-static void* XXH_malloc(size_t s) { return malloc(s); }
-static void  XXH_free  (void* p)  { free(p); }
-/* for memcpy() */
-#include <string.h>
-static void* XXH_memcpy(void* dest, const void* src, size_t size) { return memcpy(dest,src,size); }
+/* DEBUGLEVEL is expected to be defined externally,
+ * typically through compiler command line.
+ * Value must be a number. */
+#ifndef DEBUGLEVEL
+#  define DEBUGLEVEL 0
+#endif
 
+#if (DEBUGLEVEL>=1)
+#  include <assert.h>   /* note : can still be disabled with NDEBUG */
+#  define XXH_ASSERT(c)   assert(c)
+#else
+#  define XXH_ASSERT(c)   ((void)0)
+#endif
 
-/**************************************
+/* note : use after variable declarations */
+#define XXH_STATIC_ASSERT(c)  { enum { XXH_sa = 1/(int)(!!(c)) }; }
+
+
+/* *************************************
 *  Basic Types
 ***************************************/
-#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
-# include <stdint.h>
-  typedef uint8_t  BYTE;
-  typedef uint16_t U16;
-  typedef uint32_t U32;
-  typedef  int32_t S32;
-  typedef uint64_t U64;
-#else
-  typedef unsigned char      BYTE;
-  typedef unsigned short     U16;
-  typedef unsigned int       U32;
-  typedef   signed int       S32;
-  typedef unsigned long long U64;
+#ifndef MEM_MODULE
+# if !defined (__VMS) \
+  && (defined (__cplusplus) \
+  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+#   include <stdint.h>
+    typedef uint8_t  BYTE;
+    typedef uint16_t U16;
+    typedef uint32_t U32;
+# else
+    typedef unsigned char      BYTE;
+    typedef unsigned short     U16;
+    typedef unsigned int       U32;
+# endif
 #endif
 
+
+/* ===   Memory access   === */
+
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))
+
+/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */
+static U32 XXH_read32(const void* memPtr) { return *(const U32*) memPtr; }
+
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
+
+/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
+/* currently only defined for gcc and icc */
+typedef union { U32 u32; } __attribute__((packed)) unalign;
+static U32 XXH_read32(const void* ptr) { return ((const unalign*)ptr)->u32; }
+
+#else
+
+/* portable and safe solution. Generally efficient.
+ * see : http://stackoverflow.com/a/32095106/646947
+ */
 static U32 XXH_read32(const void* memPtr)
 {
-    U32 val32;
-    memcpy(&val32, memPtr, 4);
-    return val32;
+    U32 val;
+    memcpy(&val, memPtr, sizeof(val));
+    return val;
 }
 
-static U64 XXH_read64(const void* memPtr)
+#endif   /* XXH_FORCE_DIRECT_MEMORY_ACCESS */
+
+
+/* ===   Endianess   === */
+typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } XXH_endianess;
+
+/* XXH_CPU_LITTLE_ENDIAN can be defined externally, for example on the compiler command line */
+#ifndef XXH_CPU_LITTLE_ENDIAN
+static int XXH_isLittleEndian(void)
 {
-    U64 val64;
-    memcpy(&val64, memPtr, 8);
-    return val64;
+    const union { U32 u; BYTE c[4]; } one = { 1 };   /* don't use static : performance detrimental  */
+    return one.c[0];
 }
+#   define XXH_CPU_LITTLE_ENDIAN   XXH_isLittleEndian()
+#endif
+
 
 
 
-/******************************************
+/* ****************************************
 *  Compiler-specific Functions and Macros
 ******************************************/
-#define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
+#define XXH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
+
+#ifndef __has_builtin
+#  define __has_builtin(x) 0
+#endif
 
+#if !defined(NO_CLANG_BUILTIN) && __has_builtin(__builtin_rotateleft32) && __has_builtin(__builtin_rotateleft64)
+#  define XXH_rotl32 __builtin_rotateleft32
+#  define XXH_rotl64 __builtin_rotateleft64
 /* Note : although _rotl exists for minGW (GCC under windows), performance seems poor */
-#if defined(_MSC_VER)
+#elif defined(_MSC_VER)
 #  define XXH_rotl32(x,r) _rotl(x,r)
 #  define XXH_rotl64(x,r) _rotl64(x,r)
 #else
-#  define XXH_rotl32(x,r) ((x << r) | (x >> (32 - r)))
-#  define XXH_rotl64(x,r) ((x << r) | (x >> (64 - r)))
+#  define XXH_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r))))
+#  define XXH_rotl64(x,r) (((x) << (r)) | ((x) >> (64 - (r))))
 #endif
 
 #if defined(_MSC_VER)     /* Visual Studio */
 #  define XXH_swap32 _byteswap_ulong
-#  define XXH_swap64 _byteswap_uint64
-#elif GCC_VERSION >= 403
+#elif XXH_GCC_VERSION >= 403
 #  define XXH_swap32 __builtin_bswap32
-#  define XXH_swap64 __builtin_bswap64
 #else
 static U32 XXH_swap32 (U32 x)
 {
@@ -157,165 +259,227 @@ static U32 XXH_swap32 (U32 x)
             ((x >>  8) & 0x0000ff00 ) |
             ((x >> 24) & 0x000000ff );
 }
-static U64 XXH_swap64 (U64 x)
-{
-    return  ((x << 56) & 0xff00000000000000ULL) |
-            ((x << 40) & 0x00ff000000000000ULL) |
-            ((x << 24) & 0x0000ff0000000000ULL) |
-            ((x << 8)  & 0x000000ff00000000ULL) |
-            ((x >> 8)  & 0x00000000ff000000ULL) |
-            ((x >> 24) & 0x0000000000ff0000ULL) |
-            ((x >> 40) & 0x000000000000ff00ULL) |
-            ((x >> 56) & 0x00000000000000ffULL);
-}
 #endif
 
 
-/***************************************
-*  Architecture Macros
-***************************************/
-typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } XXH_endianess;
-#ifndef XXH_CPU_LITTLE_ENDIAN   /* XXH_CPU_LITTLE_ENDIAN can be defined externally, for example using a compiler switch */
-static const int one = 1;
-#   define XXH_CPU_LITTLE_ENDIAN   (*(const char*)(&one))
-#endif
-
-
-/*****************************
+/* ***************************
 *  Memory reads
 *****************************/
 typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment;
 
-FORCE_INLINE U32 XXH_readLE32_align(const void* ptr, XXH_endianess endian, XXH_alignment align)
+XXH_FORCE_INLINE U32 XXH_readLE32(const void* ptr)
 {
-    if (align==XXH_unaligned)
-        return endian==XXH_littleEndian ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr));
-    else
-        return endian==XXH_littleEndian ? *(const U32*)ptr : XXH_swap32(*(const U32*)ptr);
+    return XXH_CPU_LITTLE_ENDIAN ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr));
 }
 
-FORCE_INLINE U32 XXH_readLE32(const void* ptr, XXH_endianess endian)
+static U32 XXH_readBE32(const void* ptr)
 {
-    return XXH_readLE32_align(ptr, endian, XXH_unaligned);
+    return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr);
 }
 
-FORCE_INLINE U64 XXH_readLE64_align(const void* ptr, XXH_endianess endian, XXH_alignment align)
+XXH_FORCE_INLINE U32
+XXH_readLE32_align(const void* ptr, XXH_alignment align)
 {
-    if (align==XXH_unaligned)
-        return endian==XXH_littleEndian ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr));
-    else
-        return endian==XXH_littleEndian ? *(const U64*)ptr : XXH_swap64(*(const U64*)ptr);
-}
-
-FORCE_INLINE U64 XXH_readLE64(const void* ptr, XXH_endianess endian)
-{
-    return XXH_readLE64_align(ptr, endian, XXH_unaligned);
+    if (align==XXH_unaligned) {
+        return XXH_readLE32(ptr);
+    } else {
+        return XXH_CPU_LITTLE_ENDIAN ? *(const U32*)ptr : XXH_swap32(*(const U32*)ptr);
+    }
 }
 
 
-/***************************************
-*  Macros
+/* *************************************
+*  Misc
 ***************************************/
-#define XXH_STATIC_ASSERT(c)   { enum { XXH_static_assert = 1/(!!(c)) }; }    /* use only *after* variable declarations */
+XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; }
 
 
-/***************************************
-*  Constants
-***************************************/
-#define PRIME32_1   2654435761U
-#define PRIME32_2   2246822519U
-#define PRIME32_3   3266489917U
-#define PRIME32_4    668265263U
-#define PRIME32_5    374761393U
+/* *******************************************************************
+*  32-bit hash functions
+*********************************************************************/
+static const U32 PRIME32_1 = 0x9E3779B1U;   /* 0b10011110001101110111100110110001 */
+static const U32 PRIME32_2 = 0x85EBCA77U;   /* 0b10000101111010111100101001110111 */
+static const U32 PRIME32_3 = 0xC2B2AE3DU;   /* 0b11000010101100101010111000111101 */
+static const U32 PRIME32_4 = 0x27D4EB2FU;   /* 0b00100111110101001110101100101111 */
+static const U32 PRIME32_5 = 0x165667B1U;   /* 0b00010110010101100110011110110001 */
+
+static U32 XXH32_round(U32 acc, U32 input)
+{
+    acc += input * PRIME32_2;
+    acc  = XXH_rotl32(acc, 13);
+    acc *= PRIME32_1;
+#if defined(__GNUC__) && defined(__SSE4_1__) && !defined(XXH_ENABLE_AUTOVECTORIZE)
+    /* UGLY HACK:
+     * This inline assembly hack forces acc into a normal register. This is the
+     * only thing that prevents GCC and Clang from autovectorizing the XXH32 loop
+     * (pragmas and attributes don't work for some resason) without globally
+     * disabling SSE4.1.
+     *
+     * The reason we want to avoid vectorization is because despite working on
+     * 4 integers at a time, there are multiple factors slowing XXH32 down on
+     * SSE4:
+     * - There's a ridiculous amount of lag from pmulld (10 cycles of latency on newer chips!)
+     *   making it slightly slower to multiply four integers at once compared to four
+     *   integers independently. Even when pmulld was fastest, Sandy/Ivy Bridge, it is
+     *   still not worth it to go into SSE just to multiply unless doing a long operation.
+     *
+     * - Four instructions are required to rotate,
+     *      movqda tmp,  v // not required with VEX encoding
+     *      pslld  tmp, 13 // tmp <<= 13
+     *      psrld  v,   19 // x >>= 19
+     *      por    v,  tmp // x |= tmp
+     *   compared to one for scalar:
+     *      roll   v, 13    // reliably fast across the board
+     *      shldl  v, v, 13 // Sandy Bridge and later prefer this for some reason
+     *
+     * - Instruction level parallelism is actually more beneficial here because the
+     *   SIMD actually serializes this operation: While v1 is rotating, v2 can load data,
+     *   while v3 can multiply. SSE forces them to operate together.
+     *
+     * How this hack works:
+     * __asm__(""       // Declare an assembly block but don't declare any instructions
+     *          :       // However, as an Input/Output Operand,
+     *          "+r"    // constrain a read/write operand (+) as a general purpose register (r).
+     *          (acc)   // and set acc as the operand
+     * );
+     *
+     * Because of the 'r', the compiler has promised that seed will be in a
+     * general purpose register and the '+' says that it will be 'read/write',
+     * so it has to assume it has changed. It is like volatile without all the
+     * loads and stores.
+     *
+     * Since the argument has to be in a normal register (not an SSE register),
+     * each time XXH32_round is called, it is impossible to vectorize. */
+    __asm__("" : "+r" (acc));
+#endif
+    return acc;
+}
+
+/* mix all bits */
+static U32 XXH32_avalanche(U32 h32)
+{
+    h32 ^= h32 >> 15;
+    h32 *= PRIME32_2;
+    h32 ^= h32 >> 13;
+    h32 *= PRIME32_3;
+    h32 ^= h32 >> 16;
+    return(h32);
+}
 
-#define PRIME64_1 11400714785074694791ULL
-#define PRIME64_2 14029467366897019727ULL
-#define PRIME64_3  1609587929392839161ULL
-#define PRIME64_4  9650029242287828579ULL
-#define PRIME64_5  2870177450012600261ULL
+#define XXH_get32bits(p) XXH_readLE32_align(p, align)
 
+static U32
+XXH32_finalize(U32 h32, const void* ptr, size_t len, XXH_alignment align)
+{
+    const BYTE* p = (const BYTE*)ptr;
+
+#define PROCESS1               \
+    h32 += (*p++) * PRIME32_5; \
+    h32 = XXH_rotl32(h32, 11) * PRIME32_1 ;
+
+#define PROCESS4                         \
+    h32 += XXH_get32bits(p) * PRIME32_3; \
+    p+=4;                                \
+    h32  = XXH_rotl32(h32, 17) * PRIME32_4 ;
+
+    /* Compact rerolled version */
+    if (XXH_REROLL) {
+        len &= 15;
+        while (len >= 4) {
+            PROCESS4;
+            len -= 4;
+        }
+        while (len > 0) {
+            PROCESS1;
+            --len;
+        }
+        return XXH32_avalanche(h32);
+    } else {
+         switch(len&15) /* or switch(bEnd - p) */ {
+           case 12:      PROCESS4;
+                         /* fallthrough */
+           case 8:       PROCESS4;
+                         /* fallthrough */
+           case 4:       PROCESS4;
+                         return XXH32_avalanche(h32);
+
+           case 13:      PROCESS4;
+                         /* fallthrough */
+           case 9:       PROCESS4;
+                         /* fallthrough */
+           case 5:       PROCESS4;
+                         PROCESS1;
+                         return XXH32_avalanche(h32);
+
+           case 14:      PROCESS4;
+                         /* fallthrough */
+           case 10:      PROCESS4;
+                         /* fallthrough */
+           case 6:       PROCESS4;
+                         PROCESS1;
+                         PROCESS1;
+                         return XXH32_avalanche(h32);
+
+           case 15:      PROCESS4;
+                         /* fallthrough */
+           case 11:      PROCESS4;
+                         /* fallthrough */
+           case 7:       PROCESS4;
+                         /* fallthrough */
+           case 3:       PROCESS1;
+                         /* fallthrough */
+           case 2:       PROCESS1;
+                         /* fallthrough */
+           case 1:       PROCESS1;
+                         /* fallthrough */
+           case 0:       return XXH32_avalanche(h32);
+        }
+        XXH_ASSERT(0);
+        return h32;   /* reaching this point is deemed impossible */
+    }
+}
 
-/*****************************
-*  Simple Hash Functions
-*****************************/
-FORCE_INLINE U32 XXH32_endian_align(const void* input, size_t len, U32 seed, XXH_endianess endian, XXH_alignment align)
+XXH_FORCE_INLINE U32
+XXH32_endian_align(const void* input, size_t len, U32 seed, XXH_alignment align)
 {
     const BYTE* p = (const BYTE*)input;
     const BYTE* bEnd = p + len;
     U32 h32;
-#define XXH_get32bits(p) XXH_readLE32_align(p, endian, align)
 
-#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
-    if (p==NULL)
-    {
+#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
+    if (p==NULL) {
         len=0;
         bEnd=p=(const BYTE*)(size_t)16;
     }
 #endif
 
-    if (len>=16)
-    {
-        const BYTE* const limit = bEnd - 16;
+    if (len>=16) {
+        const BYTE* const limit = bEnd - 15;
         U32 v1 = seed + PRIME32_1 + PRIME32_2;
         U32 v2 = seed + PRIME32_2;
         U32 v3 = seed + 0;
         U32 v4 = seed - PRIME32_1;
 
-        do
-        {
-            v1 += XXH_get32bits(p) * PRIME32_2;
-            v1 = XXH_rotl32(v1, 13);
-            v1 *= PRIME32_1;
-            p+=4;
-            v2 += XXH_get32bits(p) * PRIME32_2;
-            v2 = XXH_rotl32(v2, 13);
-            v2 *= PRIME32_1;
-            p+=4;
-            v3 += XXH_get32bits(p) * PRIME32_2;
-            v3 = XXH_rotl32(v3, 13);
-            v3 *= PRIME32_1;
-            p+=4;
-            v4 += XXH_get32bits(p) * PRIME32_2;
-            v4 = XXH_rotl32(v4, 13);
-            v4 *= PRIME32_1;
-            p+=4;
-        }
-        while (p<=limit);
+        do {
+            v1 = XXH32_round(v1, XXH_get32bits(p)); p+=4;
+            v2 = XXH32_round(v2, XXH_get32bits(p)); p+=4;
+            v3 = XXH32_round(v3, XXH_get32bits(p)); p+=4;
+            v4 = XXH32_round(v4, XXH_get32bits(p)); p+=4;
+        } while (p < limit);
 
-        h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
-    }
-    else
-    {
+        h32 = XXH_rotl32(v1, 1)  + XXH_rotl32(v2, 7)
+            + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
+    } else {
         h32  = seed + PRIME32_5;
     }
 
-    h32 += (U32) len;
+    h32 += (U32)len;
 
-    while (p+4<=bEnd)
-    {
-        h32 += XXH_get32bits(p) * PRIME32_3;
-        h32  = XXH_rotl32(h32, 17) * PRIME32_4 ;
-        p+=4;
-    }
-
-    while (p<bEnd)
-    {
-        h32 += (*p) * PRIME32_5;
-        h32 = XXH_rotl32(h32, 11) * PRIME32_1 ;
-        p++;
-    }
-
-    h32 ^= h32 >> 15;
-    h32 *= PRIME32_2;
-    h32 ^= h32 >> 13;
-    h32 *= PRIME32_3;
-    h32 ^= h32 >> 16;
-
-    return h32;
+    return XXH32_finalize(h32, p, len&15, align);
 }
 
 
-unsigned XXH32 (const void* input, size_t len, unsigned seed)
+XXH_PUBLIC_API unsigned int XXH32 (const void* input, size_t len, unsigned int seed)
 {
 #if 0
     /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
@@ -323,593 +487,632 @@ unsigned XXH32 (const void* input, size_t len, unsigned seed)
     XXH32_reset(&state, seed);
     XXH32_update(&state, input, len);
     return XXH32_digest(&state);
+
 #else
-    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
-
-#  if !defined(XXH_USE_UNALIGNED_ACCESS)
-    if ((((size_t)input) & 3) == 0)   /* Input is 4-bytes aligned, leverage the speed benefit */
-    {
-        if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
-            return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned);
-        else
-            return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned);
-    }
-#  endif
 
-    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
-        return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned);
-    else
-        return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned);
+    if (XXH_FORCE_ALIGN_CHECK) {
+        if ((((size_t)input) & 3) == 0) {   /* Input is 4-bytes aligned, leverage the speed benefit */
+            return XXH32_endian_align(input, len, seed, XXH_aligned);
+    }   }
+
+    return XXH32_endian_align(input, len, seed, XXH_unaligned);
 #endif
 }
 
-FORCE_INLINE U64 XXH64_endian_align(const void* input, size_t len, U64 seed, XXH_endianess endian, XXH_alignment align)
+
+
+/*======   Hash streaming   ======*/
+
+XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void)
 {
-    const BYTE* p = (const BYTE*)input;
-    const BYTE* bEnd = p + len;
-    U64 h64;
-#define XXH_get64bits(p) XXH_readLE64_align(p, endian, align)
+    return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t));
+}
+XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr)
+{
+    XXH_free(statePtr);
+    return XXH_OK;
+}
 
-#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
-    if (p==NULL)
-    {
-        len=0;
-        bEnd=p=(const BYTE*)(size_t)32;
-    }
+XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dstState, const XXH32_state_t* srcState)
+{
+    memcpy(dstState, srcState, sizeof(*dstState));
+}
+
+XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, unsigned int seed)
+{
+    XXH32_state_t state;   /* using a local state to memcpy() in order to avoid strict-aliasing warnings */
+    memset(&state, 0, sizeof(state));
+    state.v1 = seed + PRIME32_1 + PRIME32_2;
+    state.v2 = seed + PRIME32_2;
+    state.v3 = seed + 0;
+    state.v4 = seed - PRIME32_1;
+    /* do not write into reserved, planned to be removed in a future version */
+    memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved));
+    return XXH_OK;
+}
+
+
+XXH_PUBLIC_API XXH_errorcode
+XXH32_update(XXH32_state_t* state, const void* input, size_t len)
+{
+    if (input==NULL)
+#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
+        return XXH_OK;
+#else
+        return XXH_ERROR;
 #endif
 
-    if (len>=32)
-    {
-        const BYTE* const limit = bEnd - 32;
-        U64 v1 = seed + PRIME64_1 + PRIME64_2;
-        U64 v2 = seed + PRIME64_2;
-        U64 v3 = seed + 0;
-        U64 v4 = seed - PRIME64_1;
+    {   const BYTE* p = (const BYTE*)input;
+        const BYTE* const bEnd = p + len;
 
-        do
-        {
-            v1 += XXH_get64bits(p) * PRIME64_2;
-            p+=8;
-            v1 = XXH_rotl64(v1, 31);
-            v1 *= PRIME64_1;
-            v2 += XXH_get64bits(p) * PRIME64_2;
-            p+=8;
-            v2 = XXH_rotl64(v2, 31);
-            v2 *= PRIME64_1;
-            v3 += XXH_get64bits(p) * PRIME64_2;
-            p+=8;
-            v3 = XXH_rotl64(v3, 31);
-            v3 *= PRIME64_1;
-            v4 += XXH_get64bits(p) * PRIME64_2;
-            p+=8;
-            v4 = XXH_rotl64(v4, 31);
-            v4 *= PRIME64_1;
+        state->total_len_32 += (XXH32_hash_t)len;
+        state->large_len |= (XXH32_hash_t)((len>=16) | (state->total_len_32>=16));
+
+        if (state->memsize + len < 16)  {   /* fill in tmp buffer */
+            XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, len);
+            state->memsize += (XXH32_hash_t)len;
+            return XXH_OK;
         }
-        while (p<=limit);
 
-        h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
+        if (state->memsize) {   /* some data left from previous update */
+            XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, 16-state->memsize);
+            {   const U32* p32 = state->mem32;
+                state->v1 = XXH32_round(state->v1, XXH_readLE32(p32)); p32++;
+                state->v2 = XXH32_round(state->v2, XXH_readLE32(p32)); p32++;
+                state->v3 = XXH32_round(state->v3, XXH_readLE32(p32)); p32++;
+                state->v4 = XXH32_round(state->v4, XXH_readLE32(p32));
+            }
+            p += 16-state->memsize;
+            state->memsize = 0;
+        }
 
-        v1 *= PRIME64_2;
-        v1 = XXH_rotl64(v1, 31);
-        v1 *= PRIME64_1;
-        h64 ^= v1;
-        h64 = h64 * PRIME64_1 + PRIME64_4;
-
-        v2 *= PRIME64_2;
-        v2 = XXH_rotl64(v2, 31);
-        v2 *= PRIME64_1;
-        h64 ^= v2;
-        h64 = h64 * PRIME64_1 + PRIME64_4;
-
-        v3 *= PRIME64_2;
-        v3 = XXH_rotl64(v3, 31);
-        v3 *= PRIME64_1;
-        h64 ^= v3;
-        h64 = h64 * PRIME64_1 + PRIME64_4;
-
-        v4 *= PRIME64_2;
-        v4 = XXH_rotl64(v4, 31);
-        v4 *= PRIME64_1;
-        h64 ^= v4;
-        h64 = h64 * PRIME64_1 + PRIME64_4;
-    }
-    else
-    {
-        h64  = seed + PRIME64_5;
+        if (p <= bEnd-16) {
+            const BYTE* const limit = bEnd - 16;
+            U32 v1 = state->v1;
+            U32 v2 = state->v2;
+            U32 v3 = state->v3;
+            U32 v4 = state->v4;
+
+            do {
+                v1 = XXH32_round(v1, XXH_readLE32(p)); p+=4;
+                v2 = XXH32_round(v2, XXH_readLE32(p)); p+=4;
+                v3 = XXH32_round(v3, XXH_readLE32(p)); p+=4;
+                v4 = XXH32_round(v4, XXH_readLE32(p)); p+=4;
+            } while (p<=limit);
+
+            state->v1 = v1;
+            state->v2 = v2;
+            state->v3 = v3;
+            state->v4 = v4;
+        }
+
+        if (p < bEnd) {
+            XXH_memcpy(state->mem32, p, (size_t)(bEnd-p));
+            state->memsize = (unsigned)(bEnd-p);
+        }
     }
 
-    h64 += (U64) len;
+    return XXH_OK;
+}
 
-    while (p+8<=bEnd)
-    {
-        U64 k1 = XXH_get64bits(p);
-        k1 *= PRIME64_2;
-        k1 = XXH_rotl64(k1,31);
-        k1 *= PRIME64_1;
-        h64 ^= k1;
-        h64 = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4;
-        p+=8;
-    }
 
-    if (p+4<=bEnd)
-    {
-        h64 ^= (U64)(XXH_get32bits(p)) * PRIME64_1;
-        h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3;
-        p+=4;
-    }
+XXH_PUBLIC_API unsigned int XXH32_digest (const XXH32_state_t* state)
+{
+    U32 h32;
 
-    while (p<bEnd)
-    {
-        h64 ^= (*p) * PRIME64_5;
-        h64 = XXH_rotl64(h64, 11) * PRIME64_1;
-        p++;
+    if (state->large_len) {
+        h32 = XXH_rotl32(state->v1, 1)
+            + XXH_rotl32(state->v2, 7)
+            + XXH_rotl32(state->v3, 12)
+            + XXH_rotl32(state->v4, 18);
+    } else {
+        h32 = state->v3 /* == seed */ + PRIME32_5;
     }
 
-    h64 ^= h64 >> 33;
-    h64 *= PRIME64_2;
-    h64 ^= h64 >> 29;
-    h64 *= PRIME64_3;
-    h64 ^= h64 >> 32;
+    h32 += state->total_len_32;
 
-    return h64;
+    return XXH32_finalize(h32, state->mem32, state->memsize, XXH_aligned);
 }
 
 
-unsigned long long XXH64 (const void* input, size_t len, unsigned long long seed)
+/*======   Canonical representation   ======*/
+
+/*! Default XXH result types are basic unsigned 32 and 64 bits.
+*   The canonical representation follows human-readable write convention, aka big-endian (large digits first).
+*   These functions allow transformation of hash result into and from its canonical format.
+*   This way, hash values can be written into a file or buffer, remaining comparable across different systems.
+*/
+
+XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash)
 {
-#if 0
-    /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
-    XXH64_state_t state;
-    XXH64_reset(&state, seed);
-    XXH64_update(&state, input, len);
-    return XXH64_digest(&state);
-#else
-    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
-
-#  if !defined(XXH_USE_UNALIGNED_ACCESS)
-    if ((((size_t)input) & 7)==0)   /* Input is aligned, let's leverage the speed advantage */
-    {
-        if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
-            return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned);
-        else
-            return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned);
-    }
-#  endif
+    XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t));
+    if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash);
+    memcpy(dst, &hash, sizeof(*dst));
+}
 
-    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
-        return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned);
-    else
-        return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned);
-#endif
+XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src)
+{
+    return XXH_readBE32(src);
 }
 
-/****************************************************
-*  Advanced Hash Functions
-****************************************************/
 
-/*** Allocation ***/
-typedef struct
-{
-    U64 total_len;
-    U32 seed;
-    U32 v1;
-    U32 v2;
-    U32 v3;
-    U32 v4;
-    U32 mem32[4];   /* defined as U32 for alignment */
-    U32 memsize;
-} XXH_istate32_t;
+#ifndef XXH_NO_LONG_LONG
+
+/* *******************************************************************
+*  64-bit hash functions
+*********************************************************************/
 
-typedef struct
+/*======   Memory access   ======*/
+
+#ifndef MEM_MODULE
+# define MEM_MODULE
+# if !defined (__VMS) \
+  && (defined (__cplusplus) \
+  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+#   include <stdint.h>
+    typedef uint64_t U64;
+# else
+    /* if compiler doesn't support unsigned long long, replace by another 64-bit type */
+    typedef unsigned long long U64;
+# endif
+#endif
+
+/*! XXH_REROLL_XXH64:
+ * Whether to reroll the XXH64_finalize() loop.
+ *
+ * Just like XXH32, we can unroll the XXH64_finalize() loop. This can be a performance gain
+ * on 64-bit hosts, as only one jump is required.
+ *
+ * However, on 32-bit hosts, because arithmetic needs to be done with two 32-bit registers,
+ * and 64-bit arithmetic needs to be simulated, it isn't beneficial to unroll. The code becomes
+ * ridiculously large (the largest function in the binary on i386!), and rerolling it saves
+ * anywhere from 3kB to 20kB. It is also slightly faster because it fits into cache better
+ * and is more likely to be inlined by the compiler.
+ *
+ * If XXH_REROLL is defined, this is ignored and the loop is always rerolled. */
+#ifndef XXH_REROLL_XXH64
+#  if (defined(__ILP32__) || defined(_ILP32)) /* ILP32 is often defined on 32-bit GCC family */ \
+   || !(defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64) /* x86-64 */ \
+     || defined(_M_ARM64) || defined(__aarch64__) || defined(__arm64__) /* aarch64 */ \
+     || defined(__PPC64__) || defined(__PPC64LE__) || defined(__ppc64__) || defined(__powerpc64__) /* ppc64 */ \
+     || defined(__mips64__) || defined(__mips64)) /* mips64 */ \
+   || (!defined(SIZE_MAX) || SIZE_MAX < ULLONG_MAX) /* check limits */
+#    define XXH_REROLL_XXH64 1
+#  else
+#    define XXH_REROLL_XXH64 0
+#  endif
+#endif /* !defined(XXH_REROLL_XXH64) */
+
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))
+
+/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */
+static U64 XXH_read64(const void* memPtr) { return *(const U64*) memPtr; }
+
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
+
+/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
+/* currently only defined for gcc and icc */
+typedef union { U32 u32; U64 u64; } __attribute__((packed)) unalign64;
+static U64 XXH_read64(const void* ptr) { return ((const unalign64*)ptr)->u64; }
+
+#else
+
+/* portable and safe solution. Generally efficient.
+ * see : http://stackoverflow.com/a/32095106/646947
+ */
+
+static U64 XXH_read64(const void* memPtr)
 {
-    U64 total_len;
-    U64 seed;
-    U64 v1;
-    U64 v2;
-    U64 v3;
-    U64 v4;
-    U64 mem64[4];   /* defined as U64 for alignment */
-    U32 memsize;
-} XXH_istate64_t;
+    U64 val;
+    memcpy(&val, memPtr, sizeof(val));
+    return val;
+}
 
+#endif   /* XXH_FORCE_DIRECT_MEMORY_ACCESS */
 
-XXH32_state_t* XXH32_createState(void)
+#if defined(_MSC_VER)     /* Visual Studio */
+#  define XXH_swap64 _byteswap_uint64
+#elif XXH_GCC_VERSION >= 403
+#  define XXH_swap64 __builtin_bswap64
+#else
+static U64 XXH_swap64 (U64 x)
 {
-    XXH_STATIC_ASSERT(sizeof(XXH32_state_t) >= sizeof(XXH_istate32_t));   /* A compilation error here means XXH32_state_t is not large enough */
-    return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t));
+    return  ((x << 56) & 0xff00000000000000ULL) |
+            ((x << 40) & 0x00ff000000000000ULL) |
+            ((x << 24) & 0x0000ff0000000000ULL) |
+            ((x << 8)  & 0x000000ff00000000ULL) |
+            ((x >> 8)  & 0x00000000ff000000ULL) |
+            ((x >> 24) & 0x0000000000ff0000ULL) |
+            ((x >> 40) & 0x000000000000ff00ULL) |
+            ((x >> 56) & 0x00000000000000ffULL);
 }
-XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr)
+#endif
+
+XXH_FORCE_INLINE U64 XXH_readLE64(const void* ptr)
 {
-    XXH_free(statePtr);
-    return XXH_OK;
+    return XXH_CPU_LITTLE_ENDIAN ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr));
 }
 
-XXH64_state_t* XXH64_createState(void)
+static U64 XXH_readBE64(const void* ptr)
 {
-    XXH_STATIC_ASSERT(sizeof(XXH64_state_t) >= sizeof(XXH_istate64_t));   /* A compilation error here means XXH64_state_t is not large enough */
-    return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t));
+    return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr);
 }
-XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr)
+
+XXH_FORCE_INLINE U64
+XXH_readLE64_align(const void* ptr, XXH_alignment align)
 {
-    XXH_free(statePtr);
-    return XXH_OK;
+    if (align==XXH_unaligned)
+        return XXH_readLE64(ptr);
+    else
+        return XXH_CPU_LITTLE_ENDIAN ? *(const U64*)ptr : XXH_swap64(*(const U64*)ptr);
 }
 
 
-/*** Hash feed ***/
+/*======   xxh64   ======*/
 
-XXH_errorcode XXH32_reset(XXH32_state_t* state_in, U32 seed)
+static const U64 PRIME64_1 = 0x9E3779B185EBCA87ULL;   /* 0b1001111000110111011110011011000110000101111010111100101010000111 */
+static const U64 PRIME64_2 = 0xC2B2AE3D27D4EB4FULL;   /* 0b1100001010110010101011100011110100100111110101001110101101001111 */
+static const U64 PRIME64_3 = 0x165667B19E3779F9ULL;   /* 0b0001011001010110011001111011000110011110001101110111100111111001 */
+static const U64 PRIME64_4 = 0x85EBCA77C2B2AE63ULL;   /* 0b1000010111101011110010100111011111000010101100101010111001100011 */
+static const U64 PRIME64_5 = 0x27D4EB2F165667C5ULL;   /* 0b0010011111010100111010110010111100010110010101100110011111000101 */
+
+static U64 XXH64_round(U64 acc, U64 input)
 {
-    XXH_istate32_t* state = (XXH_istate32_t*) state_in;
-    state->seed = seed;
-    state->v1 = seed + PRIME32_1 + PRIME32_2;
-    state->v2 = seed + PRIME32_2;
-    state->v3 = seed + 0;
-    state->v4 = seed - PRIME32_1;
-    state->total_len = 0;
-    state->memsize = 0;
-    return XXH_OK;
+    acc += input * PRIME64_2;
+    acc  = XXH_rotl64(acc, 31);
+    acc *= PRIME64_1;
+    return acc;
 }
 
-XXH_errorcode XXH64_reset(XXH64_state_t* state_in, unsigned long long seed)
+static U64 XXH64_mergeRound(U64 acc, U64 val)
 {
-    XXH_istate64_t* state = (XXH_istate64_t*) state_in;
-    state->seed = seed;
-    state->v1 = seed + PRIME64_1 + PRIME64_2;
-    state->v2 = seed + PRIME64_2;
-    state->v3 = seed + 0;
-    state->v4 = seed - PRIME64_1;
-    state->total_len = 0;
-    state->memsize = 0;
-    return XXH_OK;
+    val  = XXH64_round(0, val);
+    acc ^= val;
+    acc  = acc * PRIME64_1 + PRIME64_4;
+    return acc;
 }
 
-
-FORCE_INLINE XXH_errorcode XXH32_update_endian (XXH32_state_t* state_in, const void* input, size_t len, XXH_endianess endian)
+static U64 XXH64_avalanche(U64 h64)
 {
-    XXH_istate32_t* state = (XXH_istate32_t *) state_in;
-    const BYTE* p = (const BYTE*)input;
-    const BYTE* const bEnd = p + len;
+    h64 ^= h64 >> 33;
+    h64 *= PRIME64_2;
+    h64 ^= h64 >> 29;
+    h64 *= PRIME64_3;
+    h64 ^= h64 >> 32;
+    return h64;
+}
 
-#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
-    if (input==NULL) return XXH_ERROR;
-#endif
 
-    state->total_len += len;
+#define XXH_get64bits(p) XXH_readLE64_align(p, align)
 
-    if (state->memsize + len < 16)   /* fill in tmp buffer */
-    {
-        XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, len);
-        state->memsize += (U32)len;
-        return XXH_OK;
-    }
+static U64
+XXH64_finalize(U64 h64, const void* ptr, size_t len, XXH_alignment align)
+{
+    const BYTE* p = (const BYTE*)ptr;
+
+#define PROCESS1_64            \
+    h64 ^= (*p++) * PRIME64_5; \
+    h64 = XXH_rotl64(h64, 11) * PRIME64_1;
+
+#define PROCESS4_64          \
+    h64 ^= (U64)(XXH_get32bits(p)) * PRIME64_1; \
+    p+=4;                    \
+    h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3;
+
+#define PROCESS8_64 {        \
+    U64 const k1 = XXH64_round(0, XXH_get64bits(p)); \
+    p+=8;                    \
+    h64 ^= k1;               \
+    h64  = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4; \
+}
 
-    if (state->memsize)   /* some data left from previous update */
-    {
-        XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, 16-state->memsize);
-        {
-            const U32* p32 = state->mem32;
-            state->v1 += XXH_readLE32(p32, endian) * PRIME32_2;
-            state->v1 = XXH_rotl32(state->v1, 13);
-            state->v1 *= PRIME32_1;
-            p32++;
-            state->v2 += XXH_readLE32(p32, endian) * PRIME32_2;
-            state->v2 = XXH_rotl32(state->v2, 13);
-            state->v2 *= PRIME32_1;
-            p32++;
-            state->v3 += XXH_readLE32(p32, endian) * PRIME32_2;
-            state->v3 = XXH_rotl32(state->v3, 13);
-            state->v3 *= PRIME32_1;
-            p32++;
-            state->v4 += XXH_readLE32(p32, endian) * PRIME32_2;
-            state->v4 = XXH_rotl32(state->v4, 13);
-            state->v4 *= PRIME32_1;
-            p32++;
+    /* Rerolled version for 32-bit targets is faster and much smaller. */
+    if (XXH_REROLL || XXH_REROLL_XXH64) {
+        len &= 31;
+        while (len >= 8) {
+            PROCESS8_64;
+            len -= 8;
         }
-        p += 16-state->memsize;
-        state->memsize = 0;
-    }
-
-    if (p <= bEnd-16)
-    {
-        const BYTE* const limit = bEnd - 16;
-        U32 v1 = state->v1;
-        U32 v2 = state->v2;
-        U32 v3 = state->v3;
-        U32 v4 = state->v4;
-
-        do
-        {
-            v1 += XXH_readLE32(p, endian) * PRIME32_2;
-            v1 = XXH_rotl32(v1, 13);
-            v1 *= PRIME32_1;
-            p+=4;
-            v2 += XXH_readLE32(p, endian) * PRIME32_2;
-            v2 = XXH_rotl32(v2, 13);
-            v2 *= PRIME32_1;
-            p+=4;
-            v3 += XXH_readLE32(p, endian) * PRIME32_2;
-            v3 = XXH_rotl32(v3, 13);
-            v3 *= PRIME32_1;
-            p+=4;
-            v4 += XXH_readLE32(p, endian) * PRIME32_2;
-            v4 = XXH_rotl32(v4, 13);
-            v4 *= PRIME32_1;
-            p+=4;
+        if (len >= 4) {
+            PROCESS4_64;
+            len -= 4;
+        }
+        while (len > 0) {
+            PROCESS1_64;
+            --len;
+        }
+         return  XXH64_avalanche(h64);
+    } else {
+        switch(len & 31) {
+           case 24: PROCESS8_64;
+                         /* fallthrough */
+           case 16: PROCESS8_64;
+                         /* fallthrough */
+           case  8: PROCESS8_64;
+                    return XXH64_avalanche(h64);
+
+           case 28: PROCESS8_64;
+                         /* fallthrough */
+           case 20: PROCESS8_64;
+                         /* fallthrough */
+           case 12: PROCESS8_64;
+                         /* fallthrough */
+           case  4: PROCESS4_64;
+                    return XXH64_avalanche(h64);
+
+           case 25: PROCESS8_64;
+                         /* fallthrough */
+           case 17: PROCESS8_64;
+                         /* fallthrough */
+           case  9: PROCESS8_64;
+                    PROCESS1_64;
+                    return XXH64_avalanche(h64);
+
+           case 29: PROCESS8_64;
+                         /* fallthrough */
+           case 21: PROCESS8_64;
+                         /* fallthrough */
+           case 13: PROCESS8_64;
+                         /* fallthrough */
+           case  5: PROCESS4_64;
+                    PROCESS1_64;
+                    return XXH64_avalanche(h64);
+
+           case 26: PROCESS8_64;
+                         /* fallthrough */
+           case 18: PROCESS8_64;
+                         /* fallthrough */
+           case 10: PROCESS8_64;
+                    PROCESS1_64;
+                    PROCESS1_64;
+                    return XXH64_avalanche(h64);
+
+           case 30: PROCESS8_64;
+                         /* fallthrough */
+           case 22: PROCESS8_64;
+                         /* fallthrough */
+           case 14: PROCESS8_64;
+                         /* fallthrough */
+           case  6: PROCESS4_64;
+                    PROCESS1_64;
+                    PROCESS1_64;
+                    return XXH64_avalanche(h64);
+
+           case 27: PROCESS8_64;
+                         /* fallthrough */
+           case 19: PROCESS8_64;
+                         /* fallthrough */
+           case 11: PROCESS8_64;
+                    PROCESS1_64;
+                    PROCESS1_64;
+                    PROCESS1_64;
+                    return XXH64_avalanche(h64);
+
+           case 31: PROCESS8_64;
+                         /* fallthrough */
+           case 23: PROCESS8_64;
+                         /* fallthrough */
+           case 15: PROCESS8_64;
+                         /* fallthrough */
+           case  7: PROCESS4_64;
+                         /* fallthrough */
+           case  3: PROCESS1_64;
+                         /* fallthrough */
+           case  2: PROCESS1_64;
+                         /* fallthrough */
+           case  1: PROCESS1_64;
+                         /* fallthrough */
+           case  0: return XXH64_avalanche(h64);
         }
-        while (p<=limit);
-
-        state->v1 = v1;
-        state->v2 = v2;
-        state->v3 = v3;
-        state->v4 = v4;
-    }
-
-    if (p < bEnd)
-    {
-        XXH_memcpy(state->mem32, p, bEnd-p);
-        state->memsize = (int)(bEnd-p);
     }
-
-    return XXH_OK;
+    /* impossible to reach */
+    XXH_ASSERT(0);
+    return 0;  /* unreachable, but some compilers complain without it */
 }
 
-XXH_errorcode XXH32_update (XXH32_state_t* state_in, const void* input, size_t len)
+XXH_FORCE_INLINE U64
+XXH64_endian_align(const void* input, size_t len, U64 seed, XXH_alignment align)
 {
-    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+    const BYTE* p = (const BYTE*)input;
+    const BYTE* bEnd = p + len;
+    U64 h64;
 
-    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
-        return XXH32_update_endian(state_in, input, len, XXH_littleEndian);
-    else
-        return XXH32_update_endian(state_in, input, len, XXH_bigEndian);
-}
+#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
+    if (p==NULL) {
+        len=0;
+        bEnd=p=(const BYTE*)(size_t)32;
+    }
+#endif
 
+    if (len>=32) {
+        const BYTE* const limit = bEnd - 32;
+        U64 v1 = seed + PRIME64_1 + PRIME64_2;
+        U64 v2 = seed + PRIME64_2;
+        U64 v3 = seed + 0;
+        U64 v4 = seed - PRIME64_1;
 
+        do {
+            v1 = XXH64_round(v1, XXH_get64bits(p)); p+=8;
+            v2 = XXH64_round(v2, XXH_get64bits(p)); p+=8;
+            v3 = XXH64_round(v3, XXH_get64bits(p)); p+=8;
+            v4 = XXH64_round(v4, XXH_get64bits(p)); p+=8;
+        } while (p<=limit);
 
-FORCE_INLINE U32 XXH32_digest_endian (const XXH32_state_t* state_in, XXH_endianess endian)
-{
-    const XXH_istate32_t* state = (const XXH_istate32_t*) state_in;
-    const BYTE * p = (const BYTE*)state->mem32;
-    const BYTE* bEnd = (const BYTE*)(state->mem32) + state->memsize;
-    U32 h32;
+        h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
+        h64 = XXH64_mergeRound(h64, v1);
+        h64 = XXH64_mergeRound(h64, v2);
+        h64 = XXH64_mergeRound(h64, v3);
+        h64 = XXH64_mergeRound(h64, v4);
 
-    if (state->total_len >= 16)
-    {
-        h32 = XXH_rotl32(state->v1, 1) + XXH_rotl32(state->v2, 7) + XXH_rotl32(state->v3, 12) + XXH_rotl32(state->v4, 18);
-    }
-    else
-    {
-        h32  = state->seed + PRIME32_5;
+    } else {
+        h64  = seed + PRIME64_5;
     }
 
-    h32 += (U32) state->total_len;
+    h64 += (U64) len;
 
-    while (p+4<=bEnd)
-    {
-        h32 += XXH_readLE32(p, endian) * PRIME32_3;
-        h32  = XXH_rotl32(h32, 17) * PRIME32_4;
-        p+=4;
-    }
+    return XXH64_finalize(h64, p, len, align);
+}
 
-    while (p<bEnd)
-    {
-        h32 += (*p) * PRIME32_5;
-        h32 = XXH_rotl32(h32, 11) * PRIME32_1;
-        p++;
-    }
 
-    h32 ^= h32 >> 15;
-    h32 *= PRIME32_2;
-    h32 ^= h32 >> 13;
-    h32 *= PRIME32_3;
-    h32 ^= h32 >> 16;
+XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t len, unsigned long long seed)
+{
+#if 0
+    /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
+    XXH64_state_t state;
+    XXH64_reset(&state, seed);
+    XXH64_update(&state, input, len);
+    return XXH64_digest(&state);
 
-    return h32;
+#else
+
+    if (XXH_FORCE_ALIGN_CHECK) {
+        if ((((size_t)input) & 7)==0) {  /* Input is aligned, let's leverage the speed advantage */
+            return XXH64_endian_align(input, len, seed, XXH_aligned);
+    }   }
+
+    return XXH64_endian_align(input, len, seed, XXH_unaligned);
+
+#endif
 }
 
+/*======   Hash Streaming   ======*/
 
-U32 XXH32_digest (const XXH32_state_t* state_in)
+XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void)
 {
-    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
-
-    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
-        return XXH32_digest_endian(state_in, XXH_littleEndian);
-    else
-        return XXH32_digest_endian(state_in, XXH_bigEndian);
+    return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t));
+}
+XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr)
+{
+    XXH_free(statePtr);
+    return XXH_OK;
 }
 
+XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dstState, const XXH64_state_t* srcState)
+{
+    memcpy(dstState, srcState, sizeof(*dstState));
+}
 
-FORCE_INLINE XXH_errorcode XXH64_update_endian (XXH64_state_t* state_in, const void* input, size_t len, XXH_endianess endian)
+XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, unsigned long long seed)
 {
-    XXH_istate64_t * state = (XXH_istate64_t *) state_in;
-    const BYTE* p = (const BYTE*)input;
-    const BYTE* const bEnd = p + len;
+    XXH64_state_t state;   /* using a local state to memcpy() in order to avoid strict-aliasing warnings */
+    memset(&state, 0, sizeof(state));
+    state.v1 = seed + PRIME64_1 + PRIME64_2;
+    state.v2 = seed + PRIME64_2;
+    state.v3 = seed + 0;
+    state.v4 = seed - PRIME64_1;
+     /* do not write into reserved, might be removed in a future version */
+    memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved));
+    return XXH_OK;
+}
 
-#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
-    if (input==NULL) return XXH_ERROR;
+XXH_PUBLIC_API XXH_errorcode
+XXH64_update (XXH64_state_t* state, const void* input, size_t len)
+{
+    if (input==NULL)
+#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
+        return XXH_OK;
+#else
+        return XXH_ERROR;
 #endif
 
-    state->total_len += len;
+    {   const BYTE* p = (const BYTE*)input;
+        const BYTE* const bEnd = p + len;
 
-    if (state->memsize + len < 32)   /* fill in tmp buffer */
-    {
-        XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, len);
-        state->memsize += (U32)len;
-        return XXH_OK;
-    }
+        state->total_len += len;
 
-    if (state->memsize)   /* some data left from previous update */
-    {
-        XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, 32-state->memsize);
-        {
-            const U64* p64 = state->mem64;
-            state->v1 += XXH_readLE64(p64, endian) * PRIME64_2;
-            state->v1 = XXH_rotl64(state->v1, 31);
-            state->v1 *= PRIME64_1;
-            p64++;
-            state->v2 += XXH_readLE64(p64, endian) * PRIME64_2;
-            state->v2 = XXH_rotl64(state->v2, 31);
-            state->v2 *= PRIME64_1;
-            p64++;
-            state->v3 += XXH_readLE64(p64, endian) * PRIME64_2;
-            state->v3 = XXH_rotl64(state->v3, 31);
-            state->v3 *= PRIME64_1;
-            p64++;
-            state->v4 += XXH_readLE64(p64, endian) * PRIME64_2;
-            state->v4 = XXH_rotl64(state->v4, 31);
-            state->v4 *= PRIME64_1;
-            p64++;
+        if (state->memsize + len < 32) {  /* fill in tmp buffer */
+            XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, len);
+            state->memsize += (U32)len;
+            return XXH_OK;
         }
-        p += 32-state->memsize;
-        state->memsize = 0;
-    }
 
-    if (p+32 <= bEnd)
-    {
-        const BYTE* const limit = bEnd - 32;
-        U64 v1 = state->v1;
-        U64 v2 = state->v2;
-        U64 v3 = state->v3;
-        U64 v4 = state->v4;
-
-        do
-        {
-            v1 += XXH_readLE64(p, endian) * PRIME64_2;
-            v1 = XXH_rotl64(v1, 31);
-            v1 *= PRIME64_1;
-            p+=8;
-            v2 += XXH_readLE64(p, endian) * PRIME64_2;
-            v2 = XXH_rotl64(v2, 31);
-            v2 *= PRIME64_1;
-            p+=8;
-            v3 += XXH_readLE64(p, endian) * PRIME64_2;
-            v3 = XXH_rotl64(v3, 31);
-            v3 *= PRIME64_1;
-            p+=8;
-            v4 += XXH_readLE64(p, endian) * PRIME64_2;
-            v4 = XXH_rotl64(v4, 31);
-            v4 *= PRIME64_1;
-            p+=8;
+        if (state->memsize) {   /* tmp buffer is full */
+            XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, 32-state->memsize);
+            state->v1 = XXH64_round(state->v1, XXH_readLE64(state->mem64+0));
+            state->v2 = XXH64_round(state->v2, XXH_readLE64(state->mem64+1));
+            state->v3 = XXH64_round(state->v3, XXH_readLE64(state->mem64+2));
+            state->v4 = XXH64_round(state->v4, XXH_readLE64(state->mem64+3));
+            p += 32-state->memsize;
+            state->memsize = 0;
         }
-        while (p<=limit);
 
-        state->v1 = v1;
-        state->v2 = v2;
-        state->v3 = v3;
-        state->v4 = v4;
-    }
+        if (p+32 <= bEnd) {
+            const BYTE* const limit = bEnd - 32;
+            U64 v1 = state->v1;
+            U64 v2 = state->v2;
+            U64 v3 = state->v3;
+            U64 v4 = state->v4;
+
+            do {
+                v1 = XXH64_round(v1, XXH_readLE64(p)); p+=8;
+                v2 = XXH64_round(v2, XXH_readLE64(p)); p+=8;
+                v3 = XXH64_round(v3, XXH_readLE64(p)); p+=8;
+                v4 = XXH64_round(v4, XXH_readLE64(p)); p+=8;
+            } while (p<=limit);
+
+            state->v1 = v1;
+            state->v2 = v2;
+            state->v3 = v3;
+            state->v4 = v4;
+        }
 
-    if (p < bEnd)
-    {
-        XXH_memcpy(state->mem64, p, bEnd-p);
-        state->memsize = (int)(bEnd-p);
+        if (p < bEnd) {
+            XXH_memcpy(state->mem64, p, (size_t)(bEnd-p));
+            state->memsize = (unsigned)(bEnd-p);
+        }
     }
 
     return XXH_OK;
 }
 
-XXH_errorcode XXH64_update (XXH64_state_t* state_in, const void* input, size_t len)
-{
-    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
-
-    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
-        return XXH64_update_endian(state_in, input, len, XXH_littleEndian);
-    else
-        return XXH64_update_endian(state_in, input, len, XXH_bigEndian);
-}
-
 
-
-FORCE_INLINE U64 XXH64_digest_endian (const XXH64_state_t* state_in, XXH_endianess endian)
+XXH_PUBLIC_API XXH64_hash_t XXH64_digest (const XXH64_state_t* state)
 {
-    const XXH_istate64_t * state = (const XXH_istate64_t *) state_in;
-    const BYTE * p = (const BYTE*)state->mem64;
-    const BYTE* bEnd = (const BYTE*)state->mem64 + state->memsize;
     U64 h64;
 
-    if (state->total_len >= 32)
-    {
-        U64 v1 = state->v1;
-        U64 v2 = state->v2;
-        U64 v3 = state->v3;
-        U64 v4 = state->v4;
+    if (state->total_len >= 32) {
+        U64 const v1 = state->v1;
+        U64 const v2 = state->v2;
+        U64 const v3 = state->v3;
+        U64 const v4 = state->v4;
 
         h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
-
-        v1 *= PRIME64_2;
-        v1 = XXH_rotl64(v1, 31);
-        v1 *= PRIME64_1;
-        h64 ^= v1;
-        h64 = h64*PRIME64_1 + PRIME64_4;
-
-        v2 *= PRIME64_2;
-        v2 = XXH_rotl64(v2, 31);
-        v2 *= PRIME64_1;
-        h64 ^= v2;
-        h64 = h64*PRIME64_1 + PRIME64_4;
-
-        v3 *= PRIME64_2;
-        v3 = XXH_rotl64(v3, 31);
-        v3 *= PRIME64_1;
-        h64 ^= v3;
-        h64 = h64*PRIME64_1 + PRIME64_4;
-
-        v4 *= PRIME64_2;
-        v4 = XXH_rotl64(v4, 31);
-        v4 *= PRIME64_1;
-        h64 ^= v4;
-        h64 = h64*PRIME64_1 + PRIME64_4;
-    }
-    else
-    {
-        h64  = state->seed + PRIME64_5;
+        h64 = XXH64_mergeRound(h64, v1);
+        h64 = XXH64_mergeRound(h64, v2);
+        h64 = XXH64_mergeRound(h64, v3);
+        h64 = XXH64_mergeRound(h64, v4);
+    } else {
+        h64  = state->v3 /*seed*/ + PRIME64_5;
     }
 
     h64 += (U64) state->total_len;
 
-    while (p+8<=bEnd)
-    {
-        U64 k1 = XXH_readLE64(p, endian);
-        k1 *= PRIME64_2;
-        k1 = XXH_rotl64(k1,31);
-        k1 *= PRIME64_1;
-        h64 ^= k1;
-        h64 = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4;
-        p+=8;
-    }
+    return XXH64_finalize(h64, state->mem64, (size_t)state->total_len, XXH_aligned);
+}
 
-    if (p+4<=bEnd)
-    {
-        h64 ^= (U64)(XXH_readLE32(p, endian)) * PRIME64_1;
-        h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3;
-        p+=4;
-    }
 
-    while (p<bEnd)
-    {
-        h64 ^= (*p) * PRIME64_5;
-        h64 = XXH_rotl64(h64, 11) * PRIME64_1;
-        p++;
-    }
+/*====== Canonical representation   ======*/
 
-    h64 ^= h64 >> 33;
-    h64 *= PRIME64_2;
-    h64 ^= h64 >> 29;
-    h64 *= PRIME64_3;
-    h64 ^= h64 >> 32;
+XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash)
+{
+    XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t));
+    if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash);
+    memcpy(dst, &hash, sizeof(*dst));
+}
 
-    return h64;
+XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src)
+{
+    return XXH_readBE64(src);
 }
 
 
-unsigned long long XXH64_digest (const XXH64_state_t* state_in)
-{
-    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
 
-    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
-        return XXH64_digest_endian(state_in, XXH_littleEndian);
-    else
-        return XXH64_digest_endian(state_in, XXH_bigEndian);
-}
+/* *********************************************************************
+*  XXH3
+*  New generation hash designed for speed on small keys and vectorization
+************************************************************************ */
+
+#include "xxh3.h"
 
 
+#endif  /* XXH_NO_LONG_LONG */
diff --git a/xxhash.h b/xxhash.h
index c60aa61..7533eb2 100644
--- a/xxhash.h
+++ b/xxhash.h
@@ -1,7 +1,7 @@
 /*
    xxHash - Extremely Fast Hash algorithm
    Header File
-   Copyright (C) 2012-2015, Yann Collet.
+   Copyright (C) 2012-2016, Yann Collet.
 
    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
 
@@ -57,136 +57,488 @@ Q.Score is a measure of quality of the hash function.
 It depends on successfully passing SMHasher test set.
 10 is a perfect score.
 
-A 64-bits version, named XXH64, is available since r35.
-It offers much better speed, but for 64-bits applications only.
+A 64-bit version, named XXH64, is available since r35.
+It offers much better speed, but for 64-bit applications only.
 Name     Speed on 64 bits    Speed on 32 bits
 XXH64       13.8 GB/s            1.9 GB/s
 XXH32        6.8 GB/s            6.0 GB/s
 */
 
-#pragma once
+#ifndef XXHASH_H_5627135585666179
+#define XXHASH_H_5627135585666179 1
 
 #if defined (__cplusplus)
 extern "C" {
 #endif
 
 
-/*****************************
+/* ****************************
 *  Definitions
-*****************************/
+******************************/
 #include <stddef.h>   /* size_t */
 typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;
 
 
-/*****************************
-*  Namespace Emulation
-*****************************/
-/* Motivations :
-
-If you need to include xxHash into your library,
-but wish to avoid xxHash symbols to be present on your library interface
-in an effort to avoid potential name collision if another library also includes xxHash,
-
-you can use XXH_NAMESPACE, which will automatically prefix any symbol from xxHash
-with the value of XXH_NAMESPACE (so avoid to keep it NULL, and avoid numeric values).
-
-Note that no change is required within the calling program :
-it can still call xxHash functions using their regular name.
-They will be automatically translated by this header.
-*/
+/* ****************************
+ *  API modifier
+ ******************************/
+/** XXH_INLINE_ALL (and XXH_PRIVATE_API)
+ *  This build macro includes xxhash functions in `static` mode
+ *  in order to inline them, and remove their symbol from the public list.
+ *  Inlining offers great performance improvement on small keys,
+ *  and dramatic ones when length is expressed as a compile-time constant.
+ *  See https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html .
+ *  Methodology :
+ *     #define XXH_INLINE_ALL
+ *     #include "xxhash.h"
+ * `xxhash.c` is automatically included.
+ *  It's not useful to compile and link it as a separate object.
+ */
+#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)
+#  ifndef XXH_STATIC_LINKING_ONLY
+#    define XXH_STATIC_LINKING_ONLY
+#  endif
+#  if defined(__GNUC__)
+#    define XXH_PUBLIC_API static __inline __attribute__((unused))
+#  elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+#    define XXH_PUBLIC_API static inline
+#  elif defined(_MSC_VER)
+#    define XXH_PUBLIC_API static __inline
+#  else
+     /* this version may generate warnings for unused static functions */
+#    define XXH_PUBLIC_API static
+#  endif
+#else
+#  if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT))
+#    ifdef XXH_EXPORT
+#      define XXH_PUBLIC_API __declspec(dllexport)
+#    elif XXH_IMPORT
+#      define XXH_PUBLIC_API __declspec(dllimport)
+#    endif
+#  else
+#    define XXH_PUBLIC_API   /* do nothing */
+#  endif
+#endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */
+
+/*! XXH_NAMESPACE, aka Namespace Emulation :
+ *
+ * If you want to include _and expose_ xxHash functions from within your own library,
+ * but also want to avoid symbol collisions with other libraries which may also include xxHash,
+ *
+ * you can use XXH_NAMESPACE, to automatically prefix any public symbol from xxhash library
+ * with the value of XXH_NAMESPACE (therefore, avoid NULL and numeric values).
+ *
+ * Note that no change is required within the calling program as long as it includes `xxhash.h` :
+ * regular symbol name will be automatically translated by this header.
+ */
 #ifdef XXH_NAMESPACE
 #  define XXH_CAT(A,B) A##B
 #  define XXH_NAME2(A,B) XXH_CAT(A,B)
+#  define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber)
 #  define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32)
-#  define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64)
 #  define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState)
-#  define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState)
 #  define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState)
-#  define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState)
 #  define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset)
-#  define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset)
 #  define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update)
-#  define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update)
 #  define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest)
+#  define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState)
+#  define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash)
+#  define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical)
+#  define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64)
+#  define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState)
+#  define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState)
+#  define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset)
+#  define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update)
 #  define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest)
+#  define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState)
+#  define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash)
+#  define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical)
+#endif
+
+
+/* *************************************
+*  Version
+***************************************/
+#define XXH_VERSION_MAJOR    0
+#define XXH_VERSION_MINOR    7
+#define XXH_VERSION_RELEASE  1
+#define XXH_VERSION_NUMBER  (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE)
+XXH_PUBLIC_API unsigned XXH_versionNumber (void);
+
+
+/*-**********************************************************************
+*  32-bit hash
+************************************************************************/
+#if !defined (__VMS) \
+  && (defined (__cplusplus) \
+  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+#   include <stdint.h>
+    typedef uint32_t XXH32_hash_t;
+#else
+    typedef unsigned int XXH32_hash_t;
 #endif
 
+/*! XXH32() :
+    Calculate the 32-bit hash of sequence "length" bytes stored at memory address "input".
+    The memory between input & input+length must be valid (allocated and read-accessible).
+    "seed" can be used to alter the result predictably.
+    Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s */
+XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t length, unsigned int seed);
 
-/*****************************
-*  Simple Hash Functions
-*****************************/
+/*======   Streaming   ======*/
+typedef struct XXH32_state_s XXH32_state_t;   /* incomplete type */
+XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void);
+XXH_PUBLIC_API XXH_errorcode  XXH32_freeState(XXH32_state_t* statePtr);
+XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dst_state, const XXH32_state_t* src_state);
 
-unsigned int       XXH32 (const void* input, size_t length, unsigned seed);
-unsigned long long XXH64 (const void* input, size_t length, unsigned long long seed);
+XXH_PUBLIC_API XXH_errorcode XXH32_reset  (XXH32_state_t* statePtr, unsigned int seed);
+XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length);
+XXH_PUBLIC_API XXH32_hash_t  XXH32_digest (const XXH32_state_t* statePtr);
 
 /*
-XXH32() :
-    Calculate the 32-bits hash of sequence "length" bytes stored at memory address "input".
-    The memory between input & input+length must be valid (allocated and read-accessible).
+ * Streaming functions generate the xxHash of an input provided in multiple segments.
+ * Note that, for small input, they are slower than single-call functions, due to state management.
+ * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized.
+ *
+ * XXH state must first be allocated, using XXH*_createState() .
+ *
+ * Start a new hash by initializing state with a seed, using XXH*_reset().
+ *
+ * Then, feed the hash state by calling XXH*_update() as many times as necessary.
+ * The function returns an error code, with 0 meaning OK, and any other value meaning there is an error.
+ *
+ * Finally, a hash value can be produced anytime, by using XXH*_digest().
+ * This function returns the nn-bits hash as an int or long long.
+ *
+ * It's still possible to continue inserting input into the hash state after a digest,
+ * and generate some new hashes later on, by calling again XXH*_digest().
+ *
+ * When done, free XXH state space if it was allocated dynamically.
+ */
+
+/*======   Canonical representation   ======*/
+
+typedef struct { unsigned char digest[4]; } XXH32_canonical_t;
+XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash);
+XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src);
+
+/* Default result type for XXH functions are primitive unsigned 32 and 64 bits.
+ * The canonical representation uses human-readable write convention, aka big-endian (large digits first).
+ * These functions allow transformation of hash result into and from its canonical format.
+ * This way, hash values can be written into a file / memory, and remain comparable on different systems and programs.
+ */
+
+
+#ifndef XXH_NO_LONG_LONG
+/*-**********************************************************************
+*  64-bit hash
+************************************************************************/
+#if !defined (__VMS) \
+  && (defined (__cplusplus) \
+  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+#   include <stdint.h>
+    typedef uint64_t XXH64_hash_t;
+#else
+    typedef unsigned long long XXH64_hash_t;
+#endif
+
+/*! XXH64() :
+    Calculate the 64-bit hash of sequence of length "len" stored at memory address "input".
     "seed" can be used to alter the result predictably.
-    This function successfully passes all SMHasher tests.
-    Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s
-XXH64() :
-    Calculate the 64-bits hash of sequence of length "len" stored at memory address "input".
-    Faster on 64-bits systems. Slower on 32-bits systems.
+    This function runs faster on 64-bit systems, but slower on 32-bit systems (see benchmark).
 */
+XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t length, unsigned long long seed);
+
+/*======   Streaming   ======*/
+typedef struct XXH64_state_s XXH64_state_t;   /* incomplete type */
+XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void);
+XXH_PUBLIC_API XXH_errorcode  XXH64_freeState(XXH64_state_t* statePtr);
+XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dst_state, const XXH64_state_t* src_state);
+
+XXH_PUBLIC_API XXH_errorcode XXH64_reset  (XXH64_state_t* statePtr, unsigned long long seed);
+XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length);
+XXH_PUBLIC_API XXH64_hash_t  XXH64_digest (const XXH64_state_t* statePtr);
+
+/*======   Canonical representation   ======*/
+typedef struct { unsigned char digest[8]; } XXH64_canonical_t;
+XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash);
+XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src);
+
+
+#endif  /* XXH_NO_LONG_LONG */
+
+
+
+#ifdef XXH_STATIC_LINKING_ONLY
+
+/* ================================================================================================
+   This section contains declarations which are not guaranteed to remain stable.
+   They may change in future versions, becoming incompatible with a different version of the library.
+   These declarations should only be used with static linking.
+   Never use them in association with dynamic linking !
+=================================================================================================== */
+
+/* These definitions are only present to allow
+ * static allocation of XXH state, on stack or in a struct for example.
+ * Never **ever** use members directly. */
+
+struct XXH32_state_s {
+   XXH32_hash_t total_len_32;
+   XXH32_hash_t large_len;
+   XXH32_hash_t v1;
+   XXH32_hash_t v2;
+   XXH32_hash_t v3;
+   XXH32_hash_t v4;
+   XXH32_hash_t mem32[4];
+   XXH32_hash_t memsize;
+   XXH32_hash_t reserved;   /* never read nor write, might be removed in a future version */
+};   /* typedef'd to XXH32_state_t */
+
+#ifndef XXH_NO_LONG_LONG  /* remove 64-bit support */
+struct XXH64_state_s {
+   XXH64_hash_t total_len;
+   XXH64_hash_t v1;
+   XXH64_hash_t v2;
+   XXH64_hash_t v3;
+   XXH64_hash_t v4;
+   XXH64_hash_t mem64[4];
+   XXH32_hash_t memsize;
+   XXH32_hash_t reserved[2];     /* never read nor write, might be removed in a future version */
+};   /* typedef'd to XXH64_state_t */
+#endif   /* XXH_NO_LONG_LONG */
+
+
+/*-**********************************************************************
+*  XXH3
+*  New experimental hash
+************************************************************************/
+#ifndef XXH_NO_LONG_LONG
+
+
+/* ============================================
+ * XXH3 is a new hash algorithm,
+ * featuring improved speed performance for both small and large inputs.
+ * See full speed analysis at : http://fastcompression.blogspot.com/2019/03/presenting-xxh3.html
+ * In general, expect XXH3 to run about ~2x faster on large inputs,
+ * and >3x faster on small ones, though exact differences depend on platform.
+ *
+ * The algorithm is portable, will generate the same hash on all platforms.
+ * It benefits greatly from vectorization units, but does not require it.
+ *
+ * XXH3 offers 2 variants, _64bits and _128bits.
+ * When only 64 bits are needed, prefer calling the _64bits variant :
+ * it reduces the amount of mixing, resulting in faster speed on small inputs.
+ * It's also generally simpler to manipulate a scalar return type than a struct.
+ *
+ * The XXH3 algorithm is still considered experimental.
+ * Produced results can still change between versions.
+ * For example, results produced by v0.7.1 are not comparable with results from v0.7.0 .
+ * It's nonetheless possible to use XXH3 for ephemeral data (local sessions),
+ * but avoid storing values in long-term storage for later re-use.
+ *
+ * The API supports one-shot hashing, streaming mode, and custom secrets.
+ *
+ * There are still a number of opened questions that community can influence during the experimental period.
+ * I'm trying to list a few of them below, though don't consider this list as complete.
+ *
+ * - 128-bits output type : currently defined as a structure of two 64-bits fields.
+ *                          That's because 128-bit values do not exist in C standard.
+ *                          Note that it means that, at byte level, result is not identical depending on endianess.
+ *                          However, at field level, they are identical on all platforms.
+ *                          The canonical representation solves the issue of identical byte-level representation across platforms,
+ *                          which is necessary for serialization.
+ *                          Would there be a better representation for a 128-bit hash result ?
+ *                          Are the names of the inner 64-bit fields important ? Should they be changed ?
+ *
+ * - Seed type for 128-bits variant : currently, it's a single 64-bit value, like the 64-bit variant.
+ *                          It could be argued that it's more logical to offer a 128-bit seed input parameter for a 128-bit hash.
+ *                          But 128-bit seed is more difficult to use, since it requires to pass a structure instead of a scalar value.
+ *                          Such a variant could either replace current one, or become an additional one.
+ *                          Farmhash, for example, offers both variants (the 128-bits seed variant is called `doubleSeed`).
+ *                          If both 64-bit and 128-bit seeds are possible, which variant should be called XXH128 ?
+ *
+ * - Result for len==0 : Currently, the result of hashing a zero-length input is `0`.
+ *                          It seems okay as a return value when using all "default" secret and seed (it used to be a request for XXH32/XXH64).
+ *                          But is it still fine to return `0` when secret or seed are non-default ?
+ *                          Are there use cases which could depend on generating a different hash result for zero-length input when the secret is different ?
+ */
 
+#ifdef XXH_NAMESPACE
+#  define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits)
+#  define XXH3_64bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecret)
+#  define XXH3_64bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSeed)
+
+#  define XXH3_createState XXH_NAME2(XXH_NAMESPACE, XXH3_createState)
+#  define XXH3_freeState XXH_NAME2(XXH_NAMESPACE, XXH3_freeState)
+#  define XXH3_copyState XXH_NAME2(XXH_NAMESPACE, XXH3_copyState)
+
+#  define XXH3_64bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset)
+#  define XXH3_64bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSeed)
+#  define XXH3_64bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecret)
+#  define XXH3_64bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_update)
+#  define XXH3_64bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_digest)
+#endif
 
+/* XXH3_64bits() :
+ * default 64-bit variant, using default secret and default seed of 0.
+ * It's the fastest variant. */
+XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* data, size_t len);
+
+/* XXH3_64bits_withSecret() :
+ * It's possible to provide any blob of bytes as a "secret" to generate the hash.
+ * This makes it more difficult for an external actor to prepare an intentional collision.
+ * The secret *must* be large enough (>= XXH3_SECRET_SIZE_MIN).
+ * It should consist of random bytes.
+ * Avoid repeating same character, or sequences of bytes,
+ * and especially avoid swathes of \0.
+ * Failure to respect these conditions will result in a poor quality hash.
+ */
+#define XXH3_SECRET_SIZE_MIN 136
+XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize);
+
+/* XXH3_64bits_withSeed() :
+ * This variant generates on the fly a custom secret,
+ * based on the default secret, altered using the `seed` value.
+ * While this operation is decently fast, note that it's not completely free.
+ * note : seed==0 produces same results as XXH3_64bits() */
+XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void* data, size_t len, XXH64_hash_t seed);
+
+
+/* streaming 64-bit */
+
+#if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)   /* C11+ */
+#  include <stdalign.h>
+#  define XXH_ALIGN(n)      alignas(n)
+#elif defined(__GNUC__)
+#  define XXH_ALIGN(n)      __attribute__ ((aligned(n)))
+#elif defined(_MSC_VER)
+#  define XXH_ALIGN(n)      __declspec(align(n))
+#else
+#  define XXH_ALIGN(n)   /* disabled */
+#endif
 
-/*****************************
-*  Advanced Hash Functions
-*****************************/
-typedef struct { long long ll[ 6]; } XXH32_state_t;
-typedef struct { long long ll[11]; } XXH64_state_t;
+typedef struct XXH3_state_s XXH3_state_t;
+
+#define XXH3_SECRET_DEFAULT_SIZE 192   /* minimum XXH3_SECRET_SIZE_MIN */
+#define XXH3_INTERNALBUFFER_SIZE 256
+struct XXH3_state_s {
+   XXH_ALIGN(64) XXH64_hash_t acc[8];
+   XXH_ALIGN(64) char customSecret[XXH3_SECRET_DEFAULT_SIZE];  /* used to store a custom secret generated from the seed. Makes state larger. Design might change */
+   XXH_ALIGN(64) char buffer[XXH3_INTERNALBUFFER_SIZE];
+   const void* secret;
+   XXH32_hash_t bufferedSize;
+   XXH32_hash_t nbStripesPerBlock;
+   XXH32_hash_t nbStripesSoFar;
+   XXH32_hash_t reserved32;
+   XXH32_hash_t reserved32_2;
+   XXH32_hash_t secretLimit;
+   XXH64_hash_t totalLen;
+   XXH64_hash_t seed;
+   XXH64_hash_t reserved64;
+};   /* typedef'd to XXH3_state_t */
+
+/* Streaming requires state maintenance.
+ * This operation costs memory and cpu.
+ * As a consequence, streaming is slower than one-shot hashing.
+ * For better performance, prefer using one-shot functions whenever possible. */
+
+XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void);
+XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr);
+XXH_PUBLIC_API void XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state);
+
+
+/* XXH3_64bits_reset() :
+ * initialize with default parameters.
+ * result will be equivalent to `XXH3_64bits()`. */
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH3_state_t* statePtr);
+/* XXH3_64bits_reset_withSeed() :
+ * generate a custom secret from `seed`, and store it into state.
+ * digest will be equivalent to `XXH3_64bits_withSeed()`. */
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed);
+/* XXH3_64bits_reset_withSecret() :
+ * `secret` is referenced, and must outlive the hash streaming session.
+ * secretSize must be >= XXH3_SECRET_SIZE_MIN.
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize);
+
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH3_state_t* statePtr, const void* input, size_t length);
+XXH_PUBLIC_API XXH64_hash_t  XXH3_64bits_digest (const XXH3_state_t* statePtr);
+
+
+/* 128-bit */
 
-/*
-These structures allow static allocation of XXH states.
-States must then be initialized using XXHnn_reset() before first use.
+#ifdef XXH_NAMESPACE
+#  define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128)
+#  define XXH3_128bits XXH_NAME2(XXH_NAMESPACE, XXH3_128bits)
+#  define XXH3_128bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSeed)
+#  define XXH3_128bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecret)
+
+#  define XXH3_128bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset)
+#  define XXH3_128bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSeed)
+#  define XXH3_128bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecret)
+#  define XXH3_128bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_update)
+#  define XXH3_128bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_digest)
+
+#  define XXH128_isEqual XXH_NAME2(XXH_NAMESPACE, XXH128_isEqual)
+#  define XXH128_cmp     XXH_NAME2(XXH_NAMESPACE, XXH128_cmp)
+#  define XXH128_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH128_canonicalFromHash)
+#  define XXH128_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH128_hashFromCanonical)
+#endif
 
-If you prefer dynamic allocation, please refer to functions below.
-*/
+typedef struct {
+    XXH64_hash_t low64;
+    XXH64_hash_t high64;
+} XXH128_hash_t;
 
-XXH32_state_t* XXH32_createState(void);
-XXH_errorcode  XXH32_freeState(XXH32_state_t* statePtr);
+XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t seed);
+XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* data, size_t len);
+XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed(const void* data, size_t len, XXH64_hash_t seed);  /* == XXH128() */
+XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize);
 
-XXH64_state_t* XXH64_createState(void);
-XXH_errorcode  XXH64_freeState(XXH64_state_t* statePtr);
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH3_state_t* statePtr);
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed);
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize);
 
-/*
-These functions create and release memory for XXH state.
-States must then be initialized using XXHnn_reset() before first use.
-*/
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH3_state_t* statePtr, const void* input, size_t length);
+XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* statePtr);
 
 
-XXH_errorcode XXH32_reset  (XXH32_state_t* statePtr, unsigned seed);
-XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length);
-unsigned int  XXH32_digest (const XXH32_state_t* statePtr);
+/* Note : for better performance, following functions should be inlined,
+ * using XXH_INLINE_ALL */
 
-XXH_errorcode      XXH64_reset  (XXH64_state_t* statePtr, unsigned long long seed);
-XXH_errorcode      XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length);
-unsigned long long XXH64_digest (const XXH64_state_t* statePtr);
+/* return : 1 is equal, 0 if different */
+XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2);
 
-/*
-These functions calculate the xxHash of an input provided in multiple smaller packets,
-as opposed to an input provided as a single block.
+/* This comparator is compatible with stdlib's qsort().
+ * return : >0 if *h128_1  > *h128_2
+ *          <0 if *h128_1  < *h128_2
+ *          =0 if *h128_1 == *h128_2  */
+XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2);
 
-XXH state space must first be allocated, using either static or dynamic method provided above.
 
-Start a new hash by initializing state with a seed, using XXHnn_reset().
+/*======   Canonical representation   ======*/
+typedef struct { unsigned char digest[16]; } XXH128_canonical_t;
+XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash);
+XXH_PUBLIC_API XXH128_hash_t XXH128_hashFromCanonical(const XXH128_canonical_t* src);
 
-Then, feed the hash state by calling XXHnn_update() as many times as necessary.
-Obviously, input must be valid, meaning allocated and read accessible.
-The function returns an error code, with 0 meaning OK, and any other value meaning there is an error.
 
-Finally, you can produce a hash anytime, by using XXHnn_digest().
-This function returns the final nn-bits hash.
-You can nonetheless continue feeding the hash state with more input,
-and therefore get some new hashes, by calling again XXHnn_digest().
+#endif  /* XXH_NO_LONG_LONG */
 
-When you are done, don't forget to free XXH state space, using typically XXHnn_freeState().
-*/
+
+/*-**********************************************************************
+*  XXH_INLINE_ALL
+************************************************************************/
+#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)
+#  include "xxhash.c"   /* include xxhash function bodies as `static`, for inlining */
+#endif
+
+
+
+#endif /* XXH_STATIC_LINKING_ONLY */
 
 
 #if defined (__cplusplus)
 }
 #endif
+
+#endif /* XXHASH_H_5627135585666179 */

From afc944cebf3cc91468eaed26ccc809db1adde2bb Mon Sep 17 00:00:00 2001
From: Tawkir Ahmed Fakir <tawkir@eskimi.com>
Date: Fri, 27 Sep 2019 16:59:37 +0200
Subject: [PATCH 15/21] Update readme

---
 README.md | 27 ++++++++++++++++++++++++---
 1 file changed, 24 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 425a3b1..e274fb4 100644
--- a/README.md
+++ b/README.md
@@ -20,16 +20,37 @@ Please note at the moment the master branch should be used for PHP 5.x and devel
 ```
 Don't forget to load the extension in via php.ini or the like.
 
+To use with docker compose
+
+```
+
+RUN curl -fsSL 'https://github.com/tawkirahmed/php-xxhash/archive/develop.zip' -o xxhash.zip \
+    && mkdir -p xxhash \
+    && unzip -j xxhash.zip -d xxhash\
+    && rm xxhash.zip \
+    && ( \
+        cd xxhash \
+        && phpize \
+        && ./configure --enable-xxhash \
+        && make -j "$(nproc)" \
+        && make install \
+    ) \
+    && rm -r xxhash \
+    && docker-php-ext-enable xxhash
+
+```
+
 ## Usage Instructions
 
 Upon installation and enabling the extension within php.ini the following two new functions will be available to you:
 
 ```
-string xxhash32(string $data);
-string xxhash64(string $data);
+long xxhash32(string $data);
+long xxhash64(string $data);
 ```
 
-In both cases a string will be returned, representing the digest (hash) of the $data input.
+In both cases a long number will be returned, representing the digest (hash) of the $data input.
+Beware that in case of ```xxhash64``` negative number can be returned since php does not support unsigned long.
 
 
 ## Credits

From d511a40e8db8c9de16e5d0accbb8c3d1d0b4d7e4 Mon Sep 17 00:00:00 2001
From: Tawkir Ahmed Fakir <tawkir@eskimi.com>
Date: Wed, 2 Oct 2019 10:59:38 +0200
Subject: [PATCH 16/21] Add missing header file

---
 xxh3.h | 1618 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 1618 insertions(+)
 create mode 100644 xxh3.h

diff --git a/xxh3.h b/xxh3.h
new file mode 100644
index 0000000..bde3c45
--- /dev/null
+++ b/xxh3.h
@@ -0,0 +1,1618 @@
+/*
+   xxHash - Extremely Fast Hash algorithm
+   Development source file for `xxh3`
+   Copyright (C) 2019-present, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - xxHash source repository : https://github.com/Cyan4973/xxHash
+*/
+
+/* Note :
+   This file is separated for development purposes.
+   It will be integrated into `xxhash.c` when development phase is complete.
+*/
+
+#ifndef XXH3_H
+#define XXH3_H
+
+
+/* ===   Dependencies   === */
+
+#undef XXH_INLINE_ALL   /* in case it's already defined */
+#define XXH_INLINE_ALL
+#include "xxhash.h"
+
+
+/* ===   Compiler specifics   === */
+
+#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* >= C99 */
+#  define XXH_RESTRICT   restrict
+#else
+/* note : it might be useful to define __restrict or __restrict__ for some C++ compilers */
+#  define XXH_RESTRICT   /* disable */
+#endif
+
+#if defined(__GNUC__)
+#  if defined(__AVX2__)
+#    include <immintrin.h>
+#  elif defined(__SSE2__)
+#    include <emmintrin.h>
+#  elif defined(__ARM_NEON__) || defined(__ARM_NEON)
+#    define inline __inline__  /* clang bug */
+#    include <arm_neon.h>
+#    undef inline
+#  endif
+#elif defined(_MSC_VER)
+#  include <intrin.h>
+#endif
+
+/*
+ * Sanity check.
+ *
+ * XXH3 only requires these features to be efficient:
+ *
+ *  - Usable unaligned access
+ *  - A 32-bit or 64-bit ALU
+ *      - If 32-bit, a decent ADC instruction
+ *  - A 32 or 64-bit multiply with a 64-bit result
+ *
+ * Almost all 32-bit and 64-bit targets meet this, except for Thumb-1, the
+ * classic 16-bit only subset of ARM's instruction set.
+ *
+ * First of all, Thumb-1 lacks support for the UMULL instruction which
+ * performs the important long multiply. This means numerous __aeabi_lmul
+ * calls.
+ *
+ * Second of all, the 8 functional registers are just not enough.
+ * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need
+ * Lo registers, and this shuffling results in thousands more MOVs than A32.
+ *
+ * A32 and T32 don't have this limitation. They can access all 14 registers,
+ * do a 32->64 multiply with UMULL, and the flexible operand is helpful too.
+ *
+ * If compiling Thumb-1 for a target which supports ARM instructions, we
+ * will give a warning.
+ *
+ * Usually, if this happens, it is because of an accident and you probably
+ * need to specify -march, as you probably meant to compileh for a newer
+ * architecture.
+ */
+#if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM)
+#   warning "XXH3 is highly inefficient without ARM or Thumb-2."
+#endif
+
+/* ==========================================
+ * Vectorization detection
+ * ========================================== */
+#define XXH_SCALAR 0
+#define XXH_SSE2   1
+#define XXH_AVX2   2
+#define XXH_NEON   3
+#define XXH_VSX    4
+
+#ifndef XXH_VECTOR    /* can be defined on command line */
+#  if defined(__AVX2__)
+#    define XXH_VECTOR XXH_AVX2
+#  elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2))
+#    define XXH_VECTOR XXH_SSE2
+#  elif defined(__GNUC__) /* msvc support maybe later */ \
+  && (defined(__ARM_NEON__) || defined(__ARM_NEON)) \
+  && defined(__LITTLE_ENDIAN__) /* ARM big endian is a thing */
+#    define XXH_VECTOR XXH_NEON
+#  elif defined(__PPC64__) && defined(__POWER8_VECTOR__) && defined(__GNUC__)
+#    define XXH_VECTOR XXH_VSX
+#  else
+#    define XXH_VECTOR XXH_SCALAR
+#  endif
+#endif
+
+/* control alignment of accumulator,
+ * for compatibility with fast vector loads */
+#ifndef XXH_ACC_ALIGN
+#  if XXH_VECTOR == 0   /* scalar */
+#     define XXH_ACC_ALIGN 8
+#  elif XXH_VECTOR == 1  /* sse2 */
+#     define XXH_ACC_ALIGN 16
+#  elif XXH_VECTOR == 2  /* avx2 */
+#     define XXH_ACC_ALIGN 32
+#  elif XXH_VECTOR == 3  /* neon */
+#     define XXH_ACC_ALIGN 16
+#  elif XXH_VECTOR == 4  /* vsx */
+#     define XXH_ACC_ALIGN 16
+#  endif
+#endif
+
+/* U64 XXH_mult32to64(U32 a, U64 b) { return (U64)a * (U64)b; } */
+#if defined(_MSC_VER) && defined(_M_IX86)
+#    include <intrin.h>
+#    define XXH_mult32to64(x, y) __emulu(x, y)
+#else
+#    define XXH_mult32to64(x, y) ((U64)((x) & 0xFFFFFFFF) * (U64)((y) & 0xFFFFFFFF))
+#endif
+
+/* VSX stuff. It's a lot because VSX support is mediocre across compilers and
+ * there is a lot of mischief with endianness. */
+#if XXH_VECTOR == XXH_VSX
+#  include <altivec.h>
+#  undef vector
+typedef __vector unsigned long long U64x2;
+typedef __vector unsigned char U8x16;
+typedef __vector unsigned U32x4;
+
+#ifndef XXH_VSX_BE
+#  ifdef __BIG_ENDIAN__
+#    define XXH_VSX_BE 1
+#  elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__
+#    warning "-maltivec=be is not recommended. Please use native endianness."
+#    define XXH_VSX_BE 1
+#  else
+#    define XXH_VSX_BE 0
+#  endif
+#endif
+
+/* We need some helpers for big endian mode. */
+#if XXH_VSX_BE
+/* A wrapper for POWER9's vec_revb. */
+#  ifdef __POWER9_VECTOR__
+#    define XXH_vec_revb vec_revb
+#  else
+XXH_FORCE_INLINE U64x2 XXH_vec_revb(U64x2 val)
+{
+    U8x16 const vByteSwap = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
+                              0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 };
+    return vec_perm(val, val, vByteSwap);
+}
+#  endif
+
+/* Power8 Crypto gives us vpermxor which is very handy for
+ * PPC64EB.
+ *
+ * U8x16 vpermxor(U8x16 a, U8x16 b, U8x16 mask)
+ * {
+ *     U8x16 ret;
+ *     for (int i = 0; i < 16; i++) {
+ *         ret[i] = a[mask[i] & 0xF] ^ b[mask[i] >> 4];
+ *     }
+ *     return ret;
+ * }
+ *
+ * Because both of the main loops load the key, swap, and xor it with data,
+ * we can combine the key swap into this instruction.
+ */
+#  ifdef vec_permxor
+#    define XXH_vec_permxor vec_permxor
+#  else
+#    define XXH_vec_permxor __builtin_crypto_vpermxor
+#  endif
+#endif
+/*
+ * Because we reinterpret the multiply, there are endian memes: vec_mulo actually becomes
+ * vec_mule.
+ *
+ * Additionally, the intrinsic wasn't added until GCC 8, despite existing for a while.
+ * Clang has an easy way to control this, we can just use the builtin which doesn't swap.
+ * GCC needs inline assembly. */
+#if __has_builtin(__builtin_altivec_vmuleuw)
+#  define XXH_vec_mulo __builtin_altivec_vmulouw
+#  define XXH_vec_mule __builtin_altivec_vmuleuw
+#else
+/* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */
+XXH_FORCE_INLINE U64x2 XXH_vec_mulo(U32x4 a, U32x4 b) {
+    U64x2 result;
+    __asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
+    return result;
+}
+XXH_FORCE_INLINE U64x2 XXH_vec_mule(U32x4 a, U32x4 b) {
+    U64x2 result;
+    __asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
+    return result;
+}
+#endif
+#endif
+
+
+/* ==========================================
+ * XXH3 default settings
+ * ========================================== */
+
+#define XXH_SECRET_DEFAULT_SIZE 192   /* minimum XXH3_SECRET_SIZE_MIN */
+
+#if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN)
+#  error "default keyset is not large enough"
+#endif
+
+XXH_ALIGN(64) static const BYTE kSecret[XXH_SECRET_DEFAULT_SIZE] = {
+    0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c,
+    0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f,
+    0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21,
+    0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c,
+    0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3,
+    0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8,
+    0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d,
+    0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64,
+
+    0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb,
+    0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e,
+    0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce,
+    0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e,
+};
+
+/*
+ * GCC for x86 has a tendency to use SSE in this loop. While it
+ * successfully avoids swapping (as MUL overwrites EAX and EDX), it
+ * slows it down because instead of free register swap shifts, it
+ * must use pshufd and punpckl/hd.
+ *
+ * To prevent this, we use this attribute to shut off SSE.
+ */
+#if defined(__GNUC__) && !defined(__clang__) && defined(__i386__)
+__attribute__((__target__("no-sse")))
+#endif
+static XXH128_hash_t
+XXH_mult64to128(U64 lhs, U64 rhs)
+{
+    /*
+     * GCC/Clang __uint128_t method.
+     *
+     * On most 64-bit targets, GCC and Clang define a __uint128_t type.
+     * This is usually the best way as it usually uses a native long 64-bit
+     * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64.
+     *
+     * Usually.
+     *
+     * Despite being a 32-bit platform, Clang (and emscripten) define this
+     * type despite not having the arithmetic for it. This results in a
+     * laggy compiler builtin call which calculates a full 128-bit multiply.
+     * In that case it is best to use the portable one.
+     * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677
+     */
+#if defined(__GNUC__) && !defined(__wasm__) \
+    && defined(__SIZEOF_INT128__) \
+    || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
+
+    __uint128_t product = (__uint128_t)lhs * (__uint128_t)rhs;
+    XXH128_hash_t const r128 = { (U64)(product), (U64)(product >> 64) };
+    return r128;
+
+    /*
+     * MSVC for x64's _umul128 method.
+     *
+     * U64 _umul128(U64 Multiplier, U64 Multiplicand, U64 *HighProduct);
+     *
+     * This compiles to single operand MUL on x64.
+     */
+#elif defined(_M_X64) || defined(_M_IA64)
+
+#ifndef _MSC_VER
+#   pragma intrinsic(_umul128)
+#endif
+    U64 product_high;
+    U64 const product_low = _umul128(lhs, rhs, &product_high);
+    XXH128_hash_t const r128 = { product_low, product_high };
+    return r128;
+
+#else
+    /*
+     * Portable scalar method. Optimized for 32-bit and 64-bit ALUs.
+     *
+     * This is a fast and simple grade school multiply, which is shown
+     * below with base 10 arithmetic instead of base 0x100000000.
+     *
+     *           9 3 // D2 lhs = 93
+     *         x 7 5 // D2 rhs = 75
+     *     ----------
+     *           1 5 // D2 lo_lo = (93 % 10) * (75 % 10)
+     *         4 5 | // D2 hi_lo = (93 / 10) * (75 % 10)
+     *         2 1 | // D2 lo_hi = (93 % 10) * (75 / 10)
+     *     + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10)
+     *     ---------
+     *         2 7 | // D2 cross  = (15 / 10) + (45 % 10) + 21
+     *     + 6 7 | | // D2 upper  = (27 / 10) + (45 / 10) + 63
+     *     ---------
+     *       6 9 7 5
+     *
+     * The reasons for adding the products like this are:
+     *  1. It avoids manual carry tracking. Just like how
+     *     (9 * 9) + 9 + 9 = 99, the same applies with this for
+     *     UINT64_MAX. This avoids a lot of complexity.
+     *
+     *  2. It hints for, and on Clang, compiles to, the powerful UMAAL
+     *     instruction available in ARMv6+ A32/T32, which is shown below:
+     *
+     *         void UMAAL(U32 *RdLo, U32 *RdHi, U32 Rn, U32 Rm)
+     *         {
+     *             U64 product = (U64)*RdLo * (U64)*RdHi + Rn + Rm;
+     *             *RdLo = (U32)(product & 0xFFFFFFFF);
+     *             *RdHi = (U32)(product >> 32);
+     *         }
+     *
+     *     This instruction was designed for efficient long multiplication,
+     *     and allows this to be calculated in only 4 instructions which
+     *     is comparable to some 64-bit ALUs.
+     *
+     *  3. It isn't terrible on other platforms. Usually this will be
+     *     a couple of 32-bit ADD/ADCs.
+     */
+
+    /* First calculate all of the cross products. */
+    U64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF);
+    U64 const hi_lo = XXH_mult32to64(lhs >> 32,        rhs & 0xFFFFFFFF);
+    U64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32);
+    U64 const hi_hi = XXH_mult32to64(lhs >> 32,        rhs >> 32);
+
+    /* Now add the products together. These will never overflow. */
+    U64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
+    U64 const upper = (hi_lo >> 32) + (cross >> 32)        + hi_hi;
+    U64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF);
+
+    XXH128_hash_t r128 = { lower, upper };
+    return r128;
+#endif
+}
+
+/*
+ * We want to keep the attribute here because a target switch
+ * disables inlining.
+ *
+ * Does a 64-bit to 128-bit multiply, then XOR folds it.
+ * The reason for the separate function is to prevent passing
+ * too many structs around by value. This will hopefully inline
+ * the multiply, but we don't force it.
+ */
+#if defined(__GNUC__) && !defined(__clang__) && defined(__i386__)
+__attribute__((__target__("no-sse")))
+#endif
+static U64
+XXH3_mul128_fold64(U64 lhs, U64 rhs)
+{
+    XXH128_hash_t product = XXH_mult64to128(lhs, rhs);
+    return product.low64 ^ product.high64;
+}
+
+
+static XXH64_hash_t XXH3_avalanche(U64 h64)
+{
+    h64 ^= h64 >> 37;
+    h64 *= PRIME64_3;
+    h64 ^= h64 >> 32;
+    return h64;
+}
+
+
+/* ==========================================
+ * Short keys
+ * ========================================== */
+
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_len_1to3_64b(const BYTE* data, size_t len, const BYTE* keyPtr, XXH64_hash_t seed)
+{
+    XXH_ASSERT(data != NULL);
+    XXH_ASSERT(1 <= len && len <= 3);
+    XXH_ASSERT(keyPtr != NULL);
+    {   BYTE const c1 = data[0];
+        BYTE const c2 = data[len >> 1];
+        BYTE const c3 = data[len - 1];
+        U32  const combined = ((U32)c1) + (((U32)c2) << 8) + (((U32)c3) << 16) + (((U32)len) << 24);
+        U64  const keyed = (U64)combined ^ (XXH_readLE32(keyPtr) + seed);
+        U64  const mixed = keyed * PRIME64_1;
+        return XXH3_avalanche(mixed);
+    }
+}
+
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_len_4to8_64b(const BYTE* data, size_t len, const BYTE* keyPtr, XXH64_hash_t seed)
+{
+    XXH_ASSERT(data != NULL);
+    XXH_ASSERT(keyPtr != NULL);
+    XXH_ASSERT(4 <= len && len <= 8);
+    {   U32 const in1 = XXH_readLE32(data);
+        U32 const in2 = XXH_readLE32(data + len - 4);
+        U64 const in64 = in1 + ((U64)in2 << 32);
+        U64 const keyed = in64 ^ (XXH_readLE64(keyPtr) + seed);
+        U64 const mix64 = len + ((keyed ^ (keyed >> 51)) * PRIME32_1);
+        return XXH3_avalanche((mix64 ^ (mix64 >> 47)) * PRIME64_2);
+    }
+}
+
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_len_9to16_64b(const BYTE* data, size_t len, const BYTE* keyPtr, XXH64_hash_t seed)
+{
+    XXH_ASSERT(data != NULL);
+    XXH_ASSERT(keyPtr != NULL);
+    XXH_ASSERT(9 <= len && len <= 16);
+    {   U64 const ll1 = XXH_readLE64(data) ^ (XXH_readLE64(keyPtr) + seed);
+        U64 const ll2 = XXH_readLE64(data + len - 8) ^ (XXH_readLE64(keyPtr + 8) - seed);
+        U64 const acc = len + (ll1 + ll2) + XXH3_mul128_fold64(ll1, ll2);
+        return XXH3_avalanche(acc);
+    }
+}
+
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_len_0to16_64b(const BYTE* data, size_t len, const BYTE* keyPtr, XXH64_hash_t seed)
+{
+    XXH_ASSERT(len <= 16);
+    {   if (len > 8) return XXH3_len_9to16_64b(data, len, keyPtr, seed);
+        if (len >= 4) return XXH3_len_4to8_64b(data, len, keyPtr, seed);
+        if (len) return XXH3_len_1to3_64b(data, len, keyPtr, seed);
+        return 0;
+    }
+}
+
+
+/* ===    Long Keys    === */
+
+#define STRIPE_LEN 64
+#define XXH_SECRET_CONSUME_RATE 8   /* nb of secret bytes consumed at each accumulation */
+#define ACC_NB (STRIPE_LEN / sizeof(U64))
+
+typedef enum { XXH3_acc_64bits, XXH3_acc_128bits } XXH3_accWidth_e;
+
+XXH_FORCE_INLINE void
+XXH3_accumulate_512(      void* XXH_RESTRICT acc,
+                    const void* XXH_RESTRICT data,
+                    const void* XXH_RESTRICT key,
+                    XXH3_accWidth_e accWidth)
+{
+#if (XXH_VECTOR == XXH_AVX2)
+
+    XXH_ASSERT((((size_t)acc) & 31) == 0);
+    {   XXH_ALIGN(32) __m256i* const xacc  =       (__m256i *) acc;
+        const         __m256i* const xdata = (const __m256i *) data;  /* not really aligned, just for ptr arithmetic, and because _mm256_loadu_si256() requires this type */
+        const         __m256i* const xkey  = (const __m256i *) key;   /* not really aligned, just for ptr arithmetic, and because _mm256_loadu_si256() requires this type */
+
+        size_t i;
+        for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) {
+            __m256i const d   = _mm256_loadu_si256 (xdata+i);
+            __m256i const k   = _mm256_loadu_si256 (xkey+i);
+            __m256i const dk  = _mm256_xor_si256 (d,k);                                  /* uint32 dk[8]  = {d0+k0, d1+k1, d2+k2, d3+k3, ...} */
+            __m256i const mul = _mm256_mul_epu32 (dk, _mm256_shuffle_epi32 (dk, 0x31));  /* uint64 mul[4] = {dk0*dk1, dk2*dk3, ...} */
+            if (accWidth == XXH3_acc_128bits) {
+                __m256i const dswap = _mm256_shuffle_epi32(d, _MM_SHUFFLE(1,0,3,2));
+                __m256i const add = _mm256_add_epi64(xacc[i], dswap);
+                xacc[i]  = _mm256_add_epi64(mul, add);
+            } else {  /* XXH3_acc_64bits */
+                __m256i const add = _mm256_add_epi64(xacc[i], d);
+                xacc[i]  = _mm256_add_epi64(mul, add);
+            }
+    }   }
+
+#elif (XXH_VECTOR == XXH_SSE2)
+
+    XXH_ASSERT((((size_t)acc) & 15) == 0);
+    {   XXH_ALIGN(16) __m128i* const xacc  =       (__m128i *) acc;   /* presumed */
+        const         __m128i* const xdata = (const __m128i *) data;  /* not really aligned, just for ptr arithmetic, and because _mm_loadu_si128() requires this type */
+        const         __m128i* const xkey  = (const __m128i *) key;   /* not really aligned, just for ptr arithmetic, and because _mm_loadu_si128() requires this type */
+
+        size_t i;
+        for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) {
+            __m128i const d   = _mm_loadu_si128 (xdata+i);
+            __m128i const k   = _mm_loadu_si128 (xkey+i);
+            __m128i const dk  = _mm_xor_si128 (d,k);                                 /* uint32 dk[4]  = {d0+k0, d1+k1, d2+k2, d3+k3} */
+            __m128i const mul = _mm_mul_epu32 (dk, _mm_shuffle_epi32 (dk, 0x31));    /* uint64 mul[2] = {dk0*dk1,dk2*dk3} */
+            if (accWidth == XXH3_acc_128bits) {
+                __m128i const dswap = _mm_shuffle_epi32(d, _MM_SHUFFLE(1,0,3,2));
+                __m128i const add = _mm_add_epi64(xacc[i], dswap);
+                xacc[i]  = _mm_add_epi64(mul, add);
+            } else {  /* XXH3_acc_64bits */
+                __m128i const add = _mm_add_epi64(xacc[i], d);
+                xacc[i]  = _mm_add_epi64(mul, add);
+            }
+    }   }
+
+#elif (XXH_VECTOR == XXH_NEON)
+
+    XXH_ASSERT((((size_t)acc) & 15) == 0);
+    {
+        XXH_ALIGN(16) uint64x2_t* const xacc = (uint64x2_t *) acc;
+        /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. */
+        uint8_t const* const xdata = (const uint8_t *) data;
+        uint8_t const* const xkey  = (const uint8_t *) key;
+
+        size_t i;
+        for (i=0; i < STRIPE_LEN / sizeof(uint64x2_t); i++) {
+#if !defined(__aarch64__) && !defined(__arm64__) && defined(__GNUC__) /* ARM32-specific hack */
+            /* vzip on ARMv7 Clang generates a lot of vmovs (technically vorrs) without this.
+             * vzip on 32-bit ARM NEON will overwrite the original register, and I think that Clang
+             * assumes I don't want to destroy it and tries to make a copy. This slows down the code
+             * a lot.
+             * aarch64 not only uses an entirely different syntax, but it requires three
+             * instructions...
+             *    ext    v1.16B, v0.16B, #8    // select high bits because aarch64 can't address them directly
+             *    zip1   v3.2s, v0.2s, v1.2s   // first zip
+             *    zip2   v2.2s, v0.2s, v1.2s   // second zip
+             * ...to do what ARM does in one:
+             *    vzip.32 d0, d1               // Interleave high and low bits and overwrite. */
+
+            /* data_vec = xdata[i]; */
+            uint8x16_t const data_vec    = vld1q_u8(xdata + (i * 16));
+            /* key_vec  = xkey[i];  */
+            uint8x16_t const key_vec     = vld1q_u8(xkey  + (i * 16));
+            /* data_key = data_vec ^ key_vec; */
+            uint32x4_t       data_key;
+
+            if (accWidth == XXH3_acc_64bits) {
+                /* Add first to prevent register swaps */
+                /* xacc[i] += data_vec; */
+                xacc[i] = vaddq_u64 (xacc[i], vreinterpretq_u64_u8(data_vec));
+            } else {  /* XXH3_acc_128bits */
+                /* xacc[i] += swap(data_vec); */
+                /* can probably be optimized better */
+                uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec);
+                uint64x2_t const swapped= vextq_u64(data64, data64, 1);
+                xacc[i] = vaddq_u64 (xacc[i], swapped);
+            }
+
+            data_key = vreinterpretq_u32_u8(veorq_u8(data_vec, key_vec));
+
+            /* Here's the magic. We use the quirkiness of vzip to shuffle data_key in place.
+             * shuffle: data_key[0, 1, 2, 3] = data_key[0, 2, 1, 3] */
+            __asm__("vzip.32 %e0, %f0" : "+w" (data_key));
+            /* xacc[i] += (uint64x2_t) data_key[0, 1] * (uint64x2_t) data_key[2, 3]; */
+            xacc[i] = vmlal_u32(xacc[i], vget_low_u32(data_key), vget_high_u32(data_key));
+
+#else
+            /* On aarch64, vshrn/vmovn seems to be equivalent to, if not faster than, the vzip method. */
+
+            /* data_vec = xdata[i]; */
+            uint8x16_t const data_vec    = vld1q_u8(xdata + (i * 16));
+            /* key_vec  = xkey[i];  */
+            uint8x16_t const key_vec     = vld1q_u8(xkey  + (i * 16));
+            /* data_key = data_vec ^ key_vec; */
+            uint64x2_t const data_key    = vreinterpretq_u64_u8(veorq_u8(data_vec, key_vec));
+            /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF); */
+            uint32x2_t const data_key_lo = vmovn_u64  (data_key);
+            /* data_key_hi = (uint32x2_t) (data_key >> 32); */
+            uint32x2_t const data_key_hi = vshrn_n_u64 (data_key, 32);
+            if (accWidth == XXH3_acc_64bits) {
+                /* xacc[i] += data_vec; */
+                xacc[i] = vaddq_u64 (xacc[i], vreinterpretq_u64_u8(data_vec));
+            } else {  /* XXH3_acc_128bits */
+                /* xacc[i] += swap(data_vec); */
+                uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec);
+                uint64x2_t const swapped= vextq_u64(data64, data64, 1);
+                xacc[i] = vaddq_u64 (xacc[i], swapped);
+            }
+            /* xacc[i] += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */
+            xacc[i] = vmlal_u32 (xacc[i], data_key_lo, data_key_hi);
+
+#endif
+        }
+    }
+
+#elif (XXH_VECTOR == XXH_VSX)
+          U64x2* const xacc =        (U64x2*) acc;    /* presumed aligned */
+    U64x2 const* const xdata = (U64x2 const*) data;   /* no alignment restriction */
+    U64x2 const* const xkey  = (U64x2 const*) key;    /* no alignment restriction */
+    U64x2 const v32 = { 32,  32 };
+#if XXH_VSX_BE
+    U8x16 const vXorSwap  = { 0x07, 0x16, 0x25, 0x34, 0x43, 0x52, 0x61, 0x70,
+                              0x8F, 0x9E, 0xAD, 0xBC, 0xCB, 0xDA, 0xE9, 0xF8 };
+#endif
+    size_t i;
+    for (i = 0; i < STRIPE_LEN / sizeof(U64x2); i++) {
+        /* data_vec = xdata[i]; */
+        /* key_vec = xkey[i]; */
+#if XXH_VSX_BE
+        /* byteswap */
+        U64x2 const data_vec = XXH_vec_revb(vec_vsx_ld(0, xdata + i));
+        U64x2 const key_raw = vec_vsx_ld(0, xkey + i);
+        /* See comment above. data_key = data_vec ^ swap(xkey[i]); */
+        U64x2 const data_key = (U64x2)XXH_vec_permxor((U8x16)data_vec, (U8x16)key_raw, vXorSwap);
+#else
+        U64x2 const data_vec = vec_vsx_ld(0, xdata + i);
+        U64x2 const key_vec = vec_vsx_ld(0, xkey + i);
+        U64x2 const data_key = data_vec ^ key_vec;
+#endif
+        /* shuffled = (data_key << 32) | (data_key >> 32); */
+        U32x4 const shuffled = (U32x4)vec_rl(data_key, v32);
+        /* product = ((U64x2)data_key & 0xFFFFFFFF) * ((U64x2)shuffled & 0xFFFFFFFF); */
+        U64x2 const product = XXH_vec_mulo((U32x4)data_key, shuffled);
+        xacc[i] += product;
+
+        if (accWidth == XXH3_acc_64bits) {
+            xacc[i] += data_vec;
+        } else {  /* XXH3_acc_128bits */
+            /* swap high and low halves */
+            U64x2 const data_swapped = vec_xxpermdi(data_vec, data_vec, 2);
+            xacc[i] += data_swapped;
+        }
+    }
+
+#else   /* scalar variant of Accumulator - universal */
+
+    XXH_ALIGN(XXH_ACC_ALIGN) U64* const xacc = (U64*) acc;    /* presumed aligned on 32-bytes boundaries, little hint for the auto-vectorizer */
+    const BYTE* const xdata = (const BYTE*) data;  /* no alignment restriction */
+    const BYTE* const xkey  = (const BYTE*) key;   /* no alignment restriction */
+    size_t i;
+    XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0);
+    for (i=0; i < ACC_NB; i+=2) {
+        U64 const in1 = XXH_readLE64(xdata + 8*i);
+        U64 const in2 = XXH_readLE64(xdata + 8*(i+1));
+        U64 const key1  = XXH_readLE64(xkey + 8*i);
+        U64 const key2  = XXH_readLE64(xkey + 8*(i+1));
+        U64 const data_key1 = key1 ^ in1;
+        U64 const data_key2 = key2 ^ in2;
+        xacc[i]   += XXH_mult32to64(data_key1 & 0xFFFFFFFF, data_key1 >> 32);
+        xacc[i+1] += XXH_mult32to64(data_key2 & 0xFFFFFFFF, data_key2 >> 32);
+        if (accWidth == XXH3_acc_128bits) {
+            xacc[i]   += in2;
+            xacc[i+1] += in1;
+        } else {  /* XXH3_acc_64bits */
+            xacc[i]   += in1;
+            xacc[i+1] += in2;
+        }
+    }
+#endif
+}
+
+XXH_FORCE_INLINE void
+XXH3_scrambleAcc(void* XXH_RESTRICT acc, const void* XXH_RESTRICT key)
+{
+#if (XXH_VECTOR == XXH_AVX2)
+
+    XXH_ASSERT((((size_t)acc) & 31) == 0);
+    {   XXH_ALIGN(32) __m256i* const xacc = (__m256i*) acc;
+        const         __m256i* const xkey = (const __m256i *) key;   /* not really aligned, just for ptr arithmetic, and because _mm256_loadu_si256() requires this argument type */
+        const __m256i prime32 = _mm256_set1_epi32((int)PRIME32_1);
+
+        size_t i;
+        for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) {
+            __m256i data = xacc[i];
+            __m256i const shifted = _mm256_srli_epi64(data, 47);
+            data = _mm256_xor_si256(data, shifted);
+
+            {   __m256i const k   = _mm256_loadu_si256 (xkey+i);
+                __m256i const dk  = _mm256_xor_si256   (data, k);
+
+                __m256i const dk1 = _mm256_mul_epu32 (dk, prime32);
+
+                __m256i const d2  = _mm256_shuffle_epi32 (dk, 0x31);
+                __m256i const dk2 = _mm256_mul_epu32 (d2, prime32);
+                __m256i const dk2h= _mm256_slli_epi64 (dk2, 32);
+
+                xacc[i] = _mm256_add_epi64(dk1, dk2h);
+        }   }
+    }
+
+#elif (XXH_VECTOR == XXH_SSE2)
+
+    {   XXH_ALIGN(16) __m128i* const xacc = (__m128i*) acc;
+        const         __m128i* const xkey = (const __m128i *) key;   /* not really aligned, just for ptr arithmetic */
+        const __m128i prime32 = _mm_set1_epi32((int)PRIME32_1);
+
+        size_t i;
+        for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) {
+            __m128i data = xacc[i];
+            __m128i const shifted = _mm_srli_epi64(data, 47);
+            data = _mm_xor_si128(data, shifted);
+
+            {   __m128i const k   = _mm_loadu_si128 (xkey+i);
+                __m128i const dk  = _mm_xor_si128   (data,k);
+
+                __m128i const dk1 = _mm_mul_epu32 (dk, prime32);
+
+                __m128i const d2  = _mm_shuffle_epi32 (dk, 0x31);
+                __m128i const dk2 = _mm_mul_epu32 (d2, prime32);
+                __m128i const dk2h= _mm_slli_epi64(dk2, 32);
+
+                xacc[i] = _mm_add_epi64(dk1, dk2h);
+        }   }
+    }
+
+#elif (XXH_VECTOR == XXH_NEON)
+
+    XXH_ASSERT((((size_t)acc) & 15) == 0);
+
+    {   uint64x2_t* const xacc =     (uint64x2_t*) acc;
+        uint8_t const* const xkey = (uint8_t const*) key;
+        uint32x2_t const prime     = vdup_n_u32 (PRIME32_1);
+
+        size_t i;
+        for (i=0; i < STRIPE_LEN/sizeof(uint64x2_t); i++) {
+            /* data_vec = xacc[i] ^ (xacc[i] >> 47); */
+            uint64x2_t const   acc_vec  = xacc[i];
+            uint64x2_t const   shifted  = vshrq_n_u64 (acc_vec, 47);
+            uint64x2_t const   data_vec = veorq_u64   (acc_vec, shifted);
+
+            /* key_vec  = xkey[i]; */
+            uint32x4_t const   key_vec  = vreinterpretq_u32_u8(vld1q_u8(xkey + (i * 16)));
+            /* data_key = data_vec ^ key_vec; */
+            uint32x4_t const   data_key = veorq_u32   (vreinterpretq_u32_u64(data_vec), key_vec);
+            /* shuffled = { data_key[0, 2], data_key[1, 3] }; */
+            uint32x2x2_t const shuffled = vzip_u32    (vget_low_u32(data_key), vget_high_u32(data_key));
+
+            /* data_key *= PRIME32_1 */
+
+            /* prod_hi = (data_key >> 32) * PRIME32_1; */
+            uint64x2_t const   prod_hi = vmull_u32    (shuffled.val[1], prime);
+            /* xacc[i] = prod_hi << 32; */
+            xacc[i] = vshlq_n_u64(prod_hi, 32);
+            /* xacc[i] += (prod_hi & 0xFFFFFFFF) * PRIME32_1; */
+            xacc[i] = vmlal_u32(xacc[i], shuffled.val[0], prime);
+    }   }
+
+#elif (XXH_VECTOR == XXH_VSX)
+
+          U64x2* const xacc =       (U64x2*) acc;
+    const U64x2* const xkey = (const U64x2*) key;
+    /* constants */
+    U64x2 const v32  = { 32, 32 };
+    U64x2 const v47 = { 47, 47 };
+    U32x4 const prime = { PRIME32_1, PRIME32_1, PRIME32_1, PRIME32_1 };
+    size_t i;
+#if XXH_VSX_BE
+    /* endian swap */
+    U8x16 const vXorSwap  = { 0x07, 0x16, 0x25, 0x34, 0x43, 0x52, 0x61, 0x70,
+                              0x8F, 0x9E, 0xAD, 0xBC, 0xCB, 0xDA, 0xE9, 0xF8 };
+#endif
+    for (i = 0; i < STRIPE_LEN / sizeof(U64x2); i++) {
+        U64x2 const acc_vec  = xacc[i];
+        U64x2 const data_vec = acc_vec ^ (acc_vec >> v47);
+        /* key_vec = xkey[i]; */
+#if XXH_VSX_BE
+        /* swap bytes words */
+        U64x2 const key_raw  = vec_vsx_ld(0, xkey + i);
+	U64x2 const data_key = (U64x2)XXH_vec_permxor((U8x16)data_vec, (U8x16)key_raw, vXorSwap);
+#else
+        U64x2 const key_vec  = vec_vsx_ld(0, xkey + i);
+        U64x2 const data_key = data_vec ^ key_vec;
+#endif
+
+        /* data_key *= PRIME32_1 */
+
+        /* prod_lo = ((U64x2)data_key & 0xFFFFFFFF) * ((U64x2)prime & 0xFFFFFFFF);  */
+        U64x2 const prod_even  = XXH_vec_mule((U32x4)data_key, prime);
+        /* prod_hi = ((U64x2)data_key >> 32) * ((U64x2)prime >> 32);  */
+        U64x2 const prod_odd  = XXH_vec_mulo((U32x4)data_key, prime);
+        xacc[i] = prod_odd + (prod_even << v32);
+    }
+
+#else   /* scalar variant of Scrambler - universal */
+
+    XXH_ALIGN(XXH_ACC_ALIGN) U64* const xacc = (U64*) acc;   /* presumed aligned on 32-bytes boundaries, little hint for the auto-vectorizer */
+    const BYTE* const xkey = (const BYTE*) key;   /* no alignment restriction */
+    size_t i;
+    XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0);
+
+    for (i=0; i < ACC_NB; i++) {
+        U64 const key64 = XXH_readLE64(xkey + 8*i);
+        U64 acc64 = xacc[i];
+        acc64 ^= acc64 >> 47;
+        acc64 ^= key64;
+        acc64 *= PRIME32_1;
+        xacc[i] = acc64;
+    }
+
+#endif
+}
+
+/* assumption : nbStripes will not overflow secret size */
+XXH_FORCE_INLINE void
+XXH3_accumulate(       U64* XXH_RESTRICT acc,
+                const BYTE* XXH_RESTRICT data,
+                const BYTE* XXH_RESTRICT secret,
+                      size_t nbStripes,
+                      XXH3_accWidth_e accWidth)
+{
+    size_t n;
+    for (n = 0; n < nbStripes; n++ ) {
+        XXH3_accumulate_512(acc,
+                            data   + n*STRIPE_LEN,
+                            secret + n*XXH_SECRET_CONSUME_RATE,
+                            accWidth);
+    }
+}
+
+/* note : clang auto-vectorizes well in SS2 mode _if_ this function is `static`,
+ *        and doesn't auto-vectorize it at all if it is `FORCE_INLINE`.
+ *        However, it auto-vectorizes better AVX2 if it is `FORCE_INLINE`
+ *        Pretty much every other modes and compilers prefer `FORCE_INLINE`.
+ */
+#if defined(__clang__) && (XXH_VECTOR==0) && !defined(__AVX2__)
+static void
+#else
+XXH_FORCE_INLINE void
+#endif
+XXH3_hashLong_internal_loop( U64* XXH_RESTRICT acc,
+                      const BYTE* XXH_RESTRICT data, size_t len,
+                      const BYTE* XXH_RESTRICT secret, size_t secretSize,
+                            XXH3_accWidth_e accWidth)
+{
+    size_t const nb_rounds = (secretSize - STRIPE_LEN) / XXH_SECRET_CONSUME_RATE;
+    size_t const block_len = STRIPE_LEN * nb_rounds;
+    size_t const nb_blocks = len / block_len;
+
+    size_t n;
+
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
+
+    for (n = 0; n < nb_blocks; n++) {
+        XXH3_accumulate(acc, data + n*block_len, secret, nb_rounds, accWidth);
+        XXH3_scrambleAcc(acc, secret + secretSize - STRIPE_LEN);
+    }
+
+    /* last partial block */
+    XXH_ASSERT(len > STRIPE_LEN);
+    {   size_t const nbStripes = (len - (block_len * nb_blocks)) / STRIPE_LEN;
+        XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE));
+        XXH3_accumulate(acc, data + nb_blocks*block_len, secret, nbStripes, accWidth);
+
+        /* last stripe */
+        if (len & (STRIPE_LEN - 1)) {
+            const BYTE* const p = data + len - STRIPE_LEN;
+#define XXH_SECRET_LASTACC_START 7  /* do not align on 8, so that secret is different from scrambler */
+            XXH3_accumulate_512(acc, p, secret + secretSize - STRIPE_LEN - XXH_SECRET_LASTACC_START, accWidth);
+    }   }
+}
+
+XXH_FORCE_INLINE U64
+XXH3_mix2Accs(const U64* XXH_RESTRICT acc, const BYTE* XXH_RESTRICT secret)
+{
+    return XXH3_mul128_fold64(
+               acc[0] ^ XXH_readLE64(secret),
+               acc[1] ^ XXH_readLE64(secret+8) );
+}
+
+static XXH64_hash_t
+XXH3_mergeAccs(const U64* XXH_RESTRICT acc, const BYTE* XXH_RESTRICT secret, U64 start)
+{
+    U64 result64 = start;
+
+    result64 += XXH3_mix2Accs(acc+0, secret +  0);
+    result64 += XXH3_mix2Accs(acc+2, secret + 16);
+    result64 += XXH3_mix2Accs(acc+4, secret + 32);
+    result64 += XXH3_mix2Accs(acc+6, secret + 48);
+
+    return XXH3_avalanche(result64);
+}
+
+#define XXH3_INIT_ACC { PRIME32_3, PRIME64_1, PRIME64_2, PRIME64_3, \
+                        PRIME64_4, PRIME32_2, PRIME64_5, PRIME32_1 };
+
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_hashLong_internal(const BYTE* XXH_RESTRICT data, size_t len,
+                       const BYTE* XXH_RESTRICT secret, size_t secretSize)
+{
+    XXH_ALIGN(XXH_ACC_ALIGN) U64 acc[ACC_NB] = XXH3_INIT_ACC;
+
+    XXH3_hashLong_internal_loop(acc, data, len, secret, secretSize, XXH3_acc_64bits);
+
+    /* converge into final hash */
+    XXH_STATIC_ASSERT(sizeof(acc) == 64);
+#define XXH_SECRET_MERGEACCS_START 11  /* do not align on 8, so that secret is different from accumulator */
+    XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
+    return XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START, (U64)len * PRIME64_1);
+}
+
+
+XXH_NO_INLINE XXH64_hash_t    /* It's important for performance that XXH3_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
+XXH3_hashLong_64b_defaultSecret(const BYTE* XXH_RESTRICT data, size_t len)
+{
+    return XXH3_hashLong_internal(data, len, kSecret, sizeof(kSecret));
+}
+
+XXH_NO_INLINE XXH64_hash_t    /* It's important for performance that XXH3_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
+XXH3_hashLong_64b_withSecret(const BYTE* XXH_RESTRICT data, size_t len,
+                             const BYTE* XXH_RESTRICT secret, size_t secretSize)
+{
+    return XXH3_hashLong_internal(data, len, secret, secretSize);
+}
+
+
+XXH_FORCE_INLINE void XXH_writeLE64(void* dst, U64 v64)
+{
+    if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64);
+    memcpy(dst, &v64, sizeof(v64));
+}
+
+/* XXH3_initKeySeed() :
+ * destination `customSecret` is presumed allocated and same size as `kSecret`.
+ */
+XXH_FORCE_INLINE void XXH3_initKeySeed(BYTE* customSecret, U64 seed64)
+{
+    int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16;
+    int i;
+
+    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);
+
+    for (i=0; i < nbRounds; i++) {
+        XXH_writeLE64(customSecret + 16*i,     XXH_readLE64(kSecret + 16*i)     + seed64);
+        XXH_writeLE64(customSecret + 16*i + 8, XXH_readLE64(kSecret + 16*i + 8) - seed64);
+    }
+}
+
+
+/* XXH3_hashLong_64b_withSeed() :
+ * Generate a custom key,
+ * based on alteration of default kSecret with the seed,
+ * and then use this key for long mode hashing.
+ * This operation is decently fast but nonetheless costs a little bit of time.
+ * Try to avoid it whenever possible (typically when seed==0).
+ */
+XXH_NO_INLINE XXH64_hash_t    /* It's important for performance that XXH3_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
+XXH3_hashLong_64b_withSeed(const BYTE* data, size_t len, XXH64_hash_t seed)
+{
+    XXH_ALIGN(8) BYTE secret[XXH_SECRET_DEFAULT_SIZE];
+    if (seed==0) return XXH3_hashLong_64b_defaultSecret(data, len);
+    XXH3_initKeySeed(secret, seed);
+    return XXH3_hashLong_internal(data, len, secret, sizeof(secret));
+}
+
+
+XXH_FORCE_INLINE U64 XXH3_mix16B(const BYTE* XXH_RESTRICT data,
+                                 const BYTE* XXH_RESTRICT key, U64 seed64)
+{
+    U64 const ll1 = XXH_readLE64(data);
+    U64 const ll2 = XXH_readLE64(data+8);
+    return XXH3_mul128_fold64(
+               ll1 ^ (XXH_readLE64(key)   + seed64),
+               ll2 ^ (XXH_readLE64(key+8) - seed64) );
+}
+
+
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_len_17to128_64b(const BYTE* XXH_RESTRICT data, size_t len,
+                     const BYTE* XXH_RESTRICT secret, size_t secretSize,
+                     XXH64_hash_t seed)
+{
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
+    XXH_ASSERT(16 < len && len <= 128);
+
+    {   U64 acc = len * PRIME64_1;
+        if (len > 32) {
+            if (len > 64) {
+                if (len > 96) {
+                    acc += XXH3_mix16B(data+48, secret+96, seed);
+                    acc += XXH3_mix16B(data+len-64, secret+112, seed);
+                }
+                acc += XXH3_mix16B(data+32, secret+64, seed);
+                acc += XXH3_mix16B(data+len-48, secret+80, seed);
+            }
+            acc += XXH3_mix16B(data+16, secret+32, seed);
+            acc += XXH3_mix16B(data+len-32, secret+48, seed);
+        }
+        acc += XXH3_mix16B(data+0, secret+0, seed);
+        acc += XXH3_mix16B(data+len-16, secret+16, seed);
+
+        return XXH3_avalanche(acc);
+    }
+}
+
+#define XXH3_MIDSIZE_MAX 240
+
+XXH_NO_INLINE XXH64_hash_t
+XXH3_len_129to240_64b(const BYTE* XXH_RESTRICT data, size_t len,
+                      const BYTE* XXH_RESTRICT secret, size_t secretSize,
+                      XXH64_hash_t seed)
+{
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
+    XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
+
+    #define XXH3_MIDSIZE_STARTOFFSET 3
+    #define XXH3_MIDSIZE_LASTOFFSET  17
+
+    {   U64 acc = len * PRIME64_1;
+        int const nbRounds = (int)len / 16;
+        int i;
+        for (i=0; i<8; i++) {
+            acc += XXH3_mix16B(data+(16*i), secret+(16*i), seed);
+        }
+        acc = XXH3_avalanche(acc);
+        XXH_ASSERT(nbRounds >= 8);
+        for (i=8 ; i < nbRounds; i++) {
+            acc += XXH3_mix16B(data+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed);
+        }
+        /* last bytes */
+        acc += XXH3_mix16B(data + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed);
+        return XXH3_avalanche(acc);
+    }
+}
+
+/* ===   Public entry point   === */
+
+XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* data, size_t len)
+{
+    if (len <= 16) return XXH3_len_0to16_64b((const BYTE*)data, len, kSecret, 0);
+    if (len <= 128) return XXH3_len_17to128_64b((const BYTE*)data, len, kSecret, sizeof(kSecret), 0);
+    if (len <= XXH3_MIDSIZE_MAX) return XXH3_len_129to240_64b((const BYTE*)data, len, kSecret, sizeof(kSecret), 0);
+    return XXH3_hashLong_64b_defaultSecret((const BYTE*)data, len);
+}
+
+XXH_PUBLIC_API XXH64_hash_t
+XXH3_64bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize)
+{
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
+    /* if an action must be taken should `secret` conditions not be respected,
+     * it should be done here.
+     * For now, it's a contract pre-condition.
+     * Adding a check and a branch here would cost performance at every hash */
+     if (len <= 16) return XXH3_len_0to16_64b((const BYTE*)data, len, (const BYTE*)secret, 0);
+     if (len <= 128) return XXH3_len_17to128_64b((const BYTE*)data, len, (const BYTE*)secret, secretSize, 0);
+     if (len <= XXH3_MIDSIZE_MAX) return XXH3_len_129to240_64b((const BYTE*)data, len, (const BYTE*)secret, secretSize, 0);
+     return XXH3_hashLong_64b_withSecret((const BYTE*)data, len, (const BYTE*)secret, secretSize);
+}
+
+XXH_PUBLIC_API XXH64_hash_t
+XXH3_64bits_withSeed(const void* data, size_t len, XXH64_hash_t seed)
+{
+    if (len <= 16) return XXH3_len_0to16_64b((const BYTE*)data, len, kSecret, seed);
+    if (len <= 128) return XXH3_len_17to128_64b((const BYTE*)data, len, kSecret, sizeof(kSecret), seed);
+    if (len <= XXH3_MIDSIZE_MAX) return XXH3_len_129to240_64b((const BYTE*)data, len, kSecret, sizeof(kSecret), seed);
+    return XXH3_hashLong_64b_withSeed((const BYTE*)data, len, seed);
+}
+
+/* ===   XXH3 streaming   === */
+
+XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void)
+{
+    return (XXH3_state_t*)XXH_malloc(sizeof(XXH3_state_t));
+}
+
+XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr)
+{
+    XXH_free(statePtr);
+    return XXH_OK;
+}
+
+XXH_PUBLIC_API void
+XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state)
+{
+    memcpy(dst_state, src_state, sizeof(*dst_state));
+}
+
+static void
+XXH3_64bits_reset_internal(XXH3_state_t* statePtr,
+                           XXH64_hash_t seed,
+                           const BYTE* secret, size_t secretSize)
+{
+    XXH_ASSERT(statePtr != NULL);
+    memset(statePtr, 0, sizeof(*statePtr));
+    statePtr->acc[0] = PRIME32_3;
+    statePtr->acc[1] = PRIME64_1;
+    statePtr->acc[2] = PRIME64_2;
+    statePtr->acc[3] = PRIME64_3;
+    statePtr->acc[4] = PRIME64_4;
+    statePtr->acc[5] = PRIME32_2;
+    statePtr->acc[6] = PRIME64_5;
+    statePtr->acc[7] = PRIME32_1;
+    statePtr->seed = seed;
+    XXH_ASSERT(secret != NULL);
+    statePtr->secret = secret;
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
+    statePtr->secretLimit = (XXH32_hash_t)(secretSize - STRIPE_LEN);
+    statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE;
+}
+
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_reset(XXH3_state_t* statePtr)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    XXH3_64bits_reset_internal(statePtr, 0, kSecret, XXH_SECRET_DEFAULT_SIZE);
+    return XXH_OK;
+}
+
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    XXH3_64bits_reset_internal(statePtr, 0, (const BYTE*)secret, secretSize);
+    if (secret == NULL) return XXH_ERROR;
+    if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
+    return XXH_OK;
+}
+
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    XXH3_64bits_reset_internal(statePtr, seed, kSecret, XXH_SECRET_DEFAULT_SIZE);
+    XXH3_initKeySeed(statePtr->customSecret, seed);
+    statePtr->secret = statePtr->customSecret;
+    return XXH_OK;
+}
+
+XXH_FORCE_INLINE void
+XXH3_consumeStripes( U64* acc,
+                    XXH32_hash_t* nbStripesSoFarPtr, XXH32_hash_t nbStripesPerBlock,
+                    const BYTE* data, size_t totalStripes,
+                    const BYTE* secret, size_t secretLimit,
+                    XXH3_accWidth_e accWidth)
+{
+    XXH_ASSERT(*nbStripesSoFarPtr < nbStripesPerBlock);
+    if (nbStripesPerBlock - *nbStripesSoFarPtr <= totalStripes) {
+        /* need a scrambling operation */
+        size_t const nbStripes = nbStripesPerBlock - *nbStripesSoFarPtr;
+        XXH3_accumulate(acc, data, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripes, accWidth);
+        XXH3_scrambleAcc(acc, secret + secretLimit);
+        XXH3_accumulate(acc, data + nbStripes * STRIPE_LEN, secret, totalStripes - nbStripes, accWidth);
+        *nbStripesSoFarPtr = (XXH32_hash_t)(totalStripes - nbStripes);
+    } else {
+        XXH3_accumulate(acc, data, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, totalStripes, accWidth);
+        *nbStripesSoFarPtr += (XXH32_hash_t)totalStripes;
+    }
+}
+
+XXH_FORCE_INLINE XXH_errorcode
+XXH3_update(XXH3_state_t* state, const BYTE* input, size_t len, XXH3_accWidth_e accWidth)
+{
+    if (input==NULL)
+#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
+        return XXH_OK;
+#else
+        return XXH_ERROR;
+#endif
+
+    {   const BYTE* const bEnd = input + len;
+
+        state->totalLen += len;
+
+        if (state->bufferedSize + len <= XXH3_INTERNALBUFFER_SIZE) {  /* fill in tmp buffer */
+            XXH_memcpy(state->buffer + state->bufferedSize, input, len);
+            state->bufferedSize += (XXH32_hash_t)len;
+            return XXH_OK;
+        }
+        /* input now > XXH3_INTERNALBUFFER_SIZE */
+
+        #define XXH3_INTERNALBUFFER_STRIPES (XXH3_INTERNALBUFFER_SIZE / STRIPE_LEN)
+        XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % STRIPE_LEN == 0);   /* clean multiple */
+
+        if (state->bufferedSize) {   /* some data within internal buffer: fill then consume it */
+            size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize;
+            XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize);
+            input += loadSize;
+            XXH3_consumeStripes(state->acc,
+                               &state->nbStripesSoFar, state->nbStripesPerBlock,
+                                state->buffer, XXH3_INTERNALBUFFER_STRIPES,
+                                state->secret, state->secretLimit,
+                                accWidth);
+            state->bufferedSize = 0;
+        }
+
+        /* consume input by full buffer quantities */
+        if (input+XXH3_INTERNALBUFFER_SIZE <= bEnd) {
+            const BYTE* const limit = bEnd - XXH3_INTERNALBUFFER_SIZE;
+            do {
+                XXH3_consumeStripes(state->acc,
+                                   &state->nbStripesSoFar, state->nbStripesPerBlock,
+                                    input, XXH3_INTERNALBUFFER_STRIPES,
+                                    state->secret, state->secretLimit,
+                                    accWidth);
+                input += XXH3_INTERNALBUFFER_SIZE;
+            } while (input<=limit);
+        }
+
+        if (input < bEnd) { /* some remaining input data : buffer it */
+            XXH_memcpy(state->buffer, input, (size_t)(bEnd-input));
+            state->bufferedSize = (XXH32_hash_t)(bEnd-input);
+        }
+    }
+
+    return XXH_OK;
+}
+
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_update(XXH3_state_t* state, const void* input, size_t len)
+{
+    return XXH3_update(state, (const BYTE*)input, len, XXH3_acc_64bits);
+}
+
+
+XXH_FORCE_INLINE void
+XXH3_digest_long (XXH64_hash_t* acc, const XXH3_state_t* state, XXH3_accWidth_e accWidth)
+{
+    memcpy(acc, state->acc, sizeof(state->acc));  /* digest locally, state remains unaltered, and can continue ingesting more data afterwards */
+    if (state->bufferedSize >= STRIPE_LEN) {
+        size_t const totalNbStripes = state->bufferedSize / STRIPE_LEN;
+        XXH32_hash_t nbStripesSoFar = state->nbStripesSoFar;
+        XXH3_consumeStripes(acc,
+                           &nbStripesSoFar, state->nbStripesPerBlock,
+                            state->buffer, totalNbStripes,
+                            state->secret, state->secretLimit,
+                            accWidth);
+        if (state->bufferedSize % STRIPE_LEN) {  /* one last partial stripe */
+            XXH3_accumulate_512(acc,
+                                state->buffer + state->bufferedSize - STRIPE_LEN,
+                                state->secret + state->secretLimit - XXH_SECRET_LASTACC_START,
+                                accWidth);
+        }
+    } else {  /* bufferedSize < STRIPE_LEN */
+        if (state->bufferedSize) { /* one last stripe */
+            BYTE lastStripe[STRIPE_LEN];
+            size_t const catchupSize = STRIPE_LEN - state->bufferedSize;
+            memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize);
+            memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize);
+            XXH3_accumulate_512(acc,
+                                lastStripe,
+                                state->secret + state->secretLimit - XXH_SECRET_LASTACC_START,
+                                accWidth);
+    }   }
+}
+
+XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* state)
+{
+    if (state->totalLen > XXH3_MIDSIZE_MAX) {
+        XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[ACC_NB];
+        XXH3_digest_long(acc, state, XXH3_acc_64bits);
+        return XXH3_mergeAccs(acc, state->secret + XXH_SECRET_MERGEACCS_START, (U64)state->totalLen * PRIME64_1);
+    }
+    /* len <= XXH3_MIDSIZE_MAX : short code */
+    if (state->seed)
+        return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
+    return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen), state->secret, state->secretLimit + STRIPE_LEN);
+}
+
+/* ==========================================
+ * XXH3 128 bits (=> XXH128)
+ * ========================================== */
+
+XXH_FORCE_INLINE XXH128_hash_t
+XXH3_len_1to3_128b(const BYTE* data, size_t len, const BYTE* keyPtr, XXH64_hash_t seed)
+{
+    XXH_ASSERT(data != NULL);
+    XXH_ASSERT(1 <= len && len <= 3);
+    XXH_ASSERT(keyPtr != NULL);
+    {   BYTE const c1 = data[0];
+        BYTE const c2 = data[len >> 1];
+        BYTE const c3 = data[len - 1];
+        U32  const combinedl = ((U32)c1) + (((U32)c2) << 8) + (((U32)c3) << 16) + (((U32)len) << 24);
+        U32  const combinedh = XXH_swap32(combinedl);
+        U64  const keyedl = (U64)combinedl ^ (XXH_readLE32(keyPtr)   + seed);
+        U64  const keyedh = (U64)combinedh ^ (XXH_readLE32(keyPtr+4) - seed);
+        U64  const mixedl = keyedl * PRIME64_1;
+        U64  const mixedh = keyedh * PRIME64_5;
+        XXH128_hash_t const h128 = { XXH3_avalanche(mixedl) /*low64*/, XXH3_avalanche(mixedh) /*high64*/ };
+        return h128;
+    }
+}
+
+
+XXH_FORCE_INLINE XXH128_hash_t
+XXH3_len_4to8_128b(const BYTE* data, size_t len, const BYTE* keyPtr, XXH64_hash_t seed)
+{
+    XXH_ASSERT(data != NULL);
+    XXH_ASSERT(keyPtr != NULL);
+    XXH_ASSERT(4 <= len && len <= 8);
+    {   U32 const in1 = XXH_readLE32(data);
+        U32 const in2 = XXH_readLE32(data + len - 4);
+        U64 const in64l = in1 + ((U64)in2 << 32);
+        U64 const in64h = XXH_swap64(in64l);
+        U64 const keyedl = in64l ^ (XXH_readLE64(keyPtr) + seed);
+        U64 const keyedh = in64h ^ (XXH_readLE64(keyPtr + 8) - seed);
+        U64 const mix64l1 = len + ((keyedl ^ (keyedl >> 51)) * PRIME32_1);
+        U64 const mix64l2 = (mix64l1 ^ (mix64l1 >> 47)) * PRIME64_2;
+        U64 const mix64h1 = ((keyedh ^ (keyedh >> 47)) * PRIME64_1) - len;
+        U64 const mix64h2 = (mix64h1 ^ (mix64h1 >> 43)) * PRIME64_4;
+        {   XXH128_hash_t const h128 = { XXH3_avalanche(mix64l2) /*low64*/, XXH3_avalanche(mix64h2) /*high64*/ };
+            return h128;
+    }   }
+}
+
+XXH_FORCE_INLINE XXH128_hash_t
+XXH3_len_9to16_128b(const BYTE* data, size_t len, const BYTE* keyPtr, XXH64_hash_t seed)
+{
+    XXH_ASSERT(data != NULL);
+    XXH_ASSERT(keyPtr != NULL);
+    XXH_ASSERT(9 <= len && len <= 16);
+    {   U64 const ll1 = XXH_readLE64(data) ^ (XXH_readLE64(keyPtr) + seed);
+        U64 const ll2 = XXH_readLE64(data + len - 8) ^ (XXH_readLE64(keyPtr+8) - seed);
+        U64 const inlow = ll1 ^ ll2;
+        XXH128_hash_t m128 = XXH_mult64to128(inlow, PRIME64_1);
+        U64 const lenContrib = (U64)(U32)len * (U64)PRIME32_5; m128.low64 += lenContrib;
+        m128.high64 += ll2 * PRIME64_1;
+        m128.low64  ^= (m128.high64 >> 32);
+        {   XXH128_hash_t h128 = XXH_mult64to128(m128.low64, PRIME64_2);
+            h128.high64 += m128.high64 * PRIME64_2;
+            h128.low64   = XXH3_avalanche(h128.low64);
+            h128.high64  = XXH3_avalanche(h128.high64);
+            return h128;
+    }   }
+}
+
+/* Assumption : `secret` size is >= 16
+ * Note : it should be >= XXH3_SECRET_SIZE_MIN anyway */
+XXH_FORCE_INLINE XXH128_hash_t
+XXH3_len_0to16_128b(const BYTE* data, size_t len, const BYTE* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(len <= 16);
+    {   if (len > 8) return XXH3_len_9to16_128b(data, len, secret, seed);
+        if (len >= 4) return XXH3_len_4to8_128b(data, len, secret, seed);
+        if (len) return XXH3_len_1to3_128b(data, len, secret, seed);
+        {   XXH128_hash_t const h128 = { 0, 0 };
+            return h128;
+    }   }
+}
+
+XXH_FORCE_INLINE XXH128_hash_t
+XXH3_hashLong_128b_internal(const BYTE* XXH_RESTRICT data, size_t len,
+                            const BYTE* XXH_RESTRICT secret, size_t secretSize)
+{
+    XXH_ALIGN(XXH_ACC_ALIGN) U64 acc[ACC_NB] = XXH3_INIT_ACC;
+
+    XXH3_hashLong_internal_loop(acc, data, len, secret, secretSize, XXH3_acc_128bits);
+
+    /* converge into final hash */
+    XXH_STATIC_ASSERT(sizeof(acc) == 64);
+    XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
+    {   U64 const low64 = XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START, (U64)len * PRIME64_1);
+        U64 const high64 = XXH3_mergeAccs(acc, secret + secretSize - sizeof(acc) - XXH_SECRET_MERGEACCS_START, ~((U64)len * PRIME64_2));
+        XXH128_hash_t const h128 = { low64, high64 };
+        return h128;
+    }
+}
+
+XXH_NO_INLINE XXH128_hash_t    /* It's important for performance that XXH3_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
+XXH3_hashLong_128b_defaultSecret(const BYTE* data, size_t len)
+{
+    return XXH3_hashLong_128b_internal(data, len, kSecret, sizeof(kSecret));
+}
+
+XXH_NO_INLINE XXH128_hash_t    /* It's important for performance that XXH3_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
+XXH3_hashLong_128b_withSecret(const BYTE* data, size_t len,
+                              const BYTE* secret, size_t secretSize)
+{
+    return XXH3_hashLong_128b_internal(data, len, secret, secretSize);
+}
+
+XXH_NO_INLINE XXH128_hash_t    /* It's important for performance that XXH3_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
+XXH3_hashLong_128b_withSeed(const BYTE* data, size_t len, XXH64_hash_t seed)
+{
+    XXH_ALIGN(8) BYTE secret[XXH_SECRET_DEFAULT_SIZE];
+    if (seed == 0) return XXH3_hashLong_128b_defaultSecret(data, len);
+    XXH3_initKeySeed(secret, seed);
+    return XXH3_hashLong_128b_internal(data, len, secret, sizeof(secret));
+}
+
+
+XXH_FORCE_INLINE XXH128_hash_t
+XXH128_mix32B(XXH128_hash_t acc, const BYTE* p1, const BYTE* p2, const BYTE* secret, XXH64_hash_t seed)
+{
+    acc.low64  += XXH3_mix16B (p1, secret+0, seed);
+    acc.low64  ^= XXH_readLE64(p2) + XXH_readLE64(p2+8);
+    acc.high64 += XXH3_mix16B (p2, secret+16, seed);
+    acc.high64 ^= XXH_readLE64(p1) + XXH_readLE64(p1+8);
+    return acc;
+}
+
+XXH_NO_INLINE XXH128_hash_t
+XXH3_len_129to240_128b(const BYTE* XXH_RESTRICT data, size_t len,
+                       const BYTE* XXH_RESTRICT secret, size_t secretSize,
+                       XXH64_hash_t seed)
+{
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
+    XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
+
+    {   XXH128_hash_t acc;
+        int const nbRounds = (int)len / 32;
+        int i;
+        acc.low64 = len * PRIME64_1;
+        acc.high64 = 0;
+        for (i=0; i<4; i++) {
+            acc = XXH128_mix32B(acc, data+(32*i), data+(32*i)+16, secret+(32*i), seed);
+        }
+        acc.low64 = XXH3_avalanche(acc.low64);
+        acc.high64 = XXH3_avalanche(acc.high64);
+        XXH_ASSERT(nbRounds >= 4);
+        for (i=4 ; i < nbRounds; i++) {
+            acc = XXH128_mix32B(acc, data+(32*i), data+(32*i)+16, secret+XXH3_MIDSIZE_STARTOFFSET+(32*(i-4)), seed);
+        }
+        /* last bytes */
+        acc = XXH128_mix32B(acc, data + len - 16, data + len - 32, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16, 0ULL - seed);
+
+        {   U64 const low64 = acc.low64 + acc.high64;
+            U64 const high64 = (acc.low64 * PRIME64_1) + (acc.high64 * PRIME64_4) + ((len - seed) * PRIME64_2);
+            XXH128_hash_t const h128 = { XXH3_avalanche(low64), (XXH64_hash_t)0 - XXH3_avalanche(high64) };
+            return h128;
+        }
+    }
+}
+
+
+XXH_FORCE_INLINE XXH128_hash_t
+XXH3_len_17to128_128b(const BYTE* XXH_RESTRICT data, size_t len,
+                      const BYTE* XXH_RESTRICT secret, size_t secretSize,
+                      XXH64_hash_t seed)
+{
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
+    XXH_ASSERT(16 < len && len <= 128);
+
+    {   XXH128_hash_t acc;
+        acc.low64 = len * PRIME64_1;
+        acc.high64 = 0;
+        if (len > 32) {
+            if (len > 64) {
+                if (len > 96) {
+                    acc = XXH128_mix32B(acc, data+48, data+len-64, secret+96, seed);
+                }
+                acc = XXH128_mix32B(acc, data+32, data+len-48, secret+64, seed);
+            }
+            acc = XXH128_mix32B(acc, data+16, data+len-32, secret+32, seed);
+        }
+        acc = XXH128_mix32B(acc, data, data+len-16, secret, seed);
+        {   U64 const low64 = acc.low64 + acc.high64;
+            U64 const high64 = (acc.low64 * PRIME64_1) + (acc.high64 * PRIME64_4) + ((len - seed) * PRIME64_2);
+            XXH128_hash_t const h128 = { XXH3_avalanche(low64), (XXH64_hash_t)0 - XXH3_avalanche(high64) };
+            return h128;
+        }
+    }
+}
+
+XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* data, size_t len)
+{
+    if (len <= 16) return XXH3_len_0to16_128b((const BYTE*)data, len, kSecret, 0);
+    if (len <= 128) return XXH3_len_17to128_128b((const BYTE*)data, len, kSecret, sizeof(kSecret), 0);
+    if (len <= XXH3_MIDSIZE_MAX) return XXH3_len_129to240_128b((const BYTE*)data, len, kSecret, sizeof(kSecret), 0);
+    return XXH3_hashLong_128b_defaultSecret((const BYTE*)data, len);
+}
+
+XXH_PUBLIC_API XXH128_hash_t
+XXH3_128bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize)
+{
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
+    /* if an action must be taken should `secret` conditions not be respected,
+     * it should be done here.
+     * For now, it's a contract pre-condition.
+     * Adding a check and a branch here would cost performance at every hash */
+     if (len <= 16) return XXH3_len_0to16_128b((const BYTE*)data, len, (const BYTE*)secret, 0);
+     if (len <= 128) return XXH3_len_17to128_128b((const BYTE*)data, len, (const BYTE*)secret, secretSize, 0);
+     if (len <= XXH3_MIDSIZE_MAX) return XXH3_len_129to240_128b((const BYTE*)data, len, (const BYTE*)secret, secretSize, 0);
+     return XXH3_hashLong_128b_withSecret((const BYTE*)data, len, (const BYTE*)secret, secretSize);
+}
+
+XXH_PUBLIC_API XXH128_hash_t
+XXH3_128bits_withSeed(const void* data, size_t len, XXH64_hash_t seed)
+{
+    if (len <= 16) return XXH3_len_0to16_128b((const BYTE*)data, len, kSecret, seed);
+    if (len <= 128) return XXH3_len_17to128_128b((const BYTE*)data, len, kSecret, sizeof(kSecret), seed);
+    if (len <= XXH3_MIDSIZE_MAX) return XXH3_len_129to240_128b((const BYTE*)data, len, kSecret, sizeof(kSecret), seed);
+    return XXH3_hashLong_128b_withSeed((const BYTE*)data, len, seed);
+}
+
+XXH_PUBLIC_API XXH128_hash_t
+XXH128(const void* data, size_t len, XXH64_hash_t seed)
+{
+    return XXH3_128bits_withSeed(data, len, seed);
+}
+
+
+/* ===   XXH3 128-bit streaming   === */
+
+/* all the functions are actually the same as for 64-bit streaming variant,
+   just the reset one is different (different initial acc values for 0,5,6,7),
+   and near the end of the digest function */
+
+static void
+XXH3_128bits_reset_internal(XXH3_state_t* statePtr,
+                           XXH64_hash_t seed,
+                           const BYTE* secret, size_t secretSize)
+{
+    XXH3_64bits_reset_internal(statePtr, seed, secret, secretSize);
+}
+
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_reset(XXH3_state_t* statePtr)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    XXH3_128bits_reset_internal(statePtr, 0, kSecret, XXH_SECRET_DEFAULT_SIZE);
+    return XXH_OK;
+}
+
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    XXH3_128bits_reset_internal(statePtr, 0, (const BYTE*)secret, secretSize);
+    if (secret == NULL) return XXH_ERROR;
+    if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
+    return XXH_OK;
+}
+
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    XXH3_128bits_reset_internal(statePtr, seed, kSecret, XXH_SECRET_DEFAULT_SIZE);
+    XXH3_initKeySeed(statePtr->customSecret, seed);
+    statePtr->secret = statePtr->customSecret;
+    return XXH_OK;
+}
+
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_update(XXH3_state_t* state, const void* input, size_t len)
+{
+    return XXH3_update(state, (const BYTE*)input, len, XXH3_acc_128bits);
+}
+
+XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* state)
+{
+    if (state->totalLen > XXH3_MIDSIZE_MAX) {
+        XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[ACC_NB];
+        XXH3_digest_long(acc, state, XXH3_acc_128bits);
+        XXH_ASSERT(state->secretLimit + STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
+        {   U64 const low64 = XXH3_mergeAccs(acc, state->secret + XXH_SECRET_MERGEACCS_START, (U64)state->totalLen * PRIME64_1);
+            U64 const high64 = XXH3_mergeAccs(acc, state->secret + state->secretLimit + STRIPE_LEN - sizeof(acc) - XXH_SECRET_MERGEACCS_START, ~((U64)state->totalLen * PRIME64_2));
+            XXH128_hash_t const h128 = { low64, high64 };
+            return h128;
+        }
+    }
+    /* len <= XXH3_MIDSIZE_MAX : short code */
+    if (state->seed)
+        return XXH3_128bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
+    return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen), state->secret, state->secretLimit + STRIPE_LEN);
+}
+
+/* 128-bit utility functions */
+
+#include <string.h>   /* memcmp */
+
+/* return : 1 is equal, 0 if different */
+XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2)
+{
+    /* note : XXH128_hash_t is compact, it has no padding byte */
+    return !(memcmp(&h1, &h2, sizeof(h1)));
+}
+
+/* This prototype is compatible with stdlib's qsort().
+ * return : >0 if *h128_1  > *h128_2
+ *          <0 if *h128_1  < *h128_2
+ *          =0 if *h128_1 == *h128_2  */
+XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2)
+{
+    XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1;
+    XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2;
+    int const hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64);
+    /* note : bets that, in most cases, hash values are different */
+    if (hcmp) return hcmp;
+    return (h1.low64 > h2.low64) - (h2.low64 > h1.low64);
+}
+
+
+/*======   Canonical representation   ======*/
+XXH_PUBLIC_API void
+XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash)
+{
+    XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t));
+    if (XXH_CPU_LITTLE_ENDIAN) {
+        hash.high64 = XXH_swap64(hash.high64);
+        hash.low64  = XXH_swap64(hash.low64);
+    }
+    memcpy(dst, &hash.high64, sizeof(hash.high64));
+    memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64));
+}
+
+XXH_PUBLIC_API XXH128_hash_t
+XXH128_hashFromCanonical(const XXH128_canonical_t* src)
+{
+    XXH128_hash_t h;
+    h.high64 = XXH_readBE64(src);
+    h.low64  = XXH_readBE64(src->digest + 8);
+    return h;
+}
+
+
+
+#endif  /* XXH3_H */

From 45b9f7b72ee66b99768e1940fcd5ad8e5db8b1bb Mon Sep 17 00:00:00 2001
From: Tawkir Ahmed Fakir <tawkir@eskimi.com>
Date: Wed, 2 Oct 2019 12:02:21 +0200
Subject: [PATCH 17/21] Update installation instructions

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index e274fb4..008d9f7 100644
--- a/README.md
+++ b/README.md
@@ -20,6 +20,8 @@ Please note at the moment the master branch should be used for PHP 5.x and devel
 ```
 Don't forget to load the extension in via php.ini or the like.
 
+More details can be found here: https://www.sitepoint.com/install-php-extensions-source/
+
 To use with docker compose
 
 ```

From 313c0847f2d6a27b6a9e6466b4da5647220ff4c8 Mon Sep 17 00:00:00 2001
From: Nir Heimann <nir@gobeepit.com>
Date: Thu, 10 Dec 2020 13:14:02 +0200
Subject: [PATCH 18/21] Support for PHP 7. Zend RETURN_STRING function was
 changed

---
 php_xxhash.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/php_xxhash.c b/php_xxhash.c
index 1525981..11edbfa 100644
--- a/php_xxhash.c
+++ b/php_xxhash.c
@@ -102,7 +102,7 @@ PHP_FUNCTION(xxhash64Unsigned)
 	//Since php doesn't have unsinged long values, the value will be returned as string
 	char numberAsAString[20];
 	sprintf(numberAsAString, "%llu",sum);
-	RETURN_STRING(numberAsAString, 1);
+	RETURN_STRING(numberAsAString);
 }
 zend_function_entry xxhash_functions[] = {
 	PHP_FE(xxhash32, NULL)

From 21f9b8e69ed1103b17eb707d1ee247a0e60749eb Mon Sep 17 00:00:00 2001
From: Nir Heimann <nir@gobeepit.com>
Date: Sun, 20 Dec 2020 13:01:54 +0200
Subject: [PATCH 19/21] Start from scratch

---
 README.md    |  34 --
 config.m4    |  24 --
 config.w32   |  13 -
 php_xxhash.c | 126 -------
 php_xxhash.h |  30 --
 xxhash.c     | 962 ---------------------------------------------------
 xxhash.h     | 192 ----------
 7 files changed, 1381 deletions(-)
 delete mode 100644 README.md
 delete mode 100644 config.m4
 delete mode 100644 config.w32
 delete mode 100644 php_xxhash.c
 delete mode 100644 php_xxhash.h
 delete mode 100644 xxhash.c
 delete mode 100644 xxhash.h

diff --git a/README.md b/README.md
deleted file mode 100644
index 6375612..0000000
--- a/README.md
+++ /dev/null
@@ -1,34 +0,0 @@
-# php-xxhash
-
-PHP extension to add support for the [xxhash - r42](https://github.com/Cyan4973/xxHash) extremely fast hashing algorithm.  _xxhash_ is designed to be fast enough to use in real-time streaming applications.
-
-
-## How To Install
-
-```
-   phpize
-   ./configure --enable-xxhash
-   make
-   sudo make install
-```
-
-## How To Use
-
-This extension adds three new PHP functions:
-
-```
-    // 32 bit version (all values are positive)
-    int xxhash32(string $data);
-    
-    // 64 bit version (can return negative values since PHP doesn't support unsigned long values)
-    long xxhash64(string $data);
-    
-    // 64 bit version (all values are positive but returned as strings)
-    string xxhash64Unsigned(string $data);
-```
-
-They will checksum the string, and return the checksum.
-
-## License
-
-BSD 2-clause license.
\ No newline at end of file
diff --git a/config.m4 b/config.m4
deleted file mode 100644
index 862a221..0000000
--- a/config.m4
+++ /dev/null
@@ -1,24 +0,0 @@
-dnl $Id$
-dnl config.m4 for extension xxhash
-
-dnl Comments in this file start with the string 'dnl'.
-dnl Remove where necessary. This file will not work
-dnl without editing.
-
-dnl If your extension references something external, use with:
-
-dnl PHP_ARG_WITH(xxhash, for xxhash support,
-dnl Make sure that the comment is aligned:
-dnl [  --with-xxhash             Include xxhash support])
-
-dnl Otherwise use enable:
-
-PHP_ARG_ENABLE(xxhash, whether to enable xxhash support,
-dnl Make sure that the comment is aligned:
-[  --enable-xxhash            Enable xxhash support])
-
-if test "$PHP_XXHASH" != "no"; then
-  dnl Write more examples of tests here...
-
-  PHP_NEW_EXTENSION(xxhash, php_xxhash.c, $ext_shared)
-fi
diff --git a/config.w32 b/config.w32
deleted file mode 100644
index d2a2d46..0000000
--- a/config.w32
+++ /dev/null
@@ -1,13 +0,0 @@
-// $Id$
-// vim:ft=javascript
-
-// If your extension references something external, use ARG_WITH
-// ARG_WITH("xxhash", "for xxhash support", "no");
-
-// Otherwise, use ARG_ENABLE
-// ARG_ENABLE("xxhash", "enable xxhash support", "no");
-
-if (PHP_XXHASH != "no") {
-	EXTENSION("xxhash", "php_xxhash.c");
-}
-
diff --git a/php_xxhash.c b/php_xxhash.c
deleted file mode 100644
index 11edbfa..0000000
--- a/php_xxhash.c
+++ /dev/null
@@ -1,126 +0,0 @@
-#ifdef HAVE_CONFIG_H
-#include "config.h"
-#endif
-
-#include "php.h"
-#include "php_ini.h"
-#include "ext/standard/info.h"
-#include "php_xxhash.h"
-
-#include "xxhash.c"
-
-#ifdef COMPILE_DL_XXHASH
-ZEND_GET_MODULE(xxhash)
-#endif
-
-PHP_MINIT_FUNCTION(xxhash)
-{
-	return SUCCESS;
-}
-
-PHP_MSHUTDOWN_FUNCTION(xxhash)
-{
-	return SUCCESS;
-}
-
-PHP_RINIT_FUNCTION(xxhash)
-{
-	return SUCCESS;
-}
-
-PHP_RSHUTDOWN_FUNCTION(xxhash)
-{
-	return SUCCESS;
-}
-
-PHP_MINFO_FUNCTION(xxhash)
-{
-	php_info_print_table_start();
-	php_info_print_table_header(2, "xxhash support", "enabled");
-	php_info_print_table_row(2, "extension version", PHP_XXHASH_VERSION);
-	php_info_print_table_row(2, "xxhash release", "http://code.google.com/p/xxhash/source/detail?r=6");
-	php_info_print_table_end();
-}
-
-PHP_FUNCTION(xxhash32)
-{
-	char *arg1 = NULL;
-	char *ret1 = NULL;
-	int arg1_len;
-	unsigned int sum;
-
-	/* parse the parameters */
-	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s", &arg1, &arg1_len) == FAILURE || arg1_len < 1)
-	{
-		RETURN_NULL();
-	}
-
-	/* compute the checksum */
-	sum = XXH32(arg1, arg1_len, 0);
-
-	/* return the checksum */
-	RETURN_LONG((long)sum);
-}
-
-PHP_FUNCTION(xxhash64)
-{
-	char *arg1 = NULL;
-	char *ret1 = NULL;
-	int arg1_len;
-	unsigned long long sum;
-
-	/* parse the parameters */
-	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s", &arg1, &arg1_len) == FAILURE || arg1_len < 1)
-	{
-		RETURN_NULL();
-	}
-
-	/* compute the checksum */
-	sum = XXH64(arg1, arg1_len, 0);
-
-	/* return the checksum */
-	/* Negative values can be returned since we cannot return  unsigned long to php */
-	RETURN_LONG(sum); 
-}
-
-PHP_FUNCTION(xxhash64Unsigned)
-{
-	char *arg1 = NULL;
-	char *ret1 = NULL;
-	int arg1_len;
-	unsigned long long sum;
-
-	/* parse the parameters */
-	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s", &arg1, &arg1_len) == FAILURE || arg1_len < 1)
-	{
-		RETURN_NULL();
-	}
-
-	/* compute the checksum */
-	sum = XXH64(arg1, arg1_len, 0);
-	
-	//Since php doesn't have unsinged long values, the value will be returned as string
-	char numberAsAString[20];
-	sprintf(numberAsAString, "%llu",sum);
-	RETURN_STRING(numberAsAString);
-}
-zend_function_entry xxhash_functions[] = {
-	PHP_FE(xxhash32, NULL)
-	PHP_FE(xxhash64, NULL)
-	PHP_FE(xxhash64Unsigned, NULL)
-	{NULL, NULL, NULL}
-};
-
-zend_module_entry xxhash_module_entry = {
-	STANDARD_MODULE_HEADER,
-	"xxhash",
-	xxhash_functions,
-	PHP_MINIT(xxhash),
-	PHP_MSHUTDOWN(xxhash),
-	NULL,
-	NULL,
-	PHP_MINFO(xxhash),
-	PHP_XXHASH_VERSION,
-	STANDARD_MODULE_PROPERTIES
-};
-
diff --git a/php_xxhash.h b/php_xxhash.h
deleted file mode 100644
index c71241d..0000000
--- a/php_xxhash.h
+++ /dev/null
@@ -1,30 +0,0 @@
-#ifndef PHP_XXHASH_H
-#define PHP_XXHASH_H
-
-#define PHP_XXHASH_VERSION "1.0.1"
-
-extern zend_module_entry xxhash_module_entry;
-#define phpext_xxhash_ptr &xxhash_module_entry
-
-#if defined(PHP_WIN32) && defined(XXHASH_EXPORTS)
-#define PHP_XXHASH_API __declspec(dllexport)
-#else
-#define PHP_XXHASH_API PHPAPI
-#endif
-
-#ifdef ZTS
-#include "TSRM.h"
-#endif
-
-PHP_MINIT_FUNCTION(xxhash);
-PHP_MSHUTDOWN_FUNCTION(xxhash);
-PHP_RINIT_FUNCTION(xxhash);
-PHP_RSHUTDOWN_FUNCTION(xxhash);
-PHP_MINFO_FUNCTION(xxhash);
-
-PHP_FUNCTION(xxhash32);
-PHP_FUNCTION(xxhash64);
-PHP_FUNCTION(xxhash64Unsigned);
-
-#endif /* PHP_XXHASH_H */
-
diff --git a/xxhash.c b/xxhash.c
deleted file mode 100644
index 511d994..0000000
--- a/xxhash.c
+++ /dev/null
@@ -1,962 +0,0 @@
-/*
-xxHash - Fast Hash algorithm
-Copyright (C) 2012-2015, Yann Collet
-
-BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-
-* Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-* Redistributions in binary form must reproduce the above
-copyright notice, this list of conditions and the following disclaimer
-in the documentation and/or other materials provided with the
-distribution.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-You can contact the author at :
-- xxHash source repository : https://github.com/Cyan4973/xxHash
-*/
-
-
-/**************************************
-*  Tuning parameters
-**************************************/
-/* XXH_FORCE_MEMORY_ACCESS
- * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable.
- * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal.
- * The below switch allow to select different access method for improved performance.
- * Method 0 (default) : use `memcpy()`. Safe and portable.
- * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable).
- *            This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`.
- * Method 2 : direct access. This method is portable but violate C standard.
- *            It can generate buggy code on targets which generate assembly depending on alignment.
- *            But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6)
- * See http://stackoverflow.com/a/32095106/646947 for details.
- * Prefer these methods in priority order (0 > 1 > 2)
- */
-#ifndef XXH_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
-#  if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) )
-#    define XXH_FORCE_MEMORY_ACCESS 2
-#  elif defined(__INTEL_COMPILER) || \
-  (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7S__) ))
-#    define XXH_FORCE_MEMORY_ACCESS 1
-#  endif
-#endif
-
-/* XXH_ACCEPT_NULL_INPUT_POINTER :
- * If the input pointer is a null pointer, xxHash default behavior is to trigger a memory access error, since it is a bad pointer.
- * When this option is enabled, xxHash output for null input pointers will be the same as a null-length input.
- * By default, this option is disabled. To enable it, uncomment below define :
- */
-/* #define XXH_ACCEPT_NULL_INPUT_POINTER 1 */
-
-/* XXH_FORCE_NATIVE_FORMAT :
- * By default, xxHash library provides endian-independant Hash values, based on little-endian convention.
- * Results are therefore identical for little-endian and big-endian CPU.
- * This comes at a performance cost for big-endian CPU, since some swapping is required to emulate little-endian format.
- * Should endian-independance be of no importance for your application, you may set the #define below to 1,
- * to improve speed for Big-endian CPU.
- * This option has no impact on Little_Endian CPU.
- */
-#define XXH_FORCE_NATIVE_FORMAT 0
-
-/* XXH_USELESS_ALIGN_BRANCH :
- * This is a minor performance trick, only useful with lots of very small keys.
- * It means : don't make a test between aligned/unaligned, because performance will be the same.
- * It saves one initial branch per hash.
- */
-#if defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)
-#  define XXH_USELESS_ALIGN_BRANCH 1
-#endif
-
-
-/**************************************
-*  Compiler Specific Options
-***************************************/
-#ifdef _MSC_VER    /* Visual Studio */
-#  pragma warning(disable : 4127)      /* disable: C4127: conditional expression is constant */
-#  define FORCE_INLINE static __forceinline
-#else
-#  if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
-#    ifdef __GNUC__
-#      define FORCE_INLINE static inline __attribute__((always_inline))
-#    else
-#      define FORCE_INLINE static inline
-#    endif
-#  else
-#    define FORCE_INLINE static
-#  endif /* __STDC_VERSION__ */
-#endif
-
-
-/**************************************
-*  Includes & Memory related functions
-***************************************/
-#include "xxhash.h"
-/* Modify the local functions below should you wish to use some other memory routines */
-/* for malloc(), free() */
-#include <stdlib.h>
-static void* XXH_malloc(size_t s) { return malloc(s); }
-static void  XXH_free  (void* p)  { free(p); }
-/* for memcpy() */
-#include <string.h>
-static void* XXH_memcpy(void* dest, const void* src, size_t size) { return memcpy(dest,src,size); }
-
-
-/**************************************
-*  Basic Types
-***************************************/
-#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
-# include <stdint.h>
-  typedef uint8_t  BYTE;
-  typedef uint16_t U16;
-  typedef uint32_t U32;
-  typedef  int32_t S32;
-  typedef uint64_t U64;
-#else
-  typedef unsigned char      BYTE;
-  typedef unsigned short     U16;
-  typedef unsigned int       U32;
-  typedef   signed int       S32;
-  typedef unsigned long long U64;
-#endif
-
-
-#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))
-
-/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */
-static U32 XXH_read32(const void* memPtr) { return *(const U32*) memPtr; }
-static U64 XXH_read64(const void* memPtr) { return *(const U64*) memPtr; }
-
-#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
-
-/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
-/* currently only defined for gcc and icc */
-typedef union { U32 u32; U64 u64; } __attribute__((packed)) unalign;
-
-static U32 XXH_read32(const void* ptr) { return ((const unalign*)ptr)->u32; }
-static U64 XXH_read64(const void* ptr) { return ((const unalign*)ptr)->u64; }
-
-#else
-
-/* portable and safe solution. Generally efficient.
- * see : http://stackoverflow.com/a/32095106/646947
- */
-
-static U32 XXH_read32(const void* memPtr)
-{
-    U32 val;
-    memcpy(&val, memPtr, sizeof(val));
-    return val;
-}
-
-static U64 XXH_read64(const void* memPtr)
-{
-    U64 val;
-    memcpy(&val, memPtr, sizeof(val));
-    return val;
-}
-
-#endif // XXH_FORCE_DIRECT_MEMORY_ACCESS
-
-
-/******************************************
-*  Compiler-specific Functions and Macros
-******************************************/
-#define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
-
-/* Note : although _rotl exists for minGW (GCC under windows), performance seems poor */
-#if defined(_MSC_VER)
-#  define XXH_rotl32(x,r) _rotl(x,r)
-#  define XXH_rotl64(x,r) _rotl64(x,r)
-#else
-#  define XXH_rotl32(x,r) ((x << r) | (x >> (32 - r)))
-#  define XXH_rotl64(x,r) ((x << r) | (x >> (64 - r)))
-#endif
-
-#if defined(_MSC_VER)     /* Visual Studio */
-#  define XXH_swap32 _byteswap_ulong
-#  define XXH_swap64 _byteswap_uint64
-#elif GCC_VERSION >= 403
-#  define XXH_swap32 __builtin_bswap32
-#  define XXH_swap64 __builtin_bswap64
-#else
-static U32 XXH_swap32 (U32 x)
-{
-    return  ((x << 24) & 0xff000000 ) |
-            ((x <<  8) & 0x00ff0000 ) |
-            ((x >>  8) & 0x0000ff00 ) |
-            ((x >> 24) & 0x000000ff );
-}
-static U64 XXH_swap64 (U64 x)
-{
-    return  ((x << 56) & 0xff00000000000000ULL) |
-            ((x << 40) & 0x00ff000000000000ULL) |
-            ((x << 24) & 0x0000ff0000000000ULL) |
-            ((x << 8)  & 0x000000ff00000000ULL) |
-            ((x >> 8)  & 0x00000000ff000000ULL) |
-            ((x >> 24) & 0x0000000000ff0000ULL) |
-            ((x >> 40) & 0x000000000000ff00ULL) |
-            ((x >> 56) & 0x00000000000000ffULL);
-}
-#endif
-
-
-/***************************************
-*  Architecture Macros
-***************************************/
-typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } XXH_endianess;
-
-/* XXH_CPU_LITTLE_ENDIAN can be defined externally, for example one the compiler command line */
-#ifndef XXH_CPU_LITTLE_ENDIAN
-    static const int one = 1;
-#   define XXH_CPU_LITTLE_ENDIAN   (*(const char*)(&one))
-#endif
-
-
-/*****************************
-*  Memory reads
-*****************************/
-typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment;
-
-FORCE_INLINE U32 XXH_readLE32_align(const void* ptr, XXH_endianess endian, XXH_alignment align)
-{
-    if (align==XXH_unaligned)
-        return endian==XXH_littleEndian ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr));
-    else
-        return endian==XXH_littleEndian ? *(const U32*)ptr : XXH_swap32(*(const U32*)ptr);
-}
-
-FORCE_INLINE U32 XXH_readLE32(const void* ptr, XXH_endianess endian)
-{
-    return XXH_readLE32_align(ptr, endian, XXH_unaligned);
-}
-
-FORCE_INLINE U64 XXH_readLE64_align(const void* ptr, XXH_endianess endian, XXH_alignment align)
-{
-    if (align==XXH_unaligned)
-        return endian==XXH_littleEndian ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr));
-    else
-        return endian==XXH_littleEndian ? *(const U64*)ptr : XXH_swap64(*(const U64*)ptr);
-}
-
-FORCE_INLINE U64 XXH_readLE64(const void* ptr, XXH_endianess endian)
-{
-    return XXH_readLE64_align(ptr, endian, XXH_unaligned);
-}
-
-
-/***************************************
-*  Macros
-***************************************/
-#define XXH_STATIC_ASSERT(c)   { enum { XXH_static_assert = 1/(!!(c)) }; }    /* use only *after* variable declarations */
-
-
-/***************************************
-*  Constants
-***************************************/
-#define PRIME32_1   2654435761U
-#define PRIME32_2   2246822519U
-#define PRIME32_3   3266489917U
-#define PRIME32_4    668265263U
-#define PRIME32_5    374761393U
-
-#define PRIME64_1 11400714785074694791ULL
-#define PRIME64_2 14029467366897019727ULL
-#define PRIME64_3  1609587929392839161ULL
-#define PRIME64_4  9650029242287828579ULL
-#define PRIME64_5  2870177450012600261ULL
-
-
-/*****************************
-*  Simple Hash Functions
-*****************************/
-FORCE_INLINE U32 XXH32_endian_align(const void* input, size_t len, U32 seed, XXH_endianess endian, XXH_alignment align)
-{
-    const BYTE* p = (const BYTE*)input;
-    const BYTE* bEnd = p + len;
-    U32 h32;
-#define XXH_get32bits(p) XXH_readLE32_align(p, endian, align)
-
-#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
-    if (p==NULL)
-    {
-        len=0;
-        bEnd=p=(const BYTE*)(size_t)16;
-    }
-#endif
-
-    if (len>=16)
-    {
-        const BYTE* const limit = bEnd - 16;
-        U32 v1 = seed + PRIME32_1 + PRIME32_2;
-        U32 v2 = seed + PRIME32_2;
-        U32 v3 = seed + 0;
-        U32 v4 = seed - PRIME32_1;
-
-        do
-        {
-            v1 += XXH_get32bits(p) * PRIME32_2;
-            v1 = XXH_rotl32(v1, 13);
-            v1 *= PRIME32_1;
-            p+=4;
-            v2 += XXH_get32bits(p) * PRIME32_2;
-            v2 = XXH_rotl32(v2, 13);
-            v2 *= PRIME32_1;
-            p+=4;
-            v3 += XXH_get32bits(p) * PRIME32_2;
-            v3 = XXH_rotl32(v3, 13);
-            v3 *= PRIME32_1;
-            p+=4;
-            v4 += XXH_get32bits(p) * PRIME32_2;
-            v4 = XXH_rotl32(v4, 13);
-            v4 *= PRIME32_1;
-            p+=4;
-        }
-        while (p<=limit);
-
-        h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
-    }
-    else
-    {
-        h32  = seed + PRIME32_5;
-    }
-
-    h32 += (U32) len;
-
-    while (p+4<=bEnd)
-    {
-        h32 += XXH_get32bits(p) * PRIME32_3;
-        h32  = XXH_rotl32(h32, 17) * PRIME32_4 ;
-        p+=4;
-    }
-
-    while (p<bEnd)
-    {
-        h32 += (*p) * PRIME32_5;
-        h32 = XXH_rotl32(h32, 11) * PRIME32_1 ;
-        p++;
-    }
-
-    h32 ^= h32 >> 15;
-    h32 *= PRIME32_2;
-    h32 ^= h32 >> 13;
-    h32 *= PRIME32_3;
-    h32 ^= h32 >> 16;
-
-    return h32;
-}
-
-
-unsigned int XXH32 (const void* input, size_t len, unsigned int seed)
-{
-#if 0
-    /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
-    XXH32_state_t state;
-    XXH32_reset(&state, seed);
-    XXH32_update(&state, input, len);
-    return XXH32_digest(&state);
-#else
-    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
-
-#  if !defined(XXH_USELESS_ALIGN_BRANCH)
-    if ((((size_t)input) & 3) == 0)   /* Input is 4-bytes aligned, leverage the speed benefit */
-    {
-        if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
-            return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned);
-        else
-            return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned);
-    }
-#  endif
-
-    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
-        return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned);
-    else
-        return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned);
-#endif
-}
-
-FORCE_INLINE U64 XXH64_endian_align(const void* input, size_t len, U64 seed, XXH_endianess endian, XXH_alignment align)
-{
-    const BYTE* p = (const BYTE*)input;
-    const BYTE* bEnd = p + len;
-    U64 h64;
-#define XXH_get64bits(p) XXH_readLE64_align(p, endian, align)
-
-#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
-    if (p==NULL)
-    {
-        len=0;
-        bEnd=p=(const BYTE*)(size_t)32;
-    }
-#endif
-
-    if (len>=32)
-    {
-        const BYTE* const limit = bEnd - 32;
-        U64 v1 = seed + PRIME64_1 + PRIME64_2;
-        U64 v2 = seed + PRIME64_2;
-        U64 v3 = seed + 0;
-        U64 v4 = seed - PRIME64_1;
-
-        do
-        {
-            v1 += XXH_get64bits(p) * PRIME64_2;
-            p+=8;
-            v1 = XXH_rotl64(v1, 31);
-            v1 *= PRIME64_1;
-            v2 += XXH_get64bits(p) * PRIME64_2;
-            p+=8;
-            v2 = XXH_rotl64(v2, 31);
-            v2 *= PRIME64_1;
-            v3 += XXH_get64bits(p) * PRIME64_2;
-            p+=8;
-            v3 = XXH_rotl64(v3, 31);
-            v3 *= PRIME64_1;
-            v4 += XXH_get64bits(p) * PRIME64_2;
-            p+=8;
-            v4 = XXH_rotl64(v4, 31);
-            v4 *= PRIME64_1;
-        }
-        while (p<=limit);
-
-        h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
-
-        v1 *= PRIME64_2;
-        v1 = XXH_rotl64(v1, 31);
-        v1 *= PRIME64_1;
-        h64 ^= v1;
-        h64 = h64 * PRIME64_1 + PRIME64_4;
-
-        v2 *= PRIME64_2;
-        v2 = XXH_rotl64(v2, 31);
-        v2 *= PRIME64_1;
-        h64 ^= v2;
-        h64 = h64 * PRIME64_1 + PRIME64_4;
-
-        v3 *= PRIME64_2;
-        v3 = XXH_rotl64(v3, 31);
-        v3 *= PRIME64_1;
-        h64 ^= v3;
-        h64 = h64 * PRIME64_1 + PRIME64_4;
-
-        v4 *= PRIME64_2;
-        v4 = XXH_rotl64(v4, 31);
-        v4 *= PRIME64_1;
-        h64 ^= v4;
-        h64 = h64 * PRIME64_1 + PRIME64_4;
-    }
-    else
-    {
-        h64  = seed + PRIME64_5;
-    }
-
-    h64 += (U64) len;
-
-    while (p+8<=bEnd)
-    {
-        U64 k1 = XXH_get64bits(p);
-        k1 *= PRIME64_2;
-        k1 = XXH_rotl64(k1,31);
-        k1 *= PRIME64_1;
-        h64 ^= k1;
-        h64 = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4;
-        p+=8;
-    }
-
-    if (p+4<=bEnd)
-    {
-        h64 ^= (U64)(XXH_get32bits(p)) * PRIME64_1;
-        h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3;
-        p+=4;
-    }
-
-    while (p<bEnd)
-    {
-        h64 ^= (*p) * PRIME64_5;
-        h64 = XXH_rotl64(h64, 11) * PRIME64_1;
-        p++;
-    }
-
-    h64 ^= h64 >> 33;
-    h64 *= PRIME64_2;
-    h64 ^= h64 >> 29;
-    h64 *= PRIME64_3;
-    h64 ^= h64 >> 32;
-
-    return h64;
-}
-
-
-unsigned long long XXH64 (const void* input, size_t len, unsigned long long seed)
-{
-#if 0
-    /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
-    XXH64_state_t state;
-    XXH64_reset(&state, seed);
-    XXH64_update(&state, input, len);
-    return XXH64_digest(&state);
-#else
-    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
-
-#  if !defined(XXH_USELESS_ALIGN_BRANCH)
-    if ((((size_t)input) & 7)==0)   /* Input is aligned, let's leverage the speed advantage */
-    {
-        if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
-            return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned);
-        else
-            return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned);
-    }
-#  endif
-
-    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
-        return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned);
-    else
-        return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned);
-#endif
-}
-
-/****************************************************
-*  Advanced Hash Functions
-****************************************************/
-
-/*** Allocation ***/
-typedef struct
-{
-    U64 total_len;
-    U32 seed;
-    U32 v1;
-    U32 v2;
-    U32 v3;
-    U32 v4;
-    U32 mem32[4];   /* defined as U32 for alignment */
-    U32 memsize;
-} XXH_istate32_t;
-
-typedef struct
-{
-    U64 total_len;
-    U64 seed;
-    U64 v1;
-    U64 v2;
-    U64 v3;
-    U64 v4;
-    U64 mem64[4];   /* defined as U64 for alignment */
-    U32 memsize;
-} XXH_istate64_t;
-
-
-XXH32_state_t* XXH32_createState(void)
-{
-    XXH_STATIC_ASSERT(sizeof(XXH32_state_t) >= sizeof(XXH_istate32_t));   /* A compilation error here means XXH32_state_t is not large enough */
-    return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t));
-}
-XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr)
-{
-    XXH_free(statePtr);
-    return XXH_OK;
-}
-
-XXH64_state_t* XXH64_createState(void)
-{
-    XXH_STATIC_ASSERT(sizeof(XXH64_state_t) >= sizeof(XXH_istate64_t));   /* A compilation error here means XXH64_state_t is not large enough */
-    return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t));
-}
-XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr)
-{
-    XXH_free(statePtr);
-    return XXH_OK;
-}
-
-
-/*** Hash feed ***/
-
-XXH_errorcode XXH32_reset(XXH32_state_t* state_in, unsigned int seed)
-{
-    XXH_istate32_t* state = (XXH_istate32_t*) state_in;
-    state->seed = seed;
-    state->v1 = seed + PRIME32_1 + PRIME32_2;
-    state->v2 = seed + PRIME32_2;
-    state->v3 = seed + 0;
-    state->v4 = seed - PRIME32_1;
-    state->total_len = 0;
-    state->memsize = 0;
-    return XXH_OK;
-}
-
-XXH_errorcode XXH64_reset(XXH64_state_t* state_in, unsigned long long seed)
-{
-    XXH_istate64_t* state = (XXH_istate64_t*) state_in;
-    state->seed = seed;
-    state->v1 = seed + PRIME64_1 + PRIME64_2;
-    state->v2 = seed + PRIME64_2;
-    state->v3 = seed + 0;
-    state->v4 = seed - PRIME64_1;
-    state->total_len = 0;
-    state->memsize = 0;
-    return XXH_OK;
-}
-
-
-FORCE_INLINE XXH_errorcode XXH32_update_endian (XXH32_state_t* state_in, const void* input, size_t len, XXH_endianess endian)
-{
-    XXH_istate32_t* state = (XXH_istate32_t *) state_in;
-    const BYTE* p = (const BYTE*)input;
-    const BYTE* const bEnd = p + len;
-
-#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
-    if (input==NULL) return XXH_ERROR;
-#endif
-
-    state->total_len += len;
-
-    if (state->memsize + len < 16)   /* fill in tmp buffer */
-    {
-        XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, len);
-        state->memsize += (U32)len;
-        return XXH_OK;
-    }
-
-    if (state->memsize)   /* some data left from previous update */
-    {
-        XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, 16-state->memsize);
-        {
-            const U32* p32 = state->mem32;
-            state->v1 += XXH_readLE32(p32, endian) * PRIME32_2;
-            state->v1 = XXH_rotl32(state->v1, 13);
-            state->v1 *= PRIME32_1;
-            p32++;
-            state->v2 += XXH_readLE32(p32, endian) * PRIME32_2;
-            state->v2 = XXH_rotl32(state->v2, 13);
-            state->v2 *= PRIME32_1;
-            p32++;
-            state->v3 += XXH_readLE32(p32, endian) * PRIME32_2;
-            state->v3 = XXH_rotl32(state->v3, 13);
-            state->v3 *= PRIME32_1;
-            p32++;
-            state->v4 += XXH_readLE32(p32, endian) * PRIME32_2;
-            state->v4 = XXH_rotl32(state->v4, 13);
-            state->v4 *= PRIME32_1;
-            p32++;
-        }
-        p += 16-state->memsize;
-        state->memsize = 0;
-    }
-
-    if (p <= bEnd-16)
-    {
-        const BYTE* const limit = bEnd - 16;
-        U32 v1 = state->v1;
-        U32 v2 = state->v2;
-        U32 v3 = state->v3;
-        U32 v4 = state->v4;
-
-        do
-        {
-            v1 += XXH_readLE32(p, endian) * PRIME32_2;
-            v1 = XXH_rotl32(v1, 13);
-            v1 *= PRIME32_1;
-            p+=4;
-            v2 += XXH_readLE32(p, endian) * PRIME32_2;
-            v2 = XXH_rotl32(v2, 13);
-            v2 *= PRIME32_1;
-            p+=4;
-            v3 += XXH_readLE32(p, endian) * PRIME32_2;
-            v3 = XXH_rotl32(v3, 13);
-            v3 *= PRIME32_1;
-            p+=4;
-            v4 += XXH_readLE32(p, endian) * PRIME32_2;
-            v4 = XXH_rotl32(v4, 13);
-            v4 *= PRIME32_1;
-            p+=4;
-        }
-        while (p<=limit);
-
-        state->v1 = v1;
-        state->v2 = v2;
-        state->v3 = v3;
-        state->v4 = v4;
-    }
-
-    if (p < bEnd)
-    {
-        XXH_memcpy(state->mem32, p, bEnd-p);
-        state->memsize = (int)(bEnd-p);
-    }
-
-    return XXH_OK;
-}
-
-XXH_errorcode XXH32_update (XXH32_state_t* state_in, const void* input, size_t len)
-{
-    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
-
-    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
-        return XXH32_update_endian(state_in, input, len, XXH_littleEndian);
-    else
-        return XXH32_update_endian(state_in, input, len, XXH_bigEndian);
-}
-
-
-
-FORCE_INLINE U32 XXH32_digest_endian (const XXH32_state_t* state_in, XXH_endianess endian)
-{
-    const XXH_istate32_t* state = (const XXH_istate32_t*) state_in;
-    const BYTE * p = (const BYTE*)state->mem32;
-    const BYTE* bEnd = (const BYTE*)(state->mem32) + state->memsize;
-    U32 h32;
-
-    if (state->total_len >= 16)
-    {
-        h32 = XXH_rotl32(state->v1, 1) + XXH_rotl32(state->v2, 7) + XXH_rotl32(state->v3, 12) + XXH_rotl32(state->v4, 18);
-    }
-    else
-    {
-        h32  = state->seed + PRIME32_5;
-    }
-
-    h32 += (U32) state->total_len;
-
-    while (p+4<=bEnd)
-    {
-        h32 += XXH_readLE32(p, endian) * PRIME32_3;
-        h32  = XXH_rotl32(h32, 17) * PRIME32_4;
-        p+=4;
-    }
-
-    while (p<bEnd)
-    {
-        h32 += (*p) * PRIME32_5;
-        h32 = XXH_rotl32(h32, 11) * PRIME32_1;
-        p++;
-    }
-
-    h32 ^= h32 >> 15;
-    h32 *= PRIME32_2;
-    h32 ^= h32 >> 13;
-    h32 *= PRIME32_3;
-    h32 ^= h32 >> 16;
-
-    return h32;
-}
-
-
-unsigned int XXH32_digest (const XXH32_state_t* state_in)
-{
-    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
-
-    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
-        return XXH32_digest_endian(state_in, XXH_littleEndian);
-    else
-        return XXH32_digest_endian(state_in, XXH_bigEndian);
-}
-
-
-FORCE_INLINE XXH_errorcode XXH64_update_endian (XXH64_state_t* state_in, const void* input, size_t len, XXH_endianess endian)
-{
-    XXH_istate64_t * state = (XXH_istate64_t *) state_in;
-    const BYTE* p = (const BYTE*)input;
-    const BYTE* const bEnd = p + len;
-
-#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
-    if (input==NULL) return XXH_ERROR;
-#endif
-
-    state->total_len += len;
-
-    if (state->memsize + len < 32)   /* fill in tmp buffer */
-    {
-        XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, len);
-        state->memsize += (U32)len;
-        return XXH_OK;
-    }
-
-    if (state->memsize)   /* some data left from previous update */
-    {
-        XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, 32-state->memsize);
-        {
-            const U64* p64 = state->mem64;
-            state->v1 += XXH_readLE64(p64, endian) * PRIME64_2;
-            state->v1 = XXH_rotl64(state->v1, 31);
-            state->v1 *= PRIME64_1;
-            p64++;
-            state->v2 += XXH_readLE64(p64, endian) * PRIME64_2;
-            state->v2 = XXH_rotl64(state->v2, 31);
-            state->v2 *= PRIME64_1;
-            p64++;
-            state->v3 += XXH_readLE64(p64, endian) * PRIME64_2;
-            state->v3 = XXH_rotl64(state->v3, 31);
-            state->v3 *= PRIME64_1;
-            p64++;
-            state->v4 += XXH_readLE64(p64, endian) * PRIME64_2;
-            state->v4 = XXH_rotl64(state->v4, 31);
-            state->v4 *= PRIME64_1;
-            p64++;
-        }
-        p += 32-state->memsize;
-        state->memsize = 0;
-    }
-
-    if (p+32 <= bEnd)
-    {
-        const BYTE* const limit = bEnd - 32;
-        U64 v1 = state->v1;
-        U64 v2 = state->v2;
-        U64 v3 = state->v3;
-        U64 v4 = state->v4;
-
-        do
-        {
-            v1 += XXH_readLE64(p, endian) * PRIME64_2;
-            v1 = XXH_rotl64(v1, 31);
-            v1 *= PRIME64_1;
-            p+=8;
-            v2 += XXH_readLE64(p, endian) * PRIME64_2;
-            v2 = XXH_rotl64(v2, 31);
-            v2 *= PRIME64_1;
-            p+=8;
-            v3 += XXH_readLE64(p, endian) * PRIME64_2;
-            v3 = XXH_rotl64(v3, 31);
-            v3 *= PRIME64_1;
-            p+=8;
-            v4 += XXH_readLE64(p, endian) * PRIME64_2;
-            v4 = XXH_rotl64(v4, 31);
-            v4 *= PRIME64_1;
-            p+=8;
-        }
-        while (p<=limit);
-
-        state->v1 = v1;
-        state->v2 = v2;
-        state->v3 = v3;
-        state->v4 = v4;
-    }
-
-    if (p < bEnd)
-    {
-        XXH_memcpy(state->mem64, p, bEnd-p);
-        state->memsize = (int)(bEnd-p);
-    }
-
-    return XXH_OK;
-}
-
-XXH_errorcode XXH64_update (XXH64_state_t* state_in, const void* input, size_t len)
-{
-    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
-
-    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
-        return XXH64_update_endian(state_in, input, len, XXH_littleEndian);
-    else
-        return XXH64_update_endian(state_in, input, len, XXH_bigEndian);
-}
-
-
-
-FORCE_INLINE U64 XXH64_digest_endian (const XXH64_state_t* state_in, XXH_endianess endian)
-{
-    const XXH_istate64_t * state = (const XXH_istate64_t *) state_in;
-    const BYTE * p = (const BYTE*)state->mem64;
-    const BYTE* bEnd = (const BYTE*)state->mem64 + state->memsize;
-    U64 h64;
-
-    if (state->total_len >= 32)
-    {
-        U64 v1 = state->v1;
-        U64 v2 = state->v2;
-        U64 v3 = state->v3;
-        U64 v4 = state->v4;
-
-        h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
-
-        v1 *= PRIME64_2;
-        v1 = XXH_rotl64(v1, 31);
-        v1 *= PRIME64_1;
-        h64 ^= v1;
-        h64 = h64*PRIME64_1 + PRIME64_4;
-
-        v2 *= PRIME64_2;
-        v2 = XXH_rotl64(v2, 31);
-        v2 *= PRIME64_1;
-        h64 ^= v2;
-        h64 = h64*PRIME64_1 + PRIME64_4;
-
-        v3 *= PRIME64_2;
-        v3 = XXH_rotl64(v3, 31);
-        v3 *= PRIME64_1;
-        h64 ^= v3;
-        h64 = h64*PRIME64_1 + PRIME64_4;
-
-        v4 *= PRIME64_2;
-        v4 = XXH_rotl64(v4, 31);
-        v4 *= PRIME64_1;
-        h64 ^= v4;
-        h64 = h64*PRIME64_1 + PRIME64_4;
-    }
-    else
-    {
-        h64  = state->seed + PRIME64_5;
-    }
-
-    h64 += (U64) state->total_len;
-
-    while (p+8<=bEnd)
-    {
-        U64 k1 = XXH_readLE64(p, endian);
-        k1 *= PRIME64_2;
-        k1 = XXH_rotl64(k1,31);
-        k1 *= PRIME64_1;
-        h64 ^= k1;
-        h64 = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4;
-        p+=8;
-    }
-
-    if (p+4<=bEnd)
-    {
-        h64 ^= (U64)(XXH_readLE32(p, endian)) * PRIME64_1;
-        h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3;
-        p+=4;
-    }
-
-    while (p<bEnd)
-    {
-        h64 ^= (*p) * PRIME64_5;
-        h64 = XXH_rotl64(h64, 11) * PRIME64_1;
-        p++;
-    }
-
-    h64 ^= h64 >> 33;
-    h64 *= PRIME64_2;
-    h64 ^= h64 >> 29;
-    h64 *= PRIME64_3;
-    h64 ^= h64 >> 32;
-
-    return h64;
-}
-
-
-unsigned long long XXH64_digest (const XXH64_state_t* state_in)
-{
-    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
-
-    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
-        return XXH64_digest_endian(state_in, XXH_littleEndian);
-    else
-        return XXH64_digest_endian(state_in, XXH_bigEndian);
-}
-
-
diff --git a/xxhash.h b/xxhash.h
deleted file mode 100644
index c60aa61..0000000
--- a/xxhash.h
+++ /dev/null
@@ -1,192 +0,0 @@
-/*
-   xxHash - Extremely Fast Hash algorithm
-   Header File
-   Copyright (C) 2012-2015, Yann Collet.
-
-   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-
-       * Redistributions of source code must retain the above copyright
-   notice, this list of conditions and the following disclaimer.
-       * Redistributions in binary form must reproduce the above
-   copyright notice, this list of conditions and the following disclaimer
-   in the documentation and/or other materials provided with the
-   distribution.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-   You can contact the author at :
-   - xxHash source repository : https://github.com/Cyan4973/xxHash
-*/
-
-/* Notice extracted from xxHash homepage :
-
-xxHash is an extremely fast Hash algorithm, running at RAM speed limits.
-It also successfully passes all tests from the SMHasher suite.
-
-Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz)
-
-Name            Speed       Q.Score   Author
-xxHash          5.4 GB/s     10
-CrapWow         3.2 GB/s      2       Andrew
-MumurHash 3a    2.7 GB/s     10       Austin Appleby
-SpookyHash      2.0 GB/s     10       Bob Jenkins
-SBox            1.4 GB/s      9       Bret Mulvey
-Lookup3         1.2 GB/s      9       Bob Jenkins
-SuperFastHash   1.2 GB/s      1       Paul Hsieh
-CityHash64      1.05 GB/s    10       Pike & Alakuijala
-FNV             0.55 GB/s     5       Fowler, Noll, Vo
-CRC32           0.43 GB/s     9
-MD5-32          0.33 GB/s    10       Ronald L. Rivest
-SHA1-32         0.28 GB/s    10
-
-Q.Score is a measure of quality of the hash function.
-It depends on successfully passing SMHasher test set.
-10 is a perfect score.
-
-A 64-bits version, named XXH64, is available since r35.
-It offers much better speed, but for 64-bits applications only.
-Name     Speed on 64 bits    Speed on 32 bits
-XXH64       13.8 GB/s            1.9 GB/s
-XXH32        6.8 GB/s            6.0 GB/s
-*/
-
-#pragma once
-
-#if defined (__cplusplus)
-extern "C" {
-#endif
-
-
-/*****************************
-*  Definitions
-*****************************/
-#include <stddef.h>   /* size_t */
-typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;
-
-
-/*****************************
-*  Namespace Emulation
-*****************************/
-/* Motivations :
-
-If you need to include xxHash into your library,
-but wish to avoid xxHash symbols to be present on your library interface
-in an effort to avoid potential name collision if another library also includes xxHash,
-
-you can use XXH_NAMESPACE, which will automatically prefix any symbol from xxHash
-with the value of XXH_NAMESPACE (so avoid to keep it NULL, and avoid numeric values).
-
-Note that no change is required within the calling program :
-it can still call xxHash functions using their regular name.
-They will be automatically translated by this header.
-*/
-#ifdef XXH_NAMESPACE
-#  define XXH_CAT(A,B) A##B
-#  define XXH_NAME2(A,B) XXH_CAT(A,B)
-#  define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32)
-#  define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64)
-#  define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState)
-#  define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState)
-#  define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState)
-#  define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState)
-#  define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset)
-#  define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset)
-#  define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update)
-#  define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update)
-#  define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest)
-#  define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest)
-#endif
-
-
-/*****************************
-*  Simple Hash Functions
-*****************************/
-
-unsigned int       XXH32 (const void* input, size_t length, unsigned seed);
-unsigned long long XXH64 (const void* input, size_t length, unsigned long long seed);
-
-/*
-XXH32() :
-    Calculate the 32-bits hash of sequence "length" bytes stored at memory address "input".
-    The memory between input & input+length must be valid (allocated and read-accessible).
-    "seed" can be used to alter the result predictably.
-    This function successfully passes all SMHasher tests.
-    Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s
-XXH64() :
-    Calculate the 64-bits hash of sequence of length "len" stored at memory address "input".
-    Faster on 64-bits systems. Slower on 32-bits systems.
-*/
-
-
-
-/*****************************
-*  Advanced Hash Functions
-*****************************/
-typedef struct { long long ll[ 6]; } XXH32_state_t;
-typedef struct { long long ll[11]; } XXH64_state_t;
-
-/*
-These structures allow static allocation of XXH states.
-States must then be initialized using XXHnn_reset() before first use.
-
-If you prefer dynamic allocation, please refer to functions below.
-*/
-
-XXH32_state_t* XXH32_createState(void);
-XXH_errorcode  XXH32_freeState(XXH32_state_t* statePtr);
-
-XXH64_state_t* XXH64_createState(void);
-XXH_errorcode  XXH64_freeState(XXH64_state_t* statePtr);
-
-/*
-These functions create and release memory for XXH state.
-States must then be initialized using XXHnn_reset() before first use.
-*/
-
-
-XXH_errorcode XXH32_reset  (XXH32_state_t* statePtr, unsigned seed);
-XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length);
-unsigned int  XXH32_digest (const XXH32_state_t* statePtr);
-
-XXH_errorcode      XXH64_reset  (XXH64_state_t* statePtr, unsigned long long seed);
-XXH_errorcode      XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length);
-unsigned long long XXH64_digest (const XXH64_state_t* statePtr);
-
-/*
-These functions calculate the xxHash of an input provided in multiple smaller packets,
-as opposed to an input provided as a single block.
-
-XXH state space must first be allocated, using either static or dynamic method provided above.
-
-Start a new hash by initializing state with a seed, using XXHnn_reset().
-
-Then, feed the hash state by calling XXHnn_update() as many times as necessary.
-Obviously, input must be valid, meaning allocated and read accessible.
-The function returns an error code, with 0 meaning OK, and any other value meaning there is an error.
-
-Finally, you can produce a hash anytime, by using XXHnn_digest().
-This function returns the final nn-bits hash.
-You can nonetheless continue feeding the hash state with more input,
-and therefore get some new hashes, by calling again XXHnn_digest().
-
-When you are done, don't forget to free XXH state space, using typically XXHnn_freeState().
-*/
-
-
-#if defined (__cplusplus)
-}
-#endif

From 6751943007993e8b3bf4b0453e6642f8ae67e37c Mon Sep 17 00:00:00 2001
From: Nir Heimann <nir@gobeepit.com>
Date: Sun, 20 Dec 2020 13:42:26 +0200
Subject: [PATCH 20/21] Adding again the unsigned hashing option as string (PHP
 doesn't support unsigned long)

---
 config.m4    | 24 ++++++++++++++++++++++++
 php_xxhash.c | 21 +++++++++++++++++++++
 2 files changed, 45 insertions(+)
 create mode 100644 config.m4

diff --git a/config.m4 b/config.m4
new file mode 100644
index 0000000..6a15a0a
--- /dev/null
+++ b/config.m4
@@ -0,0 +1,24 @@
+dnl $Id$
+dnl config.m4 for extension xxhash
+
+dnl Comments in this file start with the string 'dnl'.
+dnl Remove where necessary. This file will not work
+dnl without editing.
+
+dnl If your extension references something external, use with:
+
+dnl PHP_ARG_WITH(xxhash, for xxhash support,
+dnl Make sure that the comment is aligned:
+dnl [  --with-xxhash             Include xxhash support])
+
+dnl Otherwise use enable:
+
+PHP_ARG_ENABLE(xxhash, whether to enable xxhash support,
+dnl Make sure that the comment is aligned:
+[  --enable-xxhash            Enable xxhash support])
+
+if test "$PHP_XXHASH" != "no"; then
+  dnl Write more examples of tests here...
+
+  PHP_NEW_EXTENSION(xxhash, php_xxhash.c, $ext_shared)
+fi
\ No newline at end of file
diff --git a/php_xxhash.c b/php_xxhash.c
index c0f8c78..837258b 100644
--- a/php_xxhash.c
+++ b/php_xxhash.c
@@ -62,9 +62,30 @@ PHP_FUNCTION(xxhash64)
 	RETURN_LONG(sum);
 }
 
+PHP_FUNCTION(xxhash64Unsigned)
+{
+	char *arg = NULL;
+    size_t arg_len, len;
+    zend_string *strg;
+	unsigned long long sum;
+
+    if (zend_parse_parameters(ZEND_NUM_ARGS(), "s", &arg, &arg_len) == FAILURE || arg_len < 1) {
+        return;
+    }
+
+	// compute the checksum
+	sum = XXH64(arg, arg_len, 0);
+	
+	//Since php doesn't have unsinged long values, the value will be returned as string
+	char numberAsAString[21];
+	sprintf(numberAsAString, "%llu",sum);
+	RETURN_STRING(numberAsAString);
+}
+
 const zend_function_entry xxhash_functions[] = {
 	ZEND_FE(xxhash32, NULL)
 	ZEND_FE(xxhash64, NULL)
+	ZEND_FE(xxhash64Unsigned, NULL)
 	PHP_FE_END
 };
 

From 7ab02434922cc4553ca46383b8ff2a77b67361c0 Mon Sep 17 00:00:00 2001
From: Nir Heimann <nir@thehaystackapp.com>
Date: Fri, 4 Oct 2024 16:59:58 +0100
Subject: [PATCH 21/21] Adding arguments info, without doing so a warning will
 be shown on PHP 8

---
 php_xxhash.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/php_xxhash.c b/php_xxhash.c
index 837258b..d47cd6c 100644
--- a/php_xxhash.c
+++ b/php_xxhash.c
@@ -82,10 +82,14 @@ PHP_FUNCTION(xxhash64Unsigned)
 	RETURN_STRING(numberAsAString);
 }
 
+ZEND_BEGIN_ARG_INFO_EX(arginfo_xxhash, 0, 0, 1)
+	ZEND_ARG_TYPE_INFO(0, data, IS_STRING, 0)
+ZEND_END_ARG_INFO()
+
 const zend_function_entry xxhash_functions[] = {
-	ZEND_FE(xxhash32, NULL)
-	ZEND_FE(xxhash64, NULL)
-	ZEND_FE(xxhash64Unsigned, NULL)
+	ZEND_FE(xxhash32, arginfo_xxhash)
+	ZEND_FE(xxhash64, arginfo_xxhash)
+	ZEND_FE(xxhash64Unsigned, arginfo_xxhash)
 	PHP_FE_END
 };