/*
 * Schism Tracker - a cross-platform Impulse Tracker clone
 * copyright (c) 2003-2005 Storlek <storlek@rigelseven.com>
 * copyright (c) 2005-2008 Mrs. Brisby <mrs.brisby@nimh.org>
 * copyright (c) 2009 Storlek & Mrs. Brisby
 * copyright (c) 2010-2012 Storlek
 * URL: http://schismtracker.org/
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

/* This is just a collection of some useful functions. None of these use any
extraneous libraries (e.g. GLib). */

#include "headers.h"
#include "util.h"
#include "osdefs.h"
#include "mem.h"
#include "cpu.h"
#include "mt.h"

#if defined(HAVE_MKSTEMP) && defined(HAVE_FDOPEN) && !defined(SCHISM_WIN32)
# define MKFSTEMP_USE_MKSTEMP 1
# include <unistd.h> /* close(), unlink() for mkfstemp() */
#endif

#ifdef HAVE_UMASK
/* Guards the umask() calls made by util_getumask() and util_setumask().
 * Stays NULL until util_initumask() creates it; both accessors treat NULL
 * as "not initialized" and fail. */
static mt_mutex_t *umask_mutex = NULL;

# ifdef SCHISM_WIN32
/* Windows builds call _umask() wherever this file says umask() */
#  define umask _umask
# endif
#endif

/* Stores the process file-creation mask in *pmask.
 *
 * Returns 0 on success, or -1 if util_initumask() has not been called yet
 * (in which case *pmask is left untouched). Builds without umask() always
 * succeed and report 0022. */
int util_getumask(mode_t *pmask)
{
#ifndef HAVE_UMASK
	/* nothing to query here, so make something up */
	*pmask = 0022;
	return 0;
#else
	mode_t current;

	if (umask_mutex == NULL)
		return -1; /* call util_initumask() */

	/* umask() only hands back the old mask when a new one is installed,
	 * so set a throwaway value and put the old one straight back; the
	 * lock keeps the other util_*umask() callers out of that window */
	mt_mutex_lock(umask_mutex);
	current = umask(0777);
	umask(current);
	mt_mutex_unlock(umask_mutex);

	*pmask = current;
	return 0;
#endif
}

/* Installs `mask` as the process file-creation mask (we don't really need
 * this..).
 *
 * Returns 0 on success. Returns -1 if util_initumask() has not been called,
 * or unconditionally on builds without umask(). */
int util_setumask(mode_t mask)
{
#ifndef HAVE_UMASK
	/* nothing to set on this platform */
	(void)mask;
	return -1;
#else
	if (umask_mutex == NULL)
		return -1; /* call util_initumask() */

	mt_mutex_lock(umask_mutex);
	umask(mask);
	mt_mutex_unlock(umask_mutex);

	return 0;
#endif
}

/* Creates the mutex that util_getumask() and util_setumask() rely on.
 *
 * Returns 0 on success and -1 if the mutex could not be created. Builds
 * without umask() have nothing to set up and always return 0. */
int util_initumask(void)
{
	int status = 0;

#ifdef HAVE_UMASK
	umask_mutex = mt_mutex_create();
	if (umask_mutex == NULL)
		status = -1;
#endif

	return status;
}

/* Destroys the mutex created by util_initumask().
 *
 * The pointer is reset to NULL afterwards: util_getumask() and
 * util_setumask() use a NULL mutex as their "not initialized" test, so
 * leaving the stale pointer behind would let them lock a deleted mutex,
 * and a second call to this function would delete it twice. Safe to call
 * when util_initumask() was never called or failed. */
void util_quitumask(void)
{
#ifdef HAVE_UMASK
	if (umask_mutex) {
		mt_mutex_delete(umask_mutex);
		umask_mutex = NULL;
	}
#endif
}

/* This function is roughly equivalent to the mkstemp() function on POSIX
 * operating systems, but instead of returning a file descriptor it returns
 * a C stdio file pointer opened in "w+b" mode.
 *
 * `template` must be a writable string whose last six characters are
 * "XXXXXX"; they are overwritten in place with the generated name.
 *
 * Returns the open stream, or NULL on failure. In the fallback
 * implementation a malformed template fails with errno set to EINVAL and
 * the template untouched, and running out of candidate names sets
 * template[0] to '\0'. */
FILE *mkfstemp(char *template)
{
#ifdef MKFSTEMP_USE_MKSTEMP
	int fd;
	FILE *fp;

	fd = mkstemp(template);
	if (fd < 0)
		return NULL;

	/* By definition an fclose() on a FILE * returned by fdopen() will close
	 * the file descriptor as well, so on success the stream owns fd. */
	fp = fdopen(fd, "w+b");
	if (!fp) {
		/* fdopen() failed, so nothing owns the descriptor and the caller
		 * only sees NULL: release the descriptor and the file mkstemp()
		 * created, and report fdopen()'s error rather than whatever the
		 * cleanup calls leave in errno. */
		int saved_errno = errno;

		close(fd);
		unlink(template);
		errno = saved_errno;
	}

	return fp;
#else
	static const char letters[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
	static const size_t letters_len = ARRAY_SIZE(letters) - 1;
	static const char placeholder[] = "XXXXXX";
	static const size_t placeholder_len = sizeof(placeholder) - 1;
	static uint64_t value;
	char *suffix;
	size_t len;
	int count;

	len = strlen(template);
	if (len < placeholder_len
			|| strcmp(&template[len - placeholder_len], placeholder)) {
		errno = EINVAL;
		return NULL;
	}

	/* This is where the Xs start.  */
	suffix = &template[len - placeholder_len];

	/* rand() is already initialized by the time we use this function */
	value += rand();

	for (count = 0; count < TMP_MAX; ++count) {
		uint64_t v = value;
		size_t i;
		FILE *fp;

		/* Fill in the random bits.  */
		for (i = 0; i < placeholder_len; i++) {
			suffix[i] = letters[v % letters_len];
			v /= letters_len;
		}

		// NOTE: C11 added a new subspecifier "x" that
		// can be used to fail if the file exists. this
		// isn't very useful for us though, since we're
		// C99... so probe for the name first; this leaves
		// a window between the probe and the create.
		fp = os_fopen(template, "rb");
		if (fp) {
			// the name is taken: close the probe stream
			// (otherwise every collision leaks a FILE) and
			// move on to the next candidate
			fclose(fp);
		} else {
			// it doesn't exist! open in write mode
			fp = os_fopen(template, "w+b");
			if (fp)
				return fp;
		}

		/* This is a random value.  It is only necessary that the next
		 * TMP_MAX values generated by adding 7777 to VALUE are different
		 * with (modulo 2^32).  */
		value += 7777;
	}

	/* We return the null string if we can't find a unique file name.  */
	template[0] = '\0';
	return NULL;
#endif
}

/* Sets environment variable `name` to `val`, replacing any existing value;
 * a NULL `val` removes the variable instead. Failures of setenv() and
 * unsetenv() are deliberately ignored. */
static void util_envvar_helper(const char *name, const char *val)
{
	if (!val) {
		(void)unsetenv(name);
		return;
	}

	(void)setenv(name, val, 1);
}

/* Runs cb(p) with environment variable `name` temporarily set to `val`
 * (or removed, when `val` is NULL), then puts the variable back the way it
 * was: its previous value if it had one, unset otherwise.
 *
 * Returns whatever cb returned. `name` must not be NULL.
 * This is used for hacking around SDL's envvar handling. */
int util_call_func_with_envvar(int (*cb)(void *p), void *p, const char *name,
	const char *val)
{
	const char *current;
	char *saved = NULL;
	int result;

	SCHISM_RUNTIME_ASSERT(name != NULL, "need an envvar to set");

	/* keep a private copy of the old value: the string getenv() returns
	 * may not survive the environment changes made below */
	current = getenv(name);
	if (current)
		saved = str_dup(current);

	/* XXX: should `val` being NULL unset the envvar, or just do nothing ? */
	util_envvar_helper(name, val);

	result = cb(p);

	/* restore the old value; a NULL `saved` unsets the variable again */
	util_envvar_helper(name, saved);
	free(saved);

	/* forward the callback's result, error or not */
	return result;
}

/* ------------------------------------------------------------------------ */

/* Formats `fmt` with the arguments in `ap` and shows the result through
 * os_show_message_box() using the given title and style.
 *
 * Returns -1 if the message could not be formatted (nothing is shown),
 * and 0 otherwise. */
int msgboxv(int style, const char *title, const char *fmt, va_list ap)
{
	char *text;
	int formatted;

	formatted = vasprintf(&text, fmt, ap);
	if (formatted < 0)
		return -1;

	os_show_message_box(title, text, style);
	free(text);

	return 0;
}

/* printf-style front end for msgboxv(): gathers the variadic arguments and
 * forwards them. Returns msgboxv()'s result. */
int msgbox(int style, const char *title, const char *fmt, ...)
{
	int result;
	va_list args;

	va_start(args, fmt);
	result = msgboxv(style, title, fmt, args);
	va_end(args);

	return result;
}

/* ------------------------------------------------------------------------ */
/* mem_xor: XORs all of the bytes in vbuf.
 * uses SIMD-enhanced versions if possible. */

#define MEM_XOR(attributes, name) \
	attributes static void mem_xor_##name(void *vbuf, size_t len, unsigned char c) \
	{ \
		unsigned char *buf = vbuf; \
	\
		if (len >= 4) { \
			size_t len8; \
			uint32_t cccc; \
	\
			/* expand to all bytes */ \
			cccc = c; \
			cccc |= (cccc << 8); \
			cccc |= (cccc << 16); \
	\
			/* align the pointer */ \
			for (; (uintptr_t)buf % sizeof(uint32_t); len--) \
				*(buf++) ^= c; \
	\
			/* process in chunks of 8 32-bit integers */ \
			for (len8 = (len / (sizeof(uint32_t) * 8)); len8 > 0; len8--) { \
				((uint32_t *)buf)[0] ^= cccc; \
				((uint32_t *)buf)[1] ^= cccc; \
				((uint32_t *)buf)[2] ^= cccc; \
				((uint32_t *)buf)[3] ^= cccc; \
				((uint32_t *)buf)[4] ^= cccc; \
				((uint32_t *)buf)[5] ^= cccc; \
				((uint32_t *)buf)[6] ^= cccc; \
				((uint32_t *)buf)[7] ^= cccc; \
				buf += (8 * sizeof(uint32_t)); \
			} \
			len %= (sizeof(uint32_t) * 8); \
	\
			/* process in chunks of 32-bit integers */ \
			for (len8 = len / sizeof(uint32_t); len8 > 0; len8--) { \
				((uint32_t *)buf)[0] ^= cccc; \
				buf += sizeof(uint32_t); \
			} \
			len %= sizeof(uint32_t); \
		} \
	\
		/* process any that remain */ \
		for (; len > 0; len--) \
			*(buf++) ^= c; \
	}

/* Instantiate one mem_xor_<name>() per instruction set enabled for this
 * build (SCHISM_SSE2 / SCHISM_AVX2 / SCHISM_ALTIVEC), but only when
 * SCHISM_GNUC_HAS_ATTRIBUTE(__target__, 4, 4, 0) holds -- presumably meaning
 * the compiler understands __attribute__((__target__)). Each MEM_XOR_<ISA>
 * macro records that the matching function exists, so that mem_xor() can
 * test for it. */
#if SCHISM_GNUC_HAS_ATTRIBUTE(__target__, 4, 4, 0)
# ifdef SCHISM_SSE2
MEM_XOR(__attribute__((__target__("sse2"))), sse2)
#  define MEM_XOR_SSE2
# endif
# ifdef SCHISM_AVX2
MEM_XOR(__attribute__((__target__("avx2"))), avx2)
#  define MEM_XOR_AVX2
# endif
# ifdef SCHISM_ALTIVEC
MEM_XOR(__attribute__((__target__("altivec"))), altivec)
#  define MEM_XOR_ALTIVEC
# endif
#endif

/* mem_xor_c(): no extra attributes, always built; this is the fallback */
MEM_XOR(/* nothing */, c)

/* the generator is not needed past this point */
#undef MEM_XOR

/* XORs each of the `len` bytes starting at `vbuf` with `c`, in place.
 *
 * Picks the first variant that was both compiled in and is reported by
 * cpu_has_feature(), trying AVX2, then SSE2, then AltiVec, and otherwise
 * uses the plain C implementation. */
void mem_xor(void *vbuf, size_t len, unsigned char c)
{
#ifdef MEM_XOR_AVX2
	if (cpu_has_feature(CPU_FEATURE_AVX2)) {
		mem_xor_avx2(vbuf, len, c);
	} else
#endif
#ifdef MEM_XOR_SSE2
	if (cpu_has_feature(CPU_FEATURE_SSE2)) {
		mem_xor_sse2(vbuf, len, c);
	} else
#endif
#ifdef MEM_XOR_ALTIVEC
	if (cpu_has_feature(CPU_FEATURE_ALTIVEC)) {
		mem_xor_altivec(vbuf, len, c);
	} else
#endif
	{
		/* fallback to plain C implementation */
		mem_xor_c(vbuf, len, c);
	}
}

#undef MEM_XOR_AVX2
#undef MEM_XOR_SSE2
#undef MEM_XOR_ALTIVEC

/* ------------------------------------------------------------------------ */
/* min/max of a buffer.
 *
 * for a 16-bit sample, SSE2 is about four times as fast as plain C.
 * AVX2 is about twice as fast as SSE2. */

/* MINMAX_C(BITS) generates the scalar reference routine
 *   minmax_<BITS>_c(data, len, min, max, stride)
 * which looks at data[0], data[stride], data[2 * stride], ... for every
 * index below `len`, lowering *min and raising *max whenever a sample lies
 * outside the range they already describe. *min and *max are in/out: the
 * caller seeds them and they are never reset here. */
#define MINMAX_C(BITS) \
	static inline SCHISM_ALWAYS_INLINE \
	void minmax_##BITS##_c(const int##BITS##_t *data, size_t len, \
		int##BITS##_t *min, int##BITS##_t *max, size_t stride) \
	{ \
		size_t pos = 0; \
	\
		while (pos < len) { \
			const int##BITS##_t sample = data[pos]; \
	\
			if (sample < *min) \
				*min = sample; \
			if (sample > *max) \
				*max = sample; \
	\
			pos += stride; \
		} \
	}

/* scalar routines for 8-, 16- and 32-bit signed samples */
MINMAX_C(8)
MINMAX_C(16)
MINMAX_C(32)

#undef MINMAX_C

#if SCHISM_GNUC_HAS_ATTRIBUTE(__target__, 4, 4, 0) \
	&& !defined(SCHISM_XBOX) /* XBOX is hardcoded to i586 */ \
	&& (defined(__x86_64__) || defined(__i386__)) /* clang on macosx LIES */

# include <immintrin.h>

# define MINMAX_X86_INTRINSICS(TARGET, NAME, TYPE, BITS, SIZE, VARS, PREFIX, SUFFIX, PREPROCESS, SET1, LOADU, MIN, MAX, STORE) \
	__attribute__((__target__(#TARGET))) \
	static void minmax_##BITS##_##NAME(const int##BITS##_t *buf, size_t len, int##BITS##_t *min, int##BITS##_t *max, size_t stride) \
	{ \
		size_t i; \
	\
		if (!len) return; /* wat */ \
	\
		if (len >= SIZE \
				&& stride < SIZE /* stride cannot be over SIZE */ \
				&& !(stride & (stride - 1))) /* stride must be a power of 2 */ \
		{ \
			size_t xlen; \
			TYPE vmin; \
			TYPE vmax; \
			__attribute__((__aligned__(SIZE * (BITS / 8)))) int##BITS##_t amin[SIZE]; \
			__attribute__((__aligned__(SIZE * (BITS / 8)))) int##BITS##_t amax[SIZE]; \
			VARS \
\
			PREFIX \
\
			/* load the min and unsign it */ \
			vmin = SET1(*min); \
			vmax = SET1(*max); \
\
			/* kludge it in */ \
			for (xlen = len / SIZE; xlen > 0; xlen--) { \
				TYPE x; \
\
				x = LOADU((const TYPE *)buf); \
				PREPROCESS \
\
				vmin = MIN(vmin, x); \
				vmax = MAX(vmax, x); \
\
				buf += SIZE; \
			} \
\
			len %= SIZE; \
\
			SUFFIX \
\
			/* TODO: do this in the actual vector so that \
			 * we can just extract the first value */ \
			STORE((TYPE *)amin, vmin); \
			STORE((TYPE *)amax, vmax); \
\
			for (i = 0; i < SIZE; i += stride) { \
				if (amin[i] < *min) *min = amin[i]; \
				if (amax[i] > *max) *max = amax[i]; \
			} \
		} \
\
		/* process the rest */ \
		minmax_##BITS##_c(buf, len, min, max, stride); \
	}

/* Instantiations of MINMAX_X86_INTRINSICS, one group per instruction set
 * enabled for this build. Each MINMAX_<ISA> macro records that the group's
 * functions exist so that minmax_8() / minmax_16() can test for them. */
# ifdef SCHISM_SSE2
/* circa 2000 (pentium 4) */

/* 8-bit SSE2
 *
 * Only the unsigned byte min/max (_mm_min_epu8 / _mm_max_epu8) is used
 * here, so every value has its top bit flipped (XOR 0x80) on the way in;
 * that maps the signed ordering onto the unsigned one. The flip is undone
 * on the result vectors and on *min / *max afterwards. */
MINMAX_X86_INTRINSICS(sse2, sse2, __m128i, 8, 16,
	/* vars */
	__m128i msb;,
{
	/* prefix: build the bias vector and bias the caller's min/max */
	msb = _mm_set1_epi8(0x80);

	*min ^= 0x80;
	*max ^= 0x80;
}, {
	/* suffix: remove the bias from the results and from min/max */
	vmin = _mm_xor_si128(vmin, msb);
	vmax = _mm_xor_si128(vmax, msb);

	*min ^= 0x80;
	*max ^= 0x80;
}, {
	/* process: bias each loaded vector */
	x = _mm_xor_si128(x, msb);
}, _mm_set1_epi8, _mm_loadu_si128, _mm_min_epu8, _mm_max_epu8, _mm_store_si128)

/* 16-bit SSE2: signed 16-bit min/max is used directly, no hooks needed */
MINMAX_X86_INTRINSICS(sse2, sse2, __m128i, 16, 8,
	/* nothing */, /* nothing */, /* nothing */, /* nothing */,
	_mm_set1_epi16, _mm_loadu_si128, _mm_min_epi16, _mm_max_epi16, _mm_store_si128)

#  define MINMAX_SSE2
# endif
# ifdef SCHISM_SSE41
/* circa 2006, simply adds min/max for signed 8-bit so we don't have to XOR */

/* 8-bit SSE 4.1 (there is no 16-bit variant in this group) */
MINMAX_X86_INTRINSICS(sse4.1, sse41, __m128i, 8, 16,
	/* nothing */, /* nothing */, /* nothing */, /* nothing */,
	_mm_set1_epi8, _mm_loadu_si128, _mm_min_epi8, _mm_max_epi8, _mm_store_si128)
#  define MINMAX_SSE41
# endif
# ifdef SCHISM_AVX2
/* circa 2011 */

/* 8-bit AVX2: 32 samples per vector */
MINMAX_X86_INTRINSICS(avx2, avx2, __m256i, 8, 32,
	/* nothing */, /* nothing */, /* nothing */, /* nothing */,
	_mm256_set1_epi8, _mm256_loadu_si256, _mm256_min_epi8, _mm256_max_epi8, _mm256_store_si256)

/* 16-bit AVX2: 16 samples per vector */
MINMAX_X86_INTRINSICS(avx2, avx2, __m256i, 16, 16,
	/* nothing */, /* nothing */, /* nothing */, /* nothing */,
	_mm256_set1_epi16, _mm256_loadu_si256, _mm256_min_epi16, _mm256_max_epi16, _mm256_store_si256)

#  define MINMAX_AVX2
# endif
# ifdef SCHISM_AVX512BW
/* circa 2016, super fast */

/* 8-bit AVX-512BW: 64 samples per vector */
MINMAX_X86_INTRINSICS(avx512bw, avx512bw, __m512i, 8, 64,
	/* nothing */, /* nothing */, /* nothing */, /* nothing */,
	_mm512_set1_epi8, _mm512_loadu_si512, _mm512_min_epi8, _mm512_max_epi8, _mm512_store_si512)

/* 16-bit AVX-512BW: 32 samples per vector */
MINMAX_X86_INTRINSICS(avx512bw, avx512bw, __m512i, 16, 32,
	/* nothing */, /* nothing */, /* nothing */, /* nothing */,
	_mm512_set1_epi16, _mm512_loadu_si512, _mm512_min_epi16, _mm512_max_epi16, _mm512_store_si512)

#  define MINMAX_AVX512BW
# endif
#endif

/* Widens the range [*min, *max] so that it covers every `stride`-th sample
 * among the first `len` elements of `buf` (8-bit signed samples). *min and
 * *max are in/out and must be seeded by the caller.
 *
 * Uses the first implementation that is both compiled in and reported by
 * cpu_has_feature(): AVX-512BW, AVX2, SSE4.1, SSE2, then plain C. */
void minmax_8(const int8_t *buf, size_t len, int8_t *min, int8_t *max,
	size_t stride)
{
	/* NOTE: for stride > 2, plain C code is faster than avx2 or sse2. */
#ifdef MINMAX_AVX512BW
	if (cpu_has_feature(CPU_FEATURE_AVX512BW)) {
		minmax_8_avx512bw(buf, len, min, max, stride);
	} else
#endif
#ifdef MINMAX_AVX2
	if (cpu_has_feature(CPU_FEATURE_AVX2)) {
		minmax_8_avx2(buf, len, min, max, stride);
	} else
#endif
#ifdef MINMAX_SSE41
	if (cpu_has_feature(CPU_FEATURE_SSE41)) {
		minmax_8_sse41(buf, len, min, max, stride);
	} else
#endif
#ifdef MINMAX_SSE2
	if (cpu_has_feature(CPU_FEATURE_SSE2)) {
		minmax_8_sse2(buf, len, min, max, stride);
	} else
#endif
	{
		/* no usable vector implementation */
		minmax_8_c(buf, len, min, max, stride);
	}
}

/* Widens the range [*min, *max] so that it covers every `stride`-th sample
 * among the first `len` elements of `buf` (16-bit signed samples). *min and
 * *max are in/out and must be seeded by the caller.
 *
 * Uses the first implementation that is both compiled in and reported by
 * cpu_has_feature(): AVX-512BW, AVX2, SSE2, then plain C. */
void minmax_16(const int16_t *buf, size_t len, int16_t *min, int16_t *max,
	size_t stride)
{
	/* NOTE: for stride > 2, plain C code is faster than avx2 or sse2. */
#ifdef MINMAX_AVX512BW
	if (cpu_has_feature(CPU_FEATURE_AVX512BW)) {
		minmax_16_avx512bw(buf, len, min, max, stride);
	} else
#endif
#ifdef MINMAX_AVX2
	if (cpu_has_feature(CPU_FEATURE_AVX2)) {
		minmax_16_avx2(buf, len, min, max, stride);
	} else
#endif
#ifdef MINMAX_SSE2
	if (cpu_has_feature(CPU_FEATURE_SSE2)) {
		minmax_16_sse2(buf, len, min, max, stride);
	} else
#endif
	{
		/* no usable vector implementation */
		minmax_16_c(buf, len, min, max, stride);
	}
}

/* 32-bit counterpart of minmax_8() / minmax_16(): hands the whole job to
 * the scalar routine minmax_32_c(), passing every argument through
 * unchanged. There is no CPU-feature dispatch here. */
void minmax_32(const int32_t *buf, size_t len, int32_t *min, int32_t *max,
	size_t stride)
{
	/* TODO: vectorized versions. */
	minmax_32_c(buf, len, min, max, stride);
}
