
#ifndef RAYTRACE_COMMONS_H
#define RAYTRACE_COMMONS_H

#include <shaders/commons_hlsl.glsl>
#include "raytrace_setup.glsl"

uint rt_calculate_grid_cell_idx(uint cx, uint cy, uint cz)
{
	return cx + cy * GRID_RES + cz * GRID_RES  * GRID_RES;
}

uint rt_calculate_grid_high_cell_idx(uint cx, uint cy, uint cz)
{
	return (cx/4) + (cy/4) * (GRID_RES/4) + (cz/4) * (GRID_RES/4)  * (GRID_RES/4);
}

// -- faces link list handling --------------------------------------------------------------------------------------

layout (std430) BUFF_ATTR buffer FacesLinkedListBuffer {
	uint buffer_counter;									// global 'allocation' counter
	uint occupied_cell_counter;
	uint _pad0;
	uint _pad1;
	uint _pad2;
	uint _pad3;
	uint _pad4;
	uint _pad5;
	uint node_buffer[];										// first element in the bucket is the alloc
} in_faces_list_data;

layout (std430) BUFF_ATTR buffer FacesLinkedListTailsBuffer {
    uint in_faces_list_tails_data[];						// location for each cell's bucked in 'in_faces_list_data'
};

#ifndef RT_READ_ONLY

void rt_link_list_init(uint list_index)
{
	// this is called in separate pass so lets be lazy. maybe it will be worth to optimize this
	in_faces_list_tails_data[list_index] = 0;
}

void rt_link_list_push_value(uint list_index, uint value)
{
	// allocate
	uint head = in_faces_list_tails_data[list_index];
	//if (in_faces_list_data.node_buffer[head] != -1)
	{
		uint cntr = atomicAdd(in_faces_list_data.node_buffer[head], 1);
		in_faces_list_data.node_buffer[head + cntr + 1] = value;
	}
}

uint rt_link_list_increase_count(uint list_index)
{
	return atomicAdd(in_faces_list_tails_data[list_index], 1);
}

uint rt_link_list_get_count(uint list_index)
{
	return in_faces_list_tails_data[list_index];
}

#endif

void rt_pack_grid_marker_to_uint_img(ivec3 p, out ivec3 p_packed, out uint bit)
{
	// pack into 32bit, so we can scramble 5 lowest bits. use 4*4*2 box, 2:2:1 bits
	ivec3 p_low_bits = p & ivec3(3, 3, 1);

	p_packed = ivec3(p.x >> 2, p.y >> 2, p.z >> 1);
	bit = uint(p_low_bits.x + p_low_bits.y * 4 + p_low_bits.z * 4 * 4);
}

void rt_pack_grid_marker_to_uint_buff(uvec3 p, uint grid_res, out uint p_packed, out uint bit)
{
	// swizzle so that each 4x4x2 block is linearized (32bits into single uint)
	uvec3 p_low_bits  = p & uvec3(3, 3, 1);
	uvec3 p_high_bits = uvec3(p.x >> 2, p.y >> 2, p.z >> 1);

	uint high_linearized = p_high_bits.x + p_high_bits.y * (grid_res / 4) + p_high_bits.z * (grid_res / 4) * (grid_res / 4);
	
	p_packed = high_linearized;
	bit = uint(p_low_bits.x + p_low_bits.y * 4 + p_low_bits.z * 4 * 4);
}

uint rt_pack_2x2x2_grid_to_index(ivec3 p)
{
	// swizzle so that each 2x2x2 block is linearized
	ivec3 p_low_bits  = p & ivec3(1, 1, 1);
	ivec3 p_high_bits = ivec3(p.x >> 1, p.y >> 1, p.z >> 1);

	uint low_linearized  = p_low_bits.x  + p_low_bits.y * 2                + p_low_bits.z * 4;		// 0...7
	uint high_linearized = p_high_bits.x + p_high_bits.y * (GRID_RES / 2)  + p_low_bits.z * (GRID_RES / 2) * (GRID_RES / 2);
	
	return (high_linearized << 3) + low_linearized;
}

//#ifdef RT_USE_TEXTURE_GRID_MARKERS

//void rt_write_grid_marker_high_res(restrict uimage3D markers, ivec3 pos)
//{
//	// TODO:swizzle pos in both write and read, after useage is unified
//	imageStore(markers, pos, ivec4(1));
//}

#define rt_write_grid_marker_high_res(markers, pos) \
do{ \
	ivec3 pos_packed;	\
	uint  bit; \
	rt_pack_grid_marker_to_uint_img(pos, pos_packed, bit);	\
	imageAtomicOr(markers, pos_packed, (1U << bit)); \
} while(false)

#define rt_write_grid_marker_low_res(markers, pos) \
do{ \
	ivec3 pos_packed; \
	uint  bit;	\
	rt_pack_grid_marker_to_uint_img(pos, pos_packed, bit);	\
	imageAtomicOr(markers, pos_packed, (1U << bit)); \
} \
while(false)

//uint rt_read_grid_marker_high_res(layout(r32ui) restrict uimage3D markers, ivec3 pos)
//{
//	return imageLoad(markers, pos).r;
//}

bool rt_read_grid_marker_high_res(usampler3D markers, ivec3 pos)
{
	#if 0
	return texelFetch(markers, pos, 0).r != 0 ? true : false;
	#else
	ivec3 pos_packed;
	uint  bit;
	rt_pack_grid_marker_to_uint_img(pos, pos_packed, bit);
	return (texelFetch(markers, pos_packed, 0).r & (1U << bit)) != 0 ? true : false;
	#endif
}

bool rt_read_grid_marker_low_res(usampler3D markers, ivec3 pos)
{
	#if 0
	return texelFetch(markers, pos, 2).r != 0 ? true : false;
	#else
	ivec3 pos_packed;
	uint  bit;
	rt_pack_grid_marker_to_uint_img(pos, pos_packed, bit);
	return (texelFetch(markers, pos_packed, 2).r & (1 << bit)) != 0 ? true : false;
	#endif
}

//#endif
//#else // RT_USE_TEXTURE_GRID_MARKERS == false

bool _rt_grid_marker_validate(uint64_t markers_ptr)
{
#ifdef _DEBUG
	if (markers_ptr == 0)
	{
		#if defined(GL_EXT_debug_printf)
		debugPrintfEXT("Error grid markers buffer is not specified\\n");
		#endif
		return false;
	}
#endif
	return true;
}

#ifndef RT_READ_ONLY
void _rt_write_grid_marker_high_res(uint64_t markers_ptr, ivec3 pos)
{
	if (_rt_grid_marker_validate(markers_ptr) == false)
		return;

	uvec3 upos = uvec3(pos);
	if (upos.x >= GRID_RES || upos.y >= GRID_RES || upos.z >= GRID_RES)
		return;

	uint pos_packed;
	uint bit;
	rt_pack_grid_marker_to_uint_buff(upos, GRID_RES, pos_packed, bit);
	RT_BuffPointerToU32 markers = RT_BuffPointerToU32(markers_ptr + pos_packed * 4);
	// don't compare with previous state as we gate the atomic anyway by checking if this is first write (faces counter)
	atomicOr(markers.v, (1U << bit));
}

void _rt_write_grid_marker_low_res(uint64_t markers_ptr, ivec3 pos)
{
	if (_rt_grid_marker_validate(markers_ptr) == false)
		return;

	uvec3 upos = uvec3(pos);
	if (upos.x >= GRID_RES / 4 || upos.y >= GRID_RES / 4 || upos.z >= GRID_RES / 4)
		return;

	uint pos_packed;
	uint bit;
	rt_pack_grid_marker_to_uint_buff(upos, GRID_RES / 4, pos_packed, bit);
	RT_BuffPointerToU32 markers = RT_BuffPointerToU32(markers_ptr + pos_packed * 4 + (GRID_RES) * (GRID_RES) * (GRID_RES) / 8);
	if ((markers.v & (1U << bit)) == 0)
		atomicOr(markers.v, (1U << bit));
}
#endif

bool _rt_read_grid_marker_high_res(uint64_t markers_ptr, ivec3 pos)
{
	if (_rt_grid_marker_validate(markers_ptr) == false)
		return false;

	uvec3 upos = uvec3(pos);
	if (max(upos.x, max(upos.y, upos.z)) >= GRID_RES)
		return false;

	uint pos_packed;
	uint bit;
	rt_pack_grid_marker_to_uint_buff(upos, GRID_RES, pos_packed, bit);
	RT_BuffPointerToU32 markers = RT_BuffPointerToU32(markers_ptr + pos_packed * 4);
	return (markers.v & (1U << bit)) != 0 ? true : false;
}

bool _rt_read_grid_marker_low_res(uint64_t markers_ptr, ivec3 pos)
{
	if (_rt_grid_marker_validate(markers_ptr) == false)
		return false;

	uvec3 upos = uvec3(pos);
	if (max(upos.x, max(upos.y, upos.z)) >= GRID_RES / 4)
		return false;

	uint pos_packed;
	uint bit;
	rt_pack_grid_marker_to_uint_buff(upos, GRID_RES / 4, pos_packed, bit);
	RT_BuffPointerToU32 markers = RT_BuffPointerToU32(markers_ptr + pos_packed * 4 + (GRID_RES) * (GRID_RES) * (GRID_RES) / 8);
	return (markers.v & (1 << bit)) != 0 ? true : false;
}

// #endif // RT_USE_TEXTURE_GRID_MARKERS

// ------------------------------------------------------------------------------------------------------------------

vec3 rt_get_vertex(uint idx)
{
	uint coord_offset = idx * VERTEX_COORD_STRIDE_FLOATS + VERTEX_COORD_OFFSET_FLOATS;
	vec3 p = vec3(in_vtx_data[coord_offset + 0], in_vtx_data[coord_offset + 1], in_vtx_data[coord_offset + 2]);

	return p;
}

// NOTE: Needs to match put_normal_raytrace() in 0texture_geometry_transformation.gsh
vec3 rt_get_vertex_normal(uint idx)
{
	uint normal_offset = idx * VERTEX_NORMAL_STRIDE_FLOATS + VERTEX_NORMAL_OFFSET_FLOATS;
	uint n1 = asuint(in_vtx_data[normal_offset + 0]);
	uint n2 = asuint(in_vtx_data[normal_offset + 1]);
	vec3 p = vec3(unpackSnorm2x16(n1).xy,unpackSnorm2x16(n2).x); 
	return p;
}

vec2 rt_get_vertex_uv0(uint idx)
{
	uint uv0_offset = idx * VERTEX_UV0_STRIDE_FLOATS + VERTEX_UV0_OFFSET_FLOATS;
	vec2 p = vec2(in_vtx_data[uv0_offset + 0], in_vtx_data[uv0_offset + 1]);
	return p;
}

vec4 rt_get_vertex_color(uint idx)
{
	uint color_offset = idx * VERTEX_COLOR_STRIDE_FLOATS + VERTEX_COLOR_OFFSET_FLOATS;
	uint packed_color = floatBitsToUint(in_vtx_data[color_offset + 0]);
	vec4 p = vec4((packed_color >> 24) & 0xff, (packed_color >> 16) & 0xff, (packed_color >> 8) & 0xff, (packed_color >> 0) & 0xff) * (1.0 / 255.0);
	return p;
}

// NOTE: We are currently storing material per face, so TODO: Rename these consts
int rt_get_triangle_material(uint idx)
{
	int p = int(transformed_data_faces[idx].material_idx);
	return p;
}

// ------------------------------------------------------------------------------------------------------------------
void rt_set_face_idx_material_flags(uint face_idx, uint v0, uint v1, uint v2, uint flags)
{
	uint flags0 = (flags >> 0) & 0xff;
	uint flags1 = (flags >> 8) & 0xff;
	v0 = (flags0 << 24) | v0;
	v1 = (flags1 << 24) | v1;
	transformed_data_indices[face_idx * 3 + 0] = v0;
	transformed_data_indices[face_idx * 3 + 1] = v1;
	transformed_data_indices[face_idx * 3 + 2] = v2;
}

struct RTFace
{
	uint v0;
	uint v1;
	uint v2;
	uint material_flags;
};

RTFace rt_get_face(uint face_idx)
{
	uint v0 = transformed_data_indices[face_idx * 3 + 0];
	uint v1 = transformed_data_indices[face_idx * 3 + 1];
	uint v2 = transformed_data_indices[face_idx * 3 + 2];
	uint flags0 = v0 >> 24;
	uint flags1 = v1 >> 24;
	RTFace f;
	f.v0 = v0 & 0xffffff;
	f.v1 = v1 & 0xffffff;
	f.v2 = v2;
	f.material_flags = flags0 | (flags1 << 8);
	return f;
}

uint rt_mask_vtx_idx(uint vtx_idx)
{
	return vtx_idx & 0xffffff;
}

// ------------------------------------------------------------------------------------------------------------------

vec3 rt_barycentric_xyz(vec3 p, vec3 a, vec3 b, vec3 c)
{
	vec3 bc;
	vec3 v0 = b - a, v1 = c - a, v2 = p - a;
	float d00 = dot(v0, v0);
	float d01 = dot(v0, v1);
	float d11 = dot(v1, v1);
	float d20 = dot(v2, v0);
	float d21 = dot(v2, v1);
	float denom = d00 * d11 - d01 * d01;
	bc.y = (d11 * d20 - d01 * d21) / denom;
	bc.z = (d00 * d21 - d01 * d20) / denom;
	bc.x = 1.0f - bc.z - bc.y;
	return bc;
}

vec2 rt_barycentric_yz(vec3 p, vec3 a, vec3 b, vec3 c)
{
	vec3 bc;
	vec3 v0 = b - a, v1 = c - a, v2 = p - a;
	float d00 = dot(v0, v0);
	float d01 = dot(v0, v1);
	float d11 = dot(v1, v1);
	float d20 = dot(v2, v0);
	float d21 = dot(v2, v1);
	float denom = d00 * d11 - d01 * d01;
	bc.y = (d11 * d20 - d01 * d21) / denom;
	bc.z = (d00 * d21 - d01 * d20) / denom;
	return bc.yz;
}

// ------------------------------------------------------------------------------------------------------------------
// Try Min() for color and MaX() for occlusion. This should reduce light leaking (maybe)

void rt_store_voxel_color(uint cx, uint cy, uint cz, vec4 v)
{
	uint idx = rt_calculate_grid_cell_idx(cx, cy, cz);
	uint vi;
	float v_mag = ceil(length(v.xyz));
	v.xyz = v.xyz / v_mag;
	v_mag = min(127.0, v_mag);

	// encode
	vi = uint(v.x * 255.0) | (uint(v.y * 255.0) << 8) |( uint(v.z * 255.0) << 16);
	vi |= uint(v_mag) * (1 << 24); 
	vi |= 1 << 31; // marker that it was used, 1 bit
	uint prev_vi = atomicCompSwap(in_voxel_light_data[idx], 0, vi);
	if (prev_vi != 0)
		atomicMin(in_voxel_light_data[idx], vi);
}

void rt_store_voxel_color_emissive(uint cx, uint cy, uint cz, vec4 v)
{
	uint idx = rt_calculate_grid_cell_idx(cx, cy, cz);
	uint vi;
	float v_mag = ceil(length(v.xyz));
	v.xyz = v.xyz / v_mag;
	v_mag = min(127.0, v_mag);

	// encode
	vi = uint(v.x * 255.0) | (uint(v.y * 255.0) << 8) |( uint(v.z * 255.0) << 16);
	vi |= uint(v_mag) * (1 << 24); 
	vi |= 1 << 31; // marker that it was used, 1 bit
	atomicMax(in_voxel_light_data[idx], vi);
}

void rt_store_voxel_occlusion(uint cx, uint cy, uint cz, float v)
{
	uint idx = rt_calculate_grid_cell_idx(cx, cy, cz);
	atomicMax(in_voxel_occlusion_data[idx], uint(v * 1024.0));
}

#if 0
void rt_store_voxel_normal(uint cx, uint cy, uint cz, vec3 v)
{
	uint idx = rt_calculate_grid_cell_idx(cx, cy, cz);
	uint vi = encode_normal_32bit(v);
	atomicMax(in_voxel_normal_data.normal[idx], vi);
}
#endif

vec4 rt_voxel_color_decode(uint vi)
{
	vec4 v = vec4(0.0);

	if ((vi >> 31) > 0)
	{
		uint v_mag = (vi >> 24) & 0x7f;
		float v_mag_rcp = (1.0 / 255.0) * float(v_mag);

		v.x = float((vi >> 0) & 0xff) * v_mag_rcp;
		v.y = float((vi >> 8) & 0xff) * v_mag_rcp;
		v.z = float((vi >> 16) & 0xff) * v_mag_rcp;
		v.w = 1.0;
	}
	return v;
}

vec4 rt_get_voxel_color(uint cx, uint cy, uint cz)
{
	uint idx = rt_calculate_grid_cell_idx(cx, cy, cz);
	uint vi = in_voxel_light_data[idx];
	vec4 v = rt_voxel_color_decode(vi);
	return v;
}

vec4 rt_get_voxel_color(int idx)
{
	uint vi = in_voxel_light_data[idx];
	vec4 v = rt_voxel_color_decode(vi);
	return v;
}

float rt_get_voxel_occlusion(uint cx, uint cy, uint cz)
{
	uint idx = rt_calculate_grid_cell_idx(cx, cy, cz);
	uint vi = in_voxel_occlusion_data[idx];
	float v = float(vi) / 1024.0;
	return v;
}

float rt_get_voxel_occlusion(int idx)
{
	uint vi = in_voxel_occlusion_data[idx];
	float v = float(vi) / 1024.0;
	return v;
}

// ------------------------------------------------------------------------------------------------------------------

float rt_hash(vec2 p)
{
	float h = dot(p,vec2(127.1,311.7));
	return -1.0 + 2.0*fract(sin(h)*43758.5453123);
}

vec3 randomSpherePoint(vec3 rand)
{
	float ang1 = (rand.x + 1.0) * PI; // [-1..1) -> [0..2*PI)
	float u = rand.y; // [-1..1), cos and acos(2v-1) cancel each other out, so we arrive at [-1..1)
	float u2 = u * u;
	float sqrt1MinusU2 = sqrt(1.0 - u2);
	float x = sqrt1MinusU2 * cos(ang1);
	float y = sqrt1MinusU2 * sin(ang1);
	float z = u;
	return vec3(x, y, z);
}

vec3 randomHemispherePoint(vec3 rand, vec3 n)
{
	/**
	 * Generate random sphere point and swap vector along the normal, if it
	 * points to the wrong of the two hemispheres.
	 * This method provides a uniform distribution over the hemisphere, 
	 * provided that the sphere distribution is also uniform.
	*/
	vec3 v = randomSpherePoint(rand);
	return v * sign(dot(v, n));
}

#endif // RAYTRACE_COMMONS_H