Shtille's blog A development blog

Polyline performance

Overview

I decided to check whether a geometry shader provides better performance for polyline rendering than the traditional approach. In the test application I am going to render 3 segments as quads, so the code is going to be pretty simple.

Without geometry shader

The traditional way (only vertex and fragment shaders) has been implemented previously. Let’s recall it.

Application code

Data creation

bool OldQuadDrawer::CreateData(const PointArray& points)
{
	uint32_t num_points = static_cast<uint32_t>(points.size());
	if (num_points < 2) return false;
	uint32_t num_segments = num_points - 1;
	uint32_t total_segments = num_segments + 2;

	num_vertices_ = 4 * total_segments;
	vertices_array_ = new uint8_t[num_vertices_ * sizeof(Vertex)];
	Vertex* vertices = reinterpret_cast<Vertex*>(vertices_array_);

	num_indices_ = 6 * num_segments;
	index_size_ = sizeof(uint32_t);
	indices_array_ = new uint8_t[num_indices_ * index_size_];
	uint32_t* indices = reinterpret_cast<uint32_t*>(indices_array_);

	// Positions
	uint32_t n = 0;
	Point first_point, last_point;
	first_point[0] = points[0][0] + (points[0][0] - points[1][0]);
	first_point[1] = points[0][1] + (points[0][1] - points[1][1]);
	last_point[0] = points[num_points - 1][0] + (points[num_points - 1][0] - points[num_points - 2][0]);
	last_point[1] = points[num_points - 1][1] + (points[num_points - 1][1] - points[num_points - 2][1]);
	vertices[n++].position = first_point;
	vertices[n++].position = first_point;
	vertices[n++].position = points[0];
	vertices[n++].position = points[0];
	for (uint32_t i = 0; i < num_segments; ++i)
	{
		vertices[n++].position = points[i];
		vertices[n++].position = points[i];
		vertices[n++].position = points[i+1];
		vertices[n++].position = points[i+1];
	}
	vertices[n++].position = points[num_points - 1];
	vertices[n++].position = points[num_points - 1];
	vertices[n++].position = last_point;
	vertices[n++].position = last_point;

	// Texcoords
	n = 0;
	for (uint32_t i = 0; i < total_segments; ++i)
	{
		vertices[n++].texcoord = { 0.0f,  1.0f};
		vertices[n++].texcoord = { 0.0f, -1.0f};
		vertices[n++].texcoord = { 0.0f,  1.0f};
		vertices[n++].texcoord = { 0.0f, -1.0f};
	}

	// Indices
	n = 0;
	for (uint32_t i = 0; i < num_segments; ++i)
	{
		// 0-1-2
		indices[n++] = 4*i+0;
		indices[n++] = 4*i+1;
		indices[n++] = 4*i+2;
		// 2-1-3
		indices[n++] = 4*i+2;
		indices[n++] = 4*i+1;
		indices[n++] = 4*i+3;
	}

	return true;
}

Attributes layout

	// Attribute layout for the drawer without a geometry shader.
	// The position data is exposed through three attributes (prev/curr/next) that use the same
	// stride but different start offsets, plus one attribute for the texcoord.
	const GLsizei stride = sizeof(Vertex);
	const uint8_t* base = nullptr;
	const uint8_t* curr_offset = base + 4*stride; // skip the 4 leading vertices written before the first real segment in CreateData
	const uint8_t* prev_offset = curr_offset - 2*stride; // same data, read 2 vertices earlier
	const uint8_t* next_offset = curr_offset + 2*stride; // same data, read 2 vertices later
	const uint8_t* texcoord_offset = curr_offset + sizeof(Point); // assumes texcoord directly follows a Point-sized position inside Vertex - TODO confirm
	glVertexAttribPointer(0, 2, GL_FLOAT, GL_FALSE, stride, prev_offset); // vec2 a_prev
	glEnableVertexAttribArray(0);
	glVertexAttribPointer(1, 2, GL_FLOAT, GL_FALSE, stride, curr_offset); // vec2 a_curr
	glEnableVertexAttribArray(1);
	glVertexAttribPointer(2, 2, GL_FLOAT, GL_FALSE, stride, next_offset); // vec2 a_next
	glEnableVertexAttribArray(2);
	glVertexAttribPointer(3, 2, GL_FLOAT, GL_FALSE, stride, texcoord_offset); // vec2 a_texcoord;
	glEnableVertexAttribArray(3);

Rendering

// Draws the polyline quads as indexed triangles (vertex + fragment shader path).
void OldQuadDrawer::Render()
{
	ActivateShader();

	// One indexed draw call over num_indices_ 32-bit indices; the VAO is unbound afterwards.
	// NOTE(review): nullptr is presumably offset 0 into an element buffer attached to the VAO - confirm.
	glBindVertexArray(vertex_array_object_);
	glDrawElements(GL_TRIANGLES, num_indices_, GL_UNSIGNED_INT, nullptr);
	glBindVertexArray(0);

	DeactivateShader();
}

Shader code

Vertex shader

#version 330 core

// Per-vertex inputs: previous, current and next 2D positions and a 2D texcoord.
layout(location = 0) in vec2 a_prev;
layout(location = 1) in vec2 a_curr;
layout(location = 2) in vec2 a_next;
layout(location = 3) in vec2 a_texcoord;

// u_viewport: .xy is used as an offset and .zw as a scale by project()/unproject()
// (presumably viewport origin and size in pixels - confirm against the application code).
uniform vec4 u_viewport;
// u_width: line width in the screen units produced by project(); main() uses half of it.
uniform float u_width;

// Converts a clip-space position to screen coordinates:
// perspective divide, remap [-1, 1] to [0, 1], then scale by u_viewport.zw and offset by u_viewport.xy.
vec2 project(vec4 clip)
{
	vec2 ndc_xy = clip.xy / clip.w;
	vec2 unit = ndc_xy * 0.5 + vec2(0.5);
	return unit * u_viewport.zw + u_viewport.xy;
}
// Inverse of project() for x/y: maps screen coordinates back to clip space.
// x and y are multiplied by w again; z and w are passed through unchanged.
vec4 unproject(vec2 screen, float z, float w)
{
	vec2 unit = (screen - u_viewport.xy) / u_viewport.zw;
	vec2 ndc = (unit - vec2(0.5)) * 2.0;
	return vec4(ndc * w, z, w);
}

// Moves the current vertex in screen space by half the line width along the segment
// direction/normal (selected by a_texcoord) and converts the result back to clip space.
void main()
{
	// The 2D inputs are used directly as clip-space positions with z = 0 and w = 1.
	vec4 clip_curr = vec4(a_curr, 0.0, 1.0);

	vec2 curr = project(clip_curr);
	vec2 prev = project(vec4(a_prev, 0.0, 1.0));
	vec2 next = project(vec4(a_next, 0.0, 1.0));

	// Direction from prev to next in screen space and its perpendicular.
	vec2 direction = normalize(next - prev);
	vec2 normal = vec2(-direction.y, direction.x);

	float half_width = u_width * 0.5;
	vec2 along = direction * (a_texcoord.x * half_width);
	vec2 across = normal * (a_texcoord.y * half_width);

	gl_Position = unproject(curr + along + across, clip_curr.z, clip_curr.w);
}

Fragment shader

#version 330 core

// Single color output of the fragment shader.
out vec4 color;

// Writes a constant color for every fragment: red = 1.0, green = 0.0, blue = 0.0, alpha = 0.0.
// NOTE(review): alpha is 0.0, so the line would presumably be invisible if alpha blending were enabled - confirm.
void main()
{
	color = vec4(1.0, 0.0, 0.0, 0.0);
}

With geometry shader

The geometry shader approach is more memory-efficient: each polyline point is stored only once, and no index buffer is needed.

Application code

Data creation

bool QuadDrawer::CreateData(const PointArray& points)
{
	uint32_t num_points = static_cast<uint32_t>(points.size());
	if (num_points < 2) return false;

	num_vertices_ = num_points;
	vertices_array_ = new uint8_t[num_vertices_ * sizeof(Vertex)];
	Vertex* vertices = reinterpret_cast<Vertex*>(vertices_array_);

	for (uint32_t i = 0; i < num_vertices_; ++i)
	{
		Vertex& vertex = vertices[i];
		const Point& point = points[i];

		vertex.position = {point[0], point[1]};
	}

	return true;
}

Attributes layout

	// Attribute layout for the geometry-shader drawer: two attributes with the same stride,
	// the second one starting one vertex later, so vertex i also receives the position of vertex i+1.
	const GLsizei stride = sizeof(Vertex);
	const uint8_t* base = nullptr;
	const uint8_t* curr_offset = base; // position of the current vertex
	const uint8_t* next_offset = curr_offset + stride; // position of the following vertex
	glVertexAttribPointer(0, 2, GL_FLOAT, GL_FALSE, stride, curr_offset); // vec2 a_position_curr
	glEnableVertexAttribArray(0);
	glVertexAttribPointer(1, 2, GL_FLOAT, GL_FALSE, stride, next_offset); // vec2 a_position_next
	glEnableVertexAttribArray(1);

Rendering

// Draws the polyline as points; the geometry shader expands each point into a quad.
void QuadDrawer::Render()
{
	ActivateShader();

	// One point per segment: the last vertex is not drawn itself, it is only read
	// as the "next" position of the vertex before it.
	glBindVertexArray(vertex_array_object_);
	glDrawArrays(GL_POINTS, 0, num_vertices_-1);
	glBindVertexArray(0);

	DeactivateShader();
}

Shader code

Vertex shader

#version 330 core

// Per-vertex inputs: 2D position of this vertex and of the following one.
layout (location = 0) in vec2 a_position_curr;
layout (location = 1) in vec2 a_position_next;

// Data handed over to the geometry shader in addition to gl_Position.
out VS_OUT {
	vec4 position_next;
} vs_out;

// Pass-through: both positions are extended to vec4 with z = 0 and w = 1.
void main()
{
	gl_Position = vec4(a_position_curr, 0.0, 1.0);
	vs_out.position_next = vec4(a_position_next, 0.0, 1.0);
}

Geometry shader

#version 330 core

// Input primitive: a single point. Output: a triangle strip of at most 4 vertices.
layout (points) in;
layout (triangle_strip, max_vertices = 4) out;

// u_viewport: .xy is used as an offset and .zw as a scale by project()/unproject()
// (presumably viewport origin and size in pixels - confirm against the application code).
uniform vec4 u_viewport;
// u_pixel_width: quad width in the screen units produced by project(); build_quad() uses half of it.
uniform float u_pixel_width;

// Matches the VS_OUT block written by the vertex shader.
in VS_OUT {
	vec4 position_next;
} gs_in[];

// Converts a clip-space position to screen coordinates:
// perspective divide, remap [-1, 1] to [0, 1], then scale by u_viewport.zw and offset by u_viewport.xy.
vec2 project(vec4 clip)
{
    vec2 ndc_xy = clip.xy / clip.w;
    vec2 unit = ndc_xy * 0.5 + vec2(0.5);
    return unit * u_viewport.zw + u_viewport.xy;
}
// Inverse of project() for x/y: maps screen coordinates back to clip space.
// x and y are multiplied by w again; z and w are passed through unchanged.
vec4 unproject(vec2 screen, float z, float w)
{
    vec2 unit = (screen - u_viewport.xy) / u_viewport.zw;
    vec2 ndc = (unit - vec2(0.5)) * 2.0;
    return vec4(ndc * w, z, w);
}

// Emits one quad (4-vertex triangle strip) covering the segment current -> next.
// The quad is built in screen space: both end points are shifted by half of u_pixel_width
// to either side of the segment, then converted back to clip space.
void build_quad(vec4 current, vec4 next)
{
	vec2 from = project(current);
	vec2 to = project(next);

	// Perpendicular to the segment direction in screen space.
	vec2 along = normalize(to - from);
	vec2 normal = vec2(-along.y, along.x);

	float half_width = u_pixel_width * 0.5;
	vec2 side = normal * half_width;

	// Strip order: start +side, start -side, end +side, end -side.
	gl_Position = unproject(from + side, current.z, current.w);
	EmitVertex();
	gl_Position = unproject(from - side, current.z, current.w);
	EmitVertex();
	gl_Position = unproject(to + side, next.z, next.w);
	EmitVertex();
	gl_Position = unproject(to - side, next.z, next.w);
	EmitVertex();
	EndPrimitive();
}

// Expands the incoming point into a quad that reaches to the next position passed by the vertex shader.
void main()
{    
	build_quad(gl_in[0].gl_Position, gs_in[0].position_next);
}

Fragment shader

#version 330 core

// Single color output of the fragment shader.
out vec4 color;

// Writes a constant color for every fragment: red = 1.0, green = 0.0, blue = 0.0, alpha = 0.0.
// NOTE(review): alpha is 0.0, so the line would presumably be invisible if alpha blending were enabled - confirm.
void main()
{
	color = vec4(1.0, 0.0, 0.0, 0.0);
}

Performance capture

For performance capture we use a glBeginQuery/glEndQuery block around each draw call:

void TestIteration()
{
	poly::TimeElapsedQuery query;

	std::cout << "--- iteration begin" << std::endl;
	{
		query.Begin();
		old_drawer->Render();
		query.End();
		if (query.GetResult(true))
		{
			std::cout << "old took " << query.GetElapsedTime() << " ns" << std::endl;
		}
	}
	{
		query.Begin();
		new_drawer->Render();
		query.End();
		if (query.GetResult(true))
		{
			std::cout << "new took " << query.GetElapsedTime() << " ns" << std::endl;
		}
	}
	std::cout << "--- iteration end" << std::endl;
}

Results

For three iterations we get the following results:

--- iteration begin
old took 21268 ns
new took 13936 ns
--- iteration end
--- iteration begin
old took 19760 ns
new took 13312 ns
--- iteration end
--- iteration begin
old took 18616 ns
new took 13312 ns
--- iteration end

So the quad drawer with a geometry shader is about 1.5 times faster than the version without a geometry shader.

Conclusion

We can conclude that using a geometry shader provides better performance for polyline rendering, at least in this small test.