diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 9af325791..dcabeb2a0 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1245,6 +1245,7 @@ set (PCH_SOURCES
 	sound/musicformats/music_cd.cpp
 	sound/musicformats/music_dumb.cpp
 	sound/musicformats/music_gme.cpp
+	sound/musicformats/music_libsndfile.cpp
 	sound/musicformats/music_mus_midiout.cpp
 	sound/musicformats/music_smf_midiout.cpp
 	sound/musicformats/music_hmi_midiout.cpp
diff --git a/src/gl/system/gl_swframebuffer.cpp b/src/gl/system/gl_swframebuffer.cpp
index 3bb0fd8d8..ee5c0d58d 100644
--- a/src/gl/system/gl_swframebuffer.cpp
+++ b/src/gl/system/gl_swframebuffer.cpp
@@ -1095,7 +1095,7 @@ bool OpenGLSWFrameBuffer::IsValid()
 
 bool OpenGLSWFrameBuffer::Lock(bool buffered)
 {
-	if (LockCount++ > 0)
+	if (m_Lock++ > 0)
 	{
 		return false;
 	}
@@ -1130,16 +1130,16 @@ bool OpenGLSWFrameBuffer::Lock(bool buffered)
 
 void OpenGLSWFrameBuffer::Unlock()
 {
-	if (LockCount == 0)
+	if (m_Lock == 0)
 	{
 		return;
 	}
 
-	if (UpdatePending && LockCount == 1)
+	if (UpdatePending && m_Lock == 1)
 	{
 		Update();
 	}
-	else if (--LockCount == 0)
+	else if (--m_Lock == 0)
 	{
 		Buffer = nullptr;
 	}
@@ -1171,13 +1171,13 @@ void OpenGLSWFrameBuffer::Update()
 		return;
 	}
 
-	if (LockCount != 1)
+	if (m_Lock != 1)
 	{
 		I_FatalError("Framebuffer must have exactly 1 lock to be updated");
-		if (LockCount > 0)
+		if (m_Lock > 0)
 		{
 			UpdatePending = true;
-			--LockCount;
+			--m_Lock;
 		}
 		return;
 	}
@@ -1220,7 +1220,7 @@ void OpenGLSWFrameBuffer::Update()
 	BlitCycles.Clock();
 #endif
 
-	LockCount = 0;
+	m_Lock = 0;
 	Draw3DPart(In2D <= 1);
 	if (In2D == 0)
 	{
@@ -1276,7 +1276,7 @@ void OpenGLSWFrameBuffer::Flip()
 
 bool OpenGLSWFrameBuffer::PaintToWindow()
 {
-	if (LockCount != 0)
+	if (m_Lock != 0)
 	{
 		return false;
 	}
@@ -1662,7 +1662,7 @@ void OpenGLSWFrameBuffer::GetScreenshotBuffer(const uint8_t *&buffer, int &pitch
 
 void OpenGLSWFrameBuffer::ReleaseScreenshotBuffer()
 {
-	if (LockCount > 0)
+	if (m_Lock > 0)
 	{
 		Super::ReleaseScreenshotBuffer();
 	}
@@ -2410,7 +2410,7 @@ bool OpenGLSWFrameBuffer::OpenGLPal::Update()
 
 bool OpenGLSWFrameBuffer::Begin2D(bool copy3d)
 {
-	ClearClipRect();
+	Super::Begin2D(copy3d);
 	if (!Accel2D)
 	{
 		return false;
diff --git a/src/polyrenderer/drawers/poly_buffer.cpp b/src/polyrenderer/drawers/poly_buffer.cpp
index e8d37af4c..143d16bbd 100644
--- a/src/polyrenderer/drawers/poly_buffer.cpp
+++ b/src/polyrenderer/drawers/poly_buffer.cpp
@@ -48,7 +48,8 @@ void PolySubsectorGBuffer::Resize(int newwidth, int newheight)
 {
 	width = newwidth;
 	height = newheight;
-	values.resize(width * height);
+	int count = BlockWidth() * BlockHeight();
+	values.resize(count * 64);
 }
 
 /////////////////////////////////////////////////////////////////////////////
diff --git a/src/polyrenderer/drawers/poly_buffer.h b/src/polyrenderer/drawers/poly_buffer.h
index c272a7710..a376cec46 100644
--- a/src/polyrenderer/drawers/poly_buffer.h
+++ b/src/polyrenderer/drawers/poly_buffer.h
@@ -33,6 +33,8 @@ public:
 	void Resize(int newwidth, int newheight);
 	int Width() const { return width; }
 	int Height() const { return height; }
+	int BlockWidth() const { return (width + 7) / 8; }
+	int BlockHeight() const { return (height + 7) / 8; }
 	uint32_t *Values() { return values.data(); }
 
 private:
diff --git a/src/polyrenderer/drawers/screen_triangle.cpp b/src/polyrenderer/drawers/screen_triangle.cpp
index ea34c175c..f8c300dc7 100644
--- a/src/polyrenderer/drawers/screen_triangle.cpp
+++ b/src/polyrenderer/drawers/screen_triangle.cpp
@@ -96,6 +96,15 @@ private:
 	__m128i mFDX12;
 	__m128i mFDX23;
 	__m128i mFDX31;
+	__m128i mC1;
+	__m128i mC2;
+	__m128i mC3;
+	__m128i mDX12;
+	__m128i mDY12;
+	__m128i mDX23;
+	__m128i mDY23;
+	__m128i mDX31;
+	__m128i mDY31;
 #endif
 
 	void CoverageTest();
@@ -124,7 +133,7 @@ TriangleBlock::TriangleBlock(const TriDrawTriangleArgs *args)
 
 	subsectorGBuffer = args->subsectorGBuffer;
 	subsectorDepth = args->uniforms->SubsectorDepth();
-	subsectorPitch = args->pitch;
+	subsectorPitch = args->stencilPitch;
 
 	// 28.4 fixed-point coordinates
 #ifdef NO_SSE
@@ -203,6 +212,15 @@ TriangleBlock::TriangleBlock(const TriDrawTriangleArgs *args)
 	mFDX12 = _mm_set1_epi32(FDX12);
 	mFDX23 = _mm_set1_epi32(FDX23);
 	mFDX31 = _mm_set1_epi32(FDX31);
+	mC1 = _mm_set1_epi32(C1);
+	mC2 = _mm_set1_epi32(C2);
+	mC3 = _mm_set1_epi32(C3);
+	mDX12 = _mm_set1_epi32(DX12);
+	mDY12 = _mm_set1_epi32(DY12);
+	mDX23 = _mm_set1_epi32(DX23);
+	mDY23 = _mm_set1_epi32(DY23);
+	mDX31 = _mm_set1_epi32(DX31);
+	mDY31 = _mm_set1_epi32(DY31);
 #endif
 }
 
@@ -270,29 +288,24 @@ void TriangleBlock::Loop(const TriDrawTriangleArgs *args, WorkerThreadData *thre
 
 void TriangleBlock::SubsectorTest()
 {
-	uint32_t *subsector = subsectorGBuffer + X + Y * subsectorPitch;
+	int block = (X >> 3) + (Y >> 3) * subsectorPitch;
+	uint32_t *subsector = subsectorGBuffer + block * 64;
 	uint32_t mask0 = 0;
 	uint32_t mask1 = 0;
 
-	for (int iy = 0; iy < 4; iy++)
+	for (int i = 0; i < 32; i++)
 	{
-		for (int ix = 0; ix < q; ix++)
-		{
-			bool covered = subsector[ix] >= subsectorDepth;
-			mask0 <<= 1;
-			mask0 |= (uint32_t)covered;
-		}
-		subsector += subsectorPitch;
+		bool covered = *subsector >= subsectorDepth;
+		mask0 <<= 1;
+		mask0 |= (uint32_t)covered;
+		subsector++;
 	}
-	for (int iy = 4; iy < q; iy++)
+	for (int i = 0; i < 32; i++)
 	{
-		for (int ix = 0; ix < q; ix++)
-		{
-			bool covered = subsector[ix] >= subsectorDepth;
-			mask1 <<= 1;
-			mask1 |= (uint32_t)covered;
-		}
-		subsector += subsectorPitch;
+		bool covered = *subsector >= subsectorDepth;
+		mask1 <<= 1;
+		mask1 |= (uint32_t)covered;
+		subsector++;
 	}
 
 	Mask0 = Mask0 & mask0;
@@ -303,27 +316,24 @@ void TriangleBlock::SubsectorTest()
 
 void TriangleBlock::SubsectorTest()
 {
-	uint32_t *subsector = subsectorGBuffer + X + Y * subsectorPitch;
+	int block = (X >> 3) + (Y >> 3) * subsectorPitch;
+	uint32_t *subsector = subsectorGBuffer + block * 64;
 	uint32_t mask0 = 0;
 	uint32_t mask1 = 0;
 	__m128i msubsectorDepth = _mm_set1_epi32(subsectorDepth);
 	__m128i mnotxor = _mm_set1_epi32(0xffffffff);
 
-	for (int iy = 0; iy < 4; iy++)
+	for (int iy = 0; iy < 8; iy++)
 	{
 		mask0 <<= 4;
 		mask0 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(_mm_xor_si128(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)subsector), msubsectorDepth), mnotxor), _MM_SHUFFLE(0, 1, 2, 3))));
-		mask0 <<= 4;
-		mask0 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(_mm_xor_si128(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(subsector + 4)), msubsectorDepth), mnotxor), _MM_SHUFFLE(0, 1, 2, 3))));
-		subsector += subsectorPitch;
+		subsector += 4;
 	}
-	for (int iy = 4; iy < q; iy++)
+	for (int iy = 0; iy < 8; iy++)
 	{
 		mask1 <<= 4;
 		mask1 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(_mm_xor_si128(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)subsector), msubsectorDepth), mnotxor), _MM_SHUFFLE(0, 1, 2, 3))));
-		mask1 <<= 4;
-		mask1 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(_mm_xor_si128(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(subsector + 4)), msubsectorDepth), mnotxor), _MM_SHUFFLE(0, 1, 2, 3))));
-		subsector += subsectorPitch;
+		subsector += 4;
 	}
 
 	Mask0 = Mask0 & mask0;
@@ -432,30 +442,44 @@ void TriangleBlock::StencilEqualTest()
 		uint32_t mask0 = 0;
 		uint32_t mask1 = 0;
 
-		for (int iy = 0; iy < 4; iy++)
+		for (int iy = 0; iy < 2; iy++)
 		{
-			__m128i mstencilBlock = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(stencilBlock + iy * 8)), _mm_setzero_si128());
-			__m128i mstencilTest = _mm_cmpeq_epi16(mstencilBlock, mstencilTestValue);
+			__m128i mstencilBlock = _mm_loadu_si128((const __m128i *)stencilBlock);
+
+			__m128i mstencilTest = _mm_cmpeq_epi16(_mm_unpacklo_epi8(mstencilBlock, _mm_setzero_si128()), mstencilTestValue);
 			__m128i mstencilTest0 = _mm_unpacklo_epi16(mstencilTest, mstencilTest);
 			__m128i mstencilTest1 = _mm_unpackhi_epi16(mstencilTest, mstencilTest);
+			__m128i first = _mm_packs_epi32(_mm_shuffle_epi32(mstencilTest1, _MM_SHUFFLE(0, 1, 2, 3)), _mm_shuffle_epi32(mstencilTest0, _MM_SHUFFLE(0, 1, 2, 3)));
 
-			mask0 <<= 4;
-			mask0 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mstencilTest0, _MM_SHUFFLE(0, 1, 2, 3))));
-			mask0 <<= 4;
-			mask0 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mstencilTest1, _MM_SHUFFLE(0, 1, 2, 3))));
+			mstencilTest = _mm_cmpeq_epi16(_mm_unpackhi_epi8(mstencilBlock, _mm_setzero_si128()), mstencilTestValue);
+			mstencilTest0 = _mm_unpacklo_epi16(mstencilTest, mstencilTest);
+			mstencilTest1 = _mm_unpackhi_epi16(mstencilTest, mstencilTest);
+			__m128i second = _mm_packs_epi32(_mm_shuffle_epi32(mstencilTest1, _MM_SHUFFLE(0, 1, 2, 3)), _mm_shuffle_epi32(mstencilTest0, _MM_SHUFFLE(0, 1, 2, 3)));
+
+			mask0 <<= 16;
+			mask0 |= _mm_movemask_epi8(_mm_packs_epi16(second, first));
+
+			stencilBlock += 16;
 		}
 
-		for (int iy = 4; iy < q; iy++)
+		for (int iy = 0; iy < 2; iy++)
 		{
-			__m128i mstencilBlock = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(stencilBlock + iy * 8)), _mm_setzero_si128());
-			__m128i mstencilTest = _mm_cmpeq_epi16(mstencilBlock, mstencilTestValue);
+			__m128i mstencilBlock = _mm_loadu_si128((const __m128i *)stencilBlock);
+
+			__m128i mstencilTest = _mm_cmpeq_epi16(_mm_unpacklo_epi8(mstencilBlock, _mm_setzero_si128()), mstencilTestValue);
 			__m128i mstencilTest0 = _mm_unpacklo_epi16(mstencilTest, mstencilTest);
 			__m128i mstencilTest1 = _mm_unpackhi_epi16(mstencilTest, mstencilTest);
+			__m128i first = _mm_packs_epi32(_mm_shuffle_epi32(mstencilTest1, _MM_SHUFFLE(0, 1, 2, 3)), _mm_shuffle_epi32(mstencilTest0, _MM_SHUFFLE(0, 1, 2, 3)));
 
-			mask1 <<= 4;
-			mask1 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mstencilTest0, _MM_SHUFFLE(0, 1, 2, 3))));
-			mask1 <<= 4;
-			mask1 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mstencilTest1, _MM_SHUFFLE(0, 1, 2, 3))));
+			mstencilTest = _mm_cmpeq_epi16(_mm_unpackhi_epi8(mstencilBlock, _mm_setzero_si128()), mstencilTestValue);
+			mstencilTest0 = _mm_unpacklo_epi16(mstencilTest, mstencilTest);
+			mstencilTest1 = _mm_unpackhi_epi16(mstencilTest, mstencilTest);
+			__m128i second = _mm_packs_epi32(_mm_shuffle_epi32(mstencilTest1, _MM_SHUFFLE(0, 1, 2, 3)), _mm_shuffle_epi32(mstencilTest0, _MM_SHUFFLE(0, 1, 2, 3)));
+
+			mask1 <<= 16;
+			mask1 |= _mm_movemask_epi8(_mm_packs_epi16(second, first));
+
+			stencilBlock += 16;
 		}
 
 		Mask0 = Mask0 & mask0;
@@ -617,50 +641,46 @@ void TriangleBlock::CoverageTest()
 	int y0 = Y << 4;
 	int y1 = (Y + q - 1) << 4;
 
+	__m128i mY = _mm_set_epi32(y0, y0, y1, y1);
+	__m128i mX = _mm_set_epi32(x0, x0, x1, x1);
+
 	// Evaluate half-space functions
-	bool a00 = C1 + DX12 * y0 - DY12 * x0 > 0;
-	bool a10 = C1 + DX12 * y0 - DY12 * x1 > 0;
-	bool a01 = C1 + DX12 * y1 - DY12 * x0 > 0;
-	bool a11 = C1 + DX12 * y1 - DY12 * x1 > 0;
-	int a = (a00 << 0) | (a10 << 1) | (a01 << 2) | (a11 << 3);
+	__m128i mCY1 = _mm_sub_epi32(
+		_mm_add_epi32(mC1, _mm_shuffle_epi32(_mm_mul_epu32(mDX12, mY), _MM_SHUFFLE(0, 0, 2, 2))),
+		_mm_shuffle_epi32(_mm_mul_epu32(mDY12, mX), _MM_SHUFFLE(0, 2, 0, 2)));
+	__m128i mA = _mm_cmpgt_epi32(mCY1, _mm_setzero_si128());
 
-	bool b00 = C2 + DX23 * y0 - DY23 * x0 > 0;
-	bool b10 = C2 + DX23 * y0 - DY23 * x1 > 0;
-	bool b01 = C2 + DX23 * y1 - DY23 * x0 > 0;
-	bool b11 = C2 + DX23 * y1 - DY23 * x1 > 0;
-	int b = (b00 << 0) | (b10 << 1) | (b01 << 2) | (b11 << 3);
+	__m128i mCY2 = _mm_sub_epi32(
+		_mm_add_epi32(mC2, _mm_shuffle_epi32(_mm_mul_epu32(mDX23, mY), _MM_SHUFFLE(0, 0, 2, 2))),
+		_mm_shuffle_epi32(_mm_mul_epu32(mDY23, mX), _MM_SHUFFLE(0, 2, 0, 2)));
+	__m128i mB = _mm_cmpgt_epi32(mCY2, _mm_setzero_si128());
 
-	bool c00 = C3 + DX31 * y0 - DY31 * x0 > 0;
-	bool c10 = C3 + DX31 * y0 - DY31 * x1 > 0;
-	bool c01 = C3 + DX31 * y1 - DY31 * x0 > 0;
-	bool c11 = C3 + DX31 * y1 - DY31 * x1 > 0;
-	int c = (c00 << 0) | (c10 << 1) | (c01 << 2) | (c11 << 3);
+	__m128i mCY3 = _mm_sub_epi32(
+		_mm_add_epi32(mC3, _mm_shuffle_epi32(_mm_mul_epu32(mDX31, mY), _MM_SHUFFLE(0, 0, 2, 2))),
+		_mm_shuffle_epi32(_mm_mul_epu32(mDY31, mX), _MM_SHUFFLE(0, 2, 0, 2)));
+	__m128i mC = _mm_cmpgt_epi32(mCY3, _mm_setzero_si128());
 
-	if (a == 0 || b == 0 || c == 0) // Skip block when outside an edge
+	int abc = _mm_movemask_epi8(_mm_packs_epi16(_mm_packs_epi32(mA, mB), _mm_packs_epi32(mC, _mm_setzero_si128())));
+
+	if ((abc & 0xf) == 0 || (abc & 0xf0) == 0 || (abc & 0xf00) == 0) // Skip block when outside an edge
 	{
 		Mask0 = 0;
 		Mask1 = 0;
 	}
-	else if (a == 0xf && b == 0xf && c == 0xf) // Accept whole block when totally covered
+	else if (abc == 0xfff) // Accept whole block when totally covered
 	{
 		Mask0 = 0xffffffff;
 		Mask1 = 0xffffffff;
 	}
 	else // Partially covered block
 	{
-		x0 = X << 4;
-		x1 = (X + q - 1) << 4;
-		int CY1 = C1 + DX12 * y0 - DY12 * x0;
-		int CY2 = C2 + DX23 * y0 - DY23 * x0;
-		int CY3 = C3 + DX31 * y0 - DY31 * x0;
-
 		uint32_t mask0 = 0;
 		uint32_t mask1 = 0;
 
-		__m128i mCY1 = _mm_sub_epi32(_mm_set1_epi32(CY1), mFDY12Offset);
-		__m128i mCY2 = _mm_sub_epi32(_mm_set1_epi32(CY2), mFDY23Offset);
-		__m128i mCY3 = _mm_sub_epi32(_mm_set1_epi32(CY3), mFDY31Offset);
-		for (int iy = 0; iy < 4; iy++)
+		mCY1 = _mm_sub_epi32(_mm_shuffle_epi32(mCY1, _MM_SHUFFLE(0, 0, 0, 0)), mFDY12Offset);
+		mCY2 = _mm_sub_epi32(_mm_shuffle_epi32(mCY2, _MM_SHUFFLE(0, 0, 0, 0)), mFDY23Offset);
+		mCY3 = _mm_sub_epi32(_mm_shuffle_epi32(mCY3, _MM_SHUFFLE(0, 0, 0, 0)), mFDY31Offset);
+		for (int iy = 0; iy < 2; iy++)
 		{
 			__m128i mtest0 = _mm_cmpgt_epi32(mCY1, _mm_setzero_si128());
 			mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY2, _mm_setzero_si128()), mtest0);
@@ -668,18 +688,27 @@ void TriangleBlock::CoverageTest()
 			__m128i mtest1 = _mm_cmpgt_epi32(_mm_sub_epi32(mCY1, mFDY12x4), _mm_setzero_si128());
 			mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY2, mFDY23x4), _mm_setzero_si128()), mtest1);
 			mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY3, mFDY31x4), _mm_setzero_si128()), mtest1);
-
 			mCY1 = _mm_add_epi32(mCY1, mFDX12);
 			mCY2 = _mm_add_epi32(mCY2, mFDX23);
 			mCY3 = _mm_add_epi32(mCY3, mFDX31);
+			__m128i first = _mm_packs_epi32(_mm_shuffle_epi32(mtest1, _MM_SHUFFLE(0, 1, 2, 3)), _mm_shuffle_epi32(mtest0, _MM_SHUFFLE(0, 1, 2, 3)));
 
-			mask0 <<= 4;
-			mask0 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mtest0, _MM_SHUFFLE(0, 1, 2, 3))));
-			mask0 <<= 4;
-			mask0 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mtest1, _MM_SHUFFLE(0, 1, 2, 3))));
+			mtest0 = _mm_cmpgt_epi32(mCY1, _mm_setzero_si128());
+			mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY2, _mm_setzero_si128()), mtest0);
+			mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY3, _mm_setzero_si128()), mtest0);
+			mtest1 = _mm_cmpgt_epi32(_mm_sub_epi32(mCY1, mFDY12x4), _mm_setzero_si128());
+			mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY2, mFDY23x4), _mm_setzero_si128()), mtest1);
+			mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY3, mFDY31x4), _mm_setzero_si128()), mtest1);
+			mCY1 = _mm_add_epi32(mCY1, mFDX12);
+			mCY2 = _mm_add_epi32(mCY2, mFDX23);
+			mCY3 = _mm_add_epi32(mCY3, mFDX31);
+			__m128i second = _mm_packs_epi32(_mm_shuffle_epi32(mtest1, _MM_SHUFFLE(0, 1, 2, 3)), _mm_shuffle_epi32(mtest0, _MM_SHUFFLE(0, 1, 2, 3)));
+
+			mask0 <<= 16;
+			mask0 |= _mm_movemask_epi8(_mm_packs_epi16(second, first));
 		}
 
-		for (int iy = 4; iy < q; iy++)
+		for (int iy = 0; iy < 2; iy++)
 		{
 			__m128i mtest0 = _mm_cmpgt_epi32(mCY1, _mm_setzero_si128());
 			mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY2, _mm_setzero_si128()), mtest0);
@@ -687,15 +716,24 @@ void TriangleBlock::CoverageTest()
 			__m128i mtest1 = _mm_cmpgt_epi32(_mm_sub_epi32(mCY1, mFDY12x4), _mm_setzero_si128());
 			mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY2, mFDY23x4), _mm_setzero_si128()), mtest1);
 			mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY3, mFDY31x4), _mm_setzero_si128()), mtest1);
-
 			mCY1 = _mm_add_epi32(mCY1, mFDX12);
 			mCY2 = _mm_add_epi32(mCY2, mFDX23);
 			mCY3 = _mm_add_epi32(mCY3, mFDX31);
+			__m128i first = _mm_packs_epi32(_mm_shuffle_epi32(mtest1, _MM_SHUFFLE(0, 1, 2, 3)), _mm_shuffle_epi32(mtest0, _MM_SHUFFLE(0, 1, 2, 3)));
 
-			mask1 <<= 4;
-			mask1 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mtest0, _MM_SHUFFLE(0, 1, 2, 3))));
-			mask1 <<= 4;
-			mask1 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mtest1, _MM_SHUFFLE(0, 1, 2, 3))));
+			mtest0 = _mm_cmpgt_epi32(mCY1, _mm_setzero_si128());
+			mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY2, _mm_setzero_si128()), mtest0);
+			mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY3, _mm_setzero_si128()), mtest0);
+			mtest1 = _mm_cmpgt_epi32(_mm_sub_epi32(mCY1, mFDY12x4), _mm_setzero_si128());
+			mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY2, mFDY23x4), _mm_setzero_si128()), mtest1);
+			mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY3, mFDY31x4), _mm_setzero_si128()), mtest1);
+			mCY1 = _mm_add_epi32(mCY1, mFDX12);
+			mCY2 = _mm_add_epi32(mCY2, mFDX23);
+			mCY3 = _mm_add_epi32(mCY3, mFDX31);
+			__m128i second = _mm_packs_epi32(_mm_shuffle_epi32(mtest1, _MM_SHUFFLE(0, 1, 2, 3)), _mm_shuffle_epi32(mtest0, _MM_SHUFFLE(0, 1, 2, 3)));
+
+			mask1 <<= 16;
+			mask1 |= _mm_movemask_epi8(_mm_packs_epi16(second, first));
 		}
 
 		Mask0 = mask0;
@@ -755,47 +793,98 @@ void TriangleBlock::StencilWrite()
 	}
 }
 
+#ifdef NO_SSE
+
 void TriangleBlock::SubsectorWrite()
 {
-	auto pitch = subsectorPitch;
-	uint32_t *subsector = subsectorGBuffer + X + Y * pitch;
+	int block = (X >> 3) + (Y >> 3) * subsectorPitch;
+	uint32_t *subsector = subsectorGBuffer + block * 64;
 
 	if (Mask0 == 0xffffffff && Mask1 == 0xffffffff)
 	{
-		for (int y = 0; y < 8; y++)
+		for (int i = 0; i < 64; i++)
 		{
-			for (int x = 0; x < 8; x++)
-				subsector[x] = subsectorDepth;
-			subsector += pitch;
+			*(subsector++) = subsectorDepth;
 		}
 	}
 	else
 	{
 		uint32_t mask0 = Mask0;
 		uint32_t mask1 = Mask1;
-		for (int y = 0; y < 4; y++)
+		for (int i = 0; i < 32; i++)
 		{
-			for (int x = 0; x < 8; x++)
-			{
-				if (mask0 & (1 << 31))
-					subsector[x] = subsectorDepth;
-				mask0 <<= 1;
-			}
-			subsector += pitch;
+			if (mask0 & (1 << 31))
+				*subsector = subsectorDepth;
+			mask0 <<= 1;
+			subsector++;
 		}
-		for (int y = 4; y < 8; y++)
+		for (int i = 0; i < 32; i++)
 		{
-			for (int x = 0; x < 8; x++)
-			{
-				if (mask1 & (1 << 31))
-					subsector[x] = subsectorDepth;
-				mask1 <<= 1;
-			}
-			subsector += pitch;
+			if (mask1 & (1 << 31))
+				*subsector = subsectorDepth;
+			mask1 <<= 1;
+			subsector++;
 		}
 	}
 }
 
+#else
+
+void TriangleBlock::SubsectorWrite()
+{
+	int block = (X >> 3) + (Y >> 3) * subsectorPitch;
+	uint32_t *subsector = subsectorGBuffer + block * 64;
+	__m128i msubsectorDepth = _mm_set1_epi32(subsectorDepth);
+
+	if (Mask0 == 0xffffffff && Mask1 == 0xffffffff)
+	{
+		_mm_storeu_si128((__m128i*)subsector, msubsectorDepth); subsector += 4;
+		_mm_storeu_si128((__m128i*)subsector, msubsectorDepth); subsector += 4;
+		_mm_storeu_si128((__m128i*)subsector, msubsectorDepth); subsector += 4;
+		_mm_storeu_si128((__m128i*)subsector, msubsectorDepth); subsector += 4;
+		_mm_storeu_si128((__m128i*)subsector, msubsectorDepth); subsector += 4;
+		_mm_storeu_si128((__m128i*)subsector, msubsectorDepth); subsector += 4;
+		_mm_storeu_si128((__m128i*)subsector, msubsectorDepth); subsector += 4;
+		_mm_storeu_si128((__m128i*)subsector, msubsectorDepth); subsector += 4;
+		_mm_storeu_si128((__m128i*)subsector, msubsectorDepth); subsector += 4;
+		_mm_storeu_si128((__m128i*)subsector, msubsectorDepth); subsector += 4;
+		_mm_storeu_si128((__m128i*)subsector, msubsectorDepth); subsector += 4;
+		_mm_storeu_si128((__m128i*)subsector, msubsectorDepth); subsector += 4;
+		_mm_storeu_si128((__m128i*)subsector, msubsectorDepth); subsector += 4;
+		_mm_storeu_si128((__m128i*)subsector, msubsectorDepth); subsector += 4;
+		_mm_storeu_si128((__m128i*)subsector, msubsectorDepth); subsector += 4;
+		_mm_storeu_si128((__m128i*)subsector, msubsectorDepth);
+	}
+	else
+	{
+		__m128i mxormask = _mm_set1_epi32(0xffffffff);
+		__m128i topfour = _mm_setr_epi32(1 << 31, 1 << 30, 1 << 29, 1 << 28);
+
+		__m128i mmask0 = _mm_set1_epi32(Mask0);
+		__m128i mmask1 = _mm_set1_epi32(Mask1);
+
+		_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); mmask0 = _mm_slli_epi32(mmask0, 4); subsector += 4;
+		_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); mmask0 = _mm_slli_epi32(mmask0, 4); subsector += 4;
+		_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); mmask0 = _mm_slli_epi32(mmask0, 4); subsector += 4;
+		_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); mmask0 = _mm_slli_epi32(mmask0, 4); subsector += 4;
+		_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); mmask0 = _mm_slli_epi32(mmask0, 4); subsector += 4;
+		_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); mmask0 = _mm_slli_epi32(mmask0, 4); subsector += 4;
+		_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); mmask0 = _mm_slli_epi32(mmask0, 4); subsector += 4;
+		_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); subsector += 4;
+
+		_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); mmask1 = _mm_slli_epi32(mmask1, 4); subsector += 4;
+		_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); mmask1 = _mm_slli_epi32(mmask1, 4); subsector += 4;
+		_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); mmask1 = _mm_slli_epi32(mmask1, 4); subsector += 4;
+		_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); mmask1 = _mm_slli_epi32(mmask1, 4); subsector += 4;
+		_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); mmask1 = _mm_slli_epi32(mmask1, 4); subsector += 4;
+		_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); mmask1 = _mm_slli_epi32(mmask1, 4); subsector += 4;
+		_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); mmask1 = _mm_slli_epi32(mmask1, 4); subsector += 4;
+		_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); subsector += 4;
+	}
+}
+
+#endif
+
 void ScreenTriangle::Draw(const TriDrawTriangleArgs *args, WorkerThreadData *thread)
 {
 	TriangleBlock block(args);
diff --git a/src/sound/i_music.cpp b/src/sound/i_music.cpp
index bf3ca81de..e00fdae4a 100644
--- a/src/sound/i_music.cpp
+++ b/src/sound/i_music.cpp
@@ -488,6 +488,11 @@ retry_as_sndsys:
 	{
 		info = MOD_OpenSong(*reader);
 	}
+	if (info == nullptr)
+	{
+		info = SndFile_OpenSong(*reader);
+		if (info != nullptr) reader = nullptr;
+	}
 
     if (info == NULL)
     {
diff --git a/src/sound/i_musicinterns.h b/src/sound/i_musicinterns.h
index ba69741a5..227464c11 100644
--- a/src/sound/i_musicinterns.h
+++ b/src/sound/i_musicinterns.h
@@ -672,6 +672,7 @@ MusInfo *MOD_OpenSong(FileReader &reader);
 
 const char *GME_CheckFormat(uint32_t header);
 MusInfo *GME_OpenSong(FileReader &reader, const char *fmt);
+MusInfo *SndFile_OpenSong(FileReader &fr);
 
 // --------------------------------------------------------------------------
 
diff --git a/src/sound/i_sound.h b/src/sound/i_sound.h
index cd58c6f64..5f1917d68 100644
--- a/src/sound/i_sound.h
+++ b/src/sound/i_sound.h
@@ -160,8 +160,7 @@ public:
 
 	virtual MIDIDevice* CreateMIDIDevice() const = 0;
 
-protected:
-    virtual SoundDecoder *CreateDecoder(FileReader *reader);
+    static SoundDecoder *CreateDecoder(FileReader *reader);
 };
 
 extern SoundRenderer *GSnd;
diff --git a/src/sound/i_soundinternal.h b/src/sound/i_soundinternal.h
index 786ff5dca..bb154a195 100644
--- a/src/sound/i_soundinternal.h
+++ b/src/sound/i_soundinternal.h
@@ -132,7 +132,7 @@ struct SoundDecoder
 
     virtual size_t read(char *buffer, size_t bytes) = 0;
     virtual TArray<char> readAll();
-    virtual bool seek(size_t ms_offset) = 0;
+    virtual bool seek(size_t ms_offset, bool ms) = 0;
     virtual size_t getSampleOffset() = 0;
     virtual size_t getSampleLength() { return 0; }
 
diff --git a/src/sound/mpg123_decoder.cpp b/src/sound/mpg123_decoder.cpp
index 605970bc9..1aa5a0e2f 100644
--- a/src/sound/mpg123_decoder.cpp
+++ b/src/sound/mpg123_decoder.cpp
@@ -134,14 +134,14 @@ size_t MPG123Decoder::read(char *buffer, size_t bytes)
     return amt;
 }
 
-bool MPG123Decoder::seek(size_t ms_offset)
+bool MPG123Decoder::seek(size_t ms_offset, bool ms)
 {
     int enc, channels;
     long srate;
 
     if(mpg123_getformat(MPG123, &srate, &channels, &enc) == MPG123_OK)
     {
-        size_t smp_offset = (size_t)((double)ms_offset / 1000. * srate);
+        size_t smp_offset = ms? (size_t)((double)ms_offset / 1000. * srate) : ms_offset;
         if(mpg123_seek(MPG123, (off_t)smp_offset, SEEK_SET) >= 0)
         {
             Done = false;
diff --git a/src/sound/mpg123_decoder.h b/src/sound/mpg123_decoder.h
index 59e1df2ca..e1c786888 100644
--- a/src/sound/mpg123_decoder.h
+++ b/src/sound/mpg123_decoder.h
@@ -16,7 +16,7 @@ struct MPG123Decoder : public SoundDecoder
     virtual void getInfo(int *samplerate, ChannelConfig *chans, SampleType *type);
 
     virtual size_t read(char *buffer, size_t bytes);
-    virtual bool seek(size_t ms_offset);
+    virtual bool seek(size_t ms_offset, bool ms);
     virtual size_t getSampleOffset();
     virtual size_t getSampleLength();
 
diff --git a/src/sound/musicformats/music_libsndfile.cpp b/src/sound/musicformats/music_libsndfile.cpp
new file mode 100644
index 000000000..a338d344f
--- /dev/null
+++ b/src/sound/musicformats/music_libsndfile.cpp
@@ -0,0 +1,341 @@
+/*
+** music_libsndfile.cpp
+** Uses libsndfile for streaming music formats
+**
+**---------------------------------------------------------------------------
+** Copyright 2017 Christoph Oelckers
+** All rights reserved.
+**
+** Redistribution and use in source and binary forms, with or without
+** modification, are permitted provided that the following conditions
+** are met:
+**
+** 1. Redistributions of source code must retain the above copyright
+**    notice, this list of conditions and the following disclaimer.
+** 2. Redistributions in binary form must reproduce the above copyright
+**    notice, this list of conditions and the following disclaimer in the
+**    documentation and/or other materials provided with the distribution.
+** 3. The name of the author may not be used to endorse or promote products
+**    derived from this software without specific prior written permission.
+**
+** THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+** IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+** OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+** IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+** INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+** NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+** DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+** THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+** (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+** THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**---------------------------------------------------------------------------
+**
+*/
+
+// HEADER FILES ------------------------------------------------------------
+
+#include "i_musicinterns.h"
+#include "c_cvars.h"
+#include "critsec.h"
+#include "v_text.h"
+#include "files.h"
+#include "templates.h"
+#include "sndfile_decoder.h"
+#include "mpg123_decoder.h"
+#include "m_fixed.h"
+
+// MACROS ------------------------------------------------------------------
+
+// TYPES -------------------------------------------------------------------
+
+class SndFileSong : public StreamSong
+{
+public:
+	SndFileSong(FileReader *reader, SoundDecoder *decoder, uint32_t loop_start, uint32_t loop_end, bool startass, bool endass);
+	~SndFileSong();
+	bool SetSubsong(int subsong);
+	void Play(bool looping, int subsong);
+	FString GetStats();
+	
+protected:
+	FCriticalSection CritSec;
+	FileReader *Reader;
+	SoundDecoder *Decoder;
+	int Channels;
+	int SampleRate;
+	
+	uint32_t Loop_Start;
+	uint32_t Loop_End;
+
+	int CalcSongLength();
+
+	static bool Read(SoundStream *stream, void *buff, int len, void *userdata);
+};
+
+// EXTERNAL FUNCTION PROTOTYPES --------------------------------------------
+
+// PUBLIC FUNCTION PROTOTYPES ----------------------------------------------
+
+// PRIVATE FUNCTION PROTOTYPES ---------------------------------------------
+
+// EXTERNAL DATA DECLARATIONS ----------------------------------------------
+
+// PUBLIC DATA DEFINITIONS -------------------------------------------------
+
+// PRIVATE DATA DEFINITIONS ------------------------------------------------
+
+// CODE --------------------------------------------------------------------
+
+//==========================================================================
+//
+// try to find the LOOP_START/LOOP_END tags
+//
+// This is a brute force implementation, thanks in no snall part
+// that no decent documentation of Ogg headers seems to exist and
+// all available tag libraries are horrendously bloated.
+// So if we want to do this without any new third party dependencies,
+// thanks to the lack of anything that would help to do this properly,
+// this was the only solution.
+//
+//==========================================================================
+
+void FindLoopTags(FileReader *fr, uint32_t *start, bool *startass, uint32_t *end, bool *endass)
+{
+	unsigned char testbuf[256];
+
+	fr->Seek(0, SEEK_SET);
+	long got = fr->Read(testbuf, 256);
+	auto eqp = testbuf - 1;
+	int count;
+	while(true)
+	{
+		unsigned char *c = (unsigned char *)memchr(eqp + 1, '=', 256 - (eqp + 1 - testbuf));
+		if (c == nullptr) return;	// If there is no '=' in the first 256 bytes there's also no metadata.
+
+		eqp = c;
+		while (*c >= 32 && *c < 127) c--;
+		if (*c != 0)
+		{
+			// doesn't look like a valid tag, so try again
+			continue;
+		}
+		c -= 3;
+		int len = LittleLong(*(int*)c);
+		if (len > 1000000 || len <= (eqp - c + 1))
+		{
+			// length looks fishy so retry with the next '='
+			continue;
+		}
+		c -= 4;
+		count = LittleLong(*(int*)c);
+		if (count <= 0 || count > 1000)
+		{
+			// very unlikely to have 1000 tags
+			continue;
+		}
+		c += 4;
+		fr->Seek(long(c - testbuf), SEEK_SET);
+		break;	// looks like we found something.
+	}
+	for (int i = 0; i < count; i++)
+	{
+		int length = 0;
+		fr->Read(&length, 4);
+		length = LittleLong(length);
+		if (length == 0 || length > 1000000) return;	// looks like we lost it...
+		if (length > 25)
+		{
+			// This tag is too long to be a valid time stamp so don't even bother.
+			fr->Seek(length, SEEK_CUR);
+			continue;
+		}
+		fr->Read(testbuf, length);
+		testbuf[length] = 0;
+		if (strnicmp((char*)testbuf, "LOOP_START=", 11) == 0)
+		{
+			S_ParseTimeTag((char*)testbuf + 11, startass, start);
+		}
+		else if (strnicmp((char*)testbuf, "LOOP_END=", 9) == 0)
+		{
+			S_ParseTimeTag((char*)testbuf + 9, endass, end);
+		}
+	}
+}
+
+//==========================================================================
+//
+// SndFile_OpenSong
+//
+//==========================================================================
+
+MusInfo *SndFile_OpenSong(FileReader &fr)
+{
+	uint8_t signature[4];
+	
+	fr.Seek(0, SEEK_SET);
+	fr.Read(signature, 4);
+	uint32_t loop_start = 0, loop_end = ~0u;
+	bool startass = false, endass = false;
+	
+	if (!memcmp(signature, "OggS", 4) || !memcmp(signature, "fLaC", 4))
+	{
+		// Todo: Read loop points from metadata
+		FindLoopTags(&fr, &loop_start, &startass, &loop_end, &endass);
+	}
+	fr.Seek(0, SEEK_SET);
+	auto decoder = SoundRenderer::CreateDecoder(&fr);
+	if (decoder == nullptr) return nullptr;
+	return new SndFileSong(&fr, decoder, loop_start, loop_end, startass, endass);
+}
+
+//==========================================================================
+//
+// SndFileSong - Constructor
+//
+//==========================================================================
+
+SndFileSong::SndFileSong(FileReader *reader, SoundDecoder *decoder, uint32_t loop_start, uint32_t loop_end, bool startass, bool endass)
+{
+	ChannelConfig iChannels;
+	SampleType Type;
+	
+	decoder->getInfo(&SampleRate, &iChannels, &Type);
+
+	if (!startass) loop_start = Scale(loop_start, SampleRate, 1000);
+	if (!endass) loop_end = Scale(loop_end, SampleRate, 1000);
+
+	Loop_Start = loop_start;
+	Loop_End = clamp<uint32_t>(loop_end, 0, (uint32_t)decoder->getSampleLength());
+	Reader = reader;
+	Decoder = decoder;
+	Channels = iChannels == ChannelConfig_Stereo? 2:1;
+	m_Stream = GSnd->CreateStream(Read, 32*1024, iChannels == ChannelConfig_Stereo? 0 : SoundStream::Mono, SampleRate, this);
+}
+
+//==========================================================================
+//
+// SndFileSong - Destructor
+//
+//==========================================================================
+
+SndFileSong::~SndFileSong()
+{
+	Stop();
+	if (m_Stream != nullptr)
+	{
+		delete m_Stream;
+		m_Stream = nullptr;
+	}
+	if (Decoder != nullptr)
+	{
+		delete Decoder;
+	}
+	if (Reader != nullptr)
+	{
+		delete Reader;
+	}
+}
+
+
+//==========================================================================
+//
+// SndFileSong :: Play
+//
+//==========================================================================
+
+void SndFileSong::Play(bool looping, int track)
+{
+	m_Status = STATE_Stopped;
+	m_Looping = looping;
+	if (m_Stream->Play(looping, 1))
+	{
+		m_Status = STATE_Playing;
+	}
+}
+
+//==========================================================================
+//
+// SndFileSong :: SetSubsong
+//
+//==========================================================================
+
+bool SndFileSong::SetSubsong(int track)
+{
+	return false;
+}
+
+//==========================================================================
+//
+// SndFileSong :: GetStats
+//
+//==========================================================================
+
+FString SndFileSong::GetStats()
+{
+	FString out;
+	
+	size_t SamplePos;
+	
+	SamplePos = Decoder->getSampleOffset();
+	int time = int (SamplePos / SampleRate);
+	
+	out.Format(
+		"Track: " TEXTCOLOR_YELLOW "%s, %dHz" TEXTCOLOR_NORMAL
+		"  Time:" TEXTCOLOR_YELLOW "%02d:%02d" TEXTCOLOR_NORMAL,
+		Channels == 2? "Stereo" : "Mono", SampleRate,
+		time/60,
+		time % 60);
+	return out;
+}
+
+//==========================================================================
+//
+// SndFileSong :: Read													STATIC
+//
+//==========================================================================
+
+bool SndFileSong::Read(SoundStream *stream, void *vbuff, int ilen, void *userdata)
+{
+	char *buff = (char*)vbuff;
+	SndFileSong *song = (SndFileSong *)userdata;
+	song->CritSec.Enter();
+	
+	size_t len = size_t(ilen);
+	size_t currentpos = song->Decoder->getSampleOffset();
+	size_t framestoread = len / (song->Channels*2);
+	bool err = false;
+	if (!song->m_Looping)
+	{
+		size_t maxpos = song->Decoder->getSampleLength();
+		if (currentpos == maxpos)
+		{
+			memset(buff, 0, len);
+			song->CritSec.Leave();
+			return false;
+		}
+		if (currentpos + framestoread > maxpos)
+		{
+			size_t got = song->Decoder->read(buff, (maxpos - currentpos) * song->Channels * 2);
+			memset(buff + got, 0, len - got);
+		}
+		else
+		{
+			size_t got = song->Decoder->read(buff, len);
+			err = (got != len);
+		}
+	}
+	else
+	{
+		if (currentpos + framestoread > song->Loop_End)
+		{
+			size_t endblock = (song->Loop_End - currentpos) * song->Channels * 2;
+			err = (song->Decoder->read(buff, endblock) != endblock);
+			buff = buff + endblock;
+			len -= endblock;
+			song->Decoder->seek(song->Loop_Start, false);
+		}
+		err |= song->Decoder->read(buff, len) != len;
+	}
+	song->CritSec.Leave();
+	return !err;
+}
diff --git a/src/sound/oalsound.cpp b/src/sound/oalsound.cpp
index 90639624a..d38435be1 100644
--- a/src/sound/oalsound.cpp
+++ b/src/sound/oalsound.cpp
@@ -212,7 +212,7 @@ class OpenALSoundStream : public SoundStream
 		size_t got = self->Decoder->read((char*)ptr, length);
 		if(got < (unsigned int)length)
 		{
-			if(!self->Looping || !self->Decoder->seek(0))
+			if(!self->Looping || !self->Decoder->seek(0, false))
 				return false;
 			got += self->Decoder->read((char*)ptr+got, length-got);
 		}
@@ -361,7 +361,7 @@ public:
 	virtual bool SetPosition(unsigned int ms_pos)
 	{
 		std::unique_lock<std::mutex> lock(Renderer->StreamLock);
-		if(!Decoder->seek(ms_pos))
+		if(!Decoder->seek(ms_pos, true))
 			return false;
 
 		if(!Playing.load())
diff --git a/src/sound/sndfile_decoder.cpp b/src/sound/sndfile_decoder.cpp
index 5a957eb27..9d0d2331b 100644
--- a/src/sound/sndfile_decoder.cpp
+++ b/src/sound/sndfile_decoder.cpp
@@ -132,9 +132,9 @@ TArray<char> SndFileDecoder::readAll()
     return output;
 }
 
-bool SndFileDecoder::seek(size_t ms_offset)
+bool SndFileDecoder::seek(size_t ms_offset, bool ms)
 {
-    size_t smp_offset = (size_t)((double)ms_offset / 1000. * SndInfo.samplerate);
+    size_t smp_offset = ms? (size_t)((double)ms_offset / 1000. * SndInfo.samplerate) : ms_offset;
     if(sf_seek(SndFile, smp_offset, SEEK_SET) < 0)
         return false;
     return true;
diff --git a/src/sound/sndfile_decoder.h b/src/sound/sndfile_decoder.h
index f53f7e52a..0c4cfe935 100644
--- a/src/sound/sndfile_decoder.h
+++ b/src/sound/sndfile_decoder.h
@@ -13,7 +13,7 @@ struct SndFileDecoder : public SoundDecoder
 
     virtual size_t read(char *buffer, size_t bytes);
     virtual TArray<char> readAll();
-    virtual bool seek(size_t ms_offset);
+    virtual bool seek(size_t ms_offset, bool ms);
     virtual size_t getSampleOffset();
     virtual size_t getSampleLength();