Skip to content
This repository was archived by the owner on Nov 20, 2018. It is now read-only.

Commit 15649b7

Browse files
committed
Faster SubMatch implementation
Submatch has been sped up by implementing a modified Boyer–Moore–Horspool algorithm with an average-case complexity of O(N) on random text. Worst case, it behaves similarly to the previous implementation O(MN), where M is the length of the boundary and N is the length of the buffer to operate on. Method SubMatch looks for two things: 1. Whether the byte array segment fully contains the boundary, or 2. Whether the byte array ends with the start of the boundary. Case 1 is now a lot faster than the previous implementation. Case 2 remains using the same code as before. The method will do Case 1 until the matchOffset is equal to N-M. It then switches to Case 2, unless a match is found. The code can be further sped up with a full Boyer–Moore implementation, or something more sophisticated. This however can be evaluated in the case that this implementation is insufficiently performant for our main scenarios. This commit resolves issue #575.
1 parent 9794074 commit 15649b7

File tree

4 files changed

+189
-29
lines changed

4 files changed

+189
-29
lines changed
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
// Copyright (c) .NET Foundation. All rights reserved.
2+
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
3+
4+
using System;
5+
using System.Text;
6+
7+
namespace Microsoft.AspNetCore.WebUtilities
8+
{
9+
internal class MultipartBoundary
10+
{
11+
private readonly int[] _skipTable = new int[256];
12+
private readonly string _boundary;
13+
private bool _expectLeadingCrlf;
14+
15+
public MultipartBoundary(string boundary, bool expectLeadingCrlf = true)
16+
{
17+
if (boundary == null)
18+
{
19+
throw new ArgumentNullException(nameof(boundary));
20+
}
21+
22+
_boundary = boundary;
23+
_expectLeadingCrlf = expectLeadingCrlf;
24+
Initialize(_boundary, _expectLeadingCrlf);
25+
}
26+
27+
private void Initialize(string boundary, bool expectLeadingCrlf)
28+
{
29+
if (expectLeadingCrlf)
30+
{
31+
BoundaryBytes = Encoding.UTF8.GetBytes("\r\n--" + boundary);
32+
}
33+
else
34+
{
35+
BoundaryBytes = Encoding.UTF8.GetBytes("--" + boundary);
36+
}
37+
FinalBoundaryLength = BoundaryBytes.Length + 2; // Include the final '--' terminator.
38+
39+
var length = BoundaryBytes.Length;
40+
for (var i = 0; i < _skipTable.Length; ++i)
41+
{
42+
_skipTable[i] = length;
43+
}
44+
for (var i = 0; i < length; ++i)
45+
{
46+
_skipTable[BoundaryBytes[i]] = Math.Max(1, length - 1 - i);
47+
}
48+
}
49+
50+
public int GetSkipValue(byte input)
51+
{
52+
return _skipTable[input];
53+
}
54+
55+
public bool ExpectLeadingCrlf
56+
{
57+
get { return _expectLeadingCrlf; }
58+
set
59+
{
60+
if (value != _expectLeadingCrlf)
61+
{
62+
_expectLeadingCrlf = value;
63+
Initialize(_boundary, _expectLeadingCrlf);
64+
}
65+
}
66+
}
67+
68+
public byte[] BoundaryBytes { get; private set; }
69+
70+
public int FinalBoundaryLength { get; private set; }
71+
}
72+
}

src/Microsoft.AspNetCore.WebUtilities/MultipartReader.cs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ public class MultipartReader
1717
private const int DefaultBufferSize = 1024 * 4;
1818

1919
private readonly BufferedReadStream _stream;
20-
private readonly string _boundary;
20+
private readonly MultipartBoundary _boundary;
2121
private MultipartReaderStream _currentStream;
2222

2323
public MultipartReader(string boundary, Stream stream)
@@ -42,9 +42,9 @@ public MultipartReader(string boundary, Stream stream, int bufferSize)
4242
throw new ArgumentOutOfRangeException(nameof(bufferSize), bufferSize, "Insufficient buffer space, the buffer must be larger than the boundary: " + boundary);
4343
}
4444
_stream = new BufferedReadStream(stream, bufferSize);
45-
_boundary = boundary;
45+
_boundary = new MultipartBoundary(boundary, false);
4646
// This stream will drain any preamble data and remove the first boundary marker.
47-
_currentStream = new MultipartReaderStream(_stream, _boundary, expectLeadingCrlf: false);
47+
_currentStream = new MultipartReaderStream(_stream, _boundary);
4848
}
4949

5050
/// <summary>
@@ -69,6 +69,7 @@ public MultipartReader(string boundary, Stream stream, int bufferSize)
6969
return null;
7070
}
7171
var headers = await ReadHeadersAsync(cancellationToken);
72+
_boundary.ExpectLeadingCrlf = true;
7273
_currentStream = new MultipartReaderStream(_stream, _boundary);
7374
long? baseStreamOffset = _stream.CanSeek ? (long?)_stream.Position : null;
7475
return new MultipartSection() { Headers = headers, Body = _currentStream, BaseStreamOffset = baseStreamOffset };

src/Microsoft.AspNetCore.WebUtilities/MultipartReaderStream.cs

Lines changed: 53 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -4,17 +4,15 @@
44
using System;
55
using System.Diagnostics;
66
using System.IO;
7-
using System.Text;
87
using System.Threading;
98
using System.Threading.Tasks;
109

1110
namespace Microsoft.AspNetCore.WebUtilities
1211
{
1312
internal class MultipartReaderStream : Stream
1413
{
14+
private readonly MultipartBoundary _boundary;
1515
private readonly BufferedReadStream _innerStream;
16-
private readonly byte[] _boundaryBytes;
17-
private readonly int _finalBoundaryLength;
1816
private readonly long _innerOffset;
1917
private long _position;
2018
private long _observedLength;
@@ -25,8 +23,7 @@ internal class MultipartReaderStream : Stream
2523
/// </summary>
2624
/// <param name="stream">The <see cref="BufferedReadStream"/>.</param>
2725
/// <param name="boundary">The boundary pattern to use.</param>
28-
/// <param name="expectLeadingCrlf">Specifies whether a leading crlf should be expected.</param>
29-
public MultipartReaderStream(BufferedReadStream stream, string boundary, bool expectLeadingCrlf = true)
26+
public MultipartReaderStream(BufferedReadStream stream, MultipartBoundary boundary)
3027
{
3128
if (stream == null)
3229
{
@@ -40,15 +37,7 @@ public MultipartReaderStream(BufferedReadStream stream, string boundary, bool ex
4037

4138
_innerStream = stream;
4239
_innerOffset = _innerStream.CanSeek ? _innerStream.Position : 0;
43-
if (expectLeadingCrlf)
44-
{
45-
_boundaryBytes = Encoding.UTF8.GetBytes("\r\n--" + boundary);
46-
}
47-
else
48-
{
49-
_boundaryBytes = Encoding.UTF8.GetBytes("--" + boundary);
50-
}
51-
_finalBoundaryLength = _boundaryBytes.Length + 2; // Include the final '--' terminator.
40+
_boundary = boundary;
5241
}
5342

5443
public bool FinalBoundaryFound { get; private set; }
@@ -205,7 +194,7 @@ public override int Read(byte[] buffer, int offset, int count)
205194
}
206195

207196
PositionInnerStream();
208-
if (!_innerStream.EnsureBuffered(_finalBoundaryLength))
197+
if (!_innerStream.EnsureBuffered(_boundary.FinalBoundaryLength))
209198
{
210199
throw new IOException("Unexpected end of stream.");
211200
}
@@ -215,20 +204,20 @@ public override int Read(byte[] buffer, int offset, int count)
215204
int matchOffset;
216205
int matchCount;
217206
int read;
218-
if (SubMatch(bufferedData, _boundaryBytes, out matchOffset, out matchCount))
207+
if (SubMatch(bufferedData, _boundary.BoundaryBytes, out matchOffset, out matchCount))
219208
{
220209
// We found a possible match, return any data before it.
221210
if (matchOffset > bufferedData.Offset)
222211
{
223212
read = _innerStream.Read(buffer, offset, Math.Min(count, matchOffset - bufferedData.Offset));
224213
return UpdatePosition(read);
225214
}
226-
Debug.Assert(matchCount == _boundaryBytes.Length);
215+
Debug.Assert(matchCount == _boundary.BoundaryBytes.Length);
227216

228217
// "The boundary may be followed by zero or more characters of
229218
// linear whitespace. It is then terminated by either another CRLF"
230219
// or -- for the final boundary.
231-
byte[] boundary = new byte[_boundaryBytes.Length];
220+
byte[] boundary = new byte[_boundary.BoundaryBytes.Length];
232221
read = _innerStream.Read(boundary, 0, boundary.Length);
233222
Debug.Assert(read == boundary.Length); // It should have all been buffered
234223
var remainder = _innerStream.ReadLine(lengthLimit: 100); // Whitespace may exceed the buffer.
@@ -256,7 +245,7 @@ public override async Task<int> ReadAsync(byte[] buffer, int offset, int count,
256245
}
257246

258247
PositionInnerStream();
259-
if (!await _innerStream.EnsureBufferedAsync(_finalBoundaryLength, cancellationToken))
248+
if (!await _innerStream.EnsureBufferedAsync(_boundary.FinalBoundaryLength, cancellationToken))
260249
{
261250
throw new IOException("Unexpected end of stream.");
262251
}
@@ -266,7 +255,7 @@ public override async Task<int> ReadAsync(byte[] buffer, int offset, int count,
266255
int matchOffset;
267256
int matchCount;
268257
int read;
269-
if (SubMatch(bufferedData, _boundaryBytes, out matchOffset, out matchCount))
258+
if (SubMatch(bufferedData, _boundary.BoundaryBytes, out matchOffset, out matchCount))
270259
{
271260
// We found a possible match, return any data before it.
272261
if (matchOffset > bufferedData.Offset)
@@ -275,12 +264,12 @@ public override async Task<int> ReadAsync(byte[] buffer, int offset, int count,
275264
read = _innerStream.Read(buffer, offset, Math.Min(count, matchOffset - bufferedData.Offset));
276265
return UpdatePosition(read);
277266
}
278-
Debug.Assert(matchCount == _boundaryBytes.Length);
267+
Debug.Assert(matchCount == _boundary.BoundaryBytes.Length);
279268

280269
// "The boundary may be followed by zero or more characters of
281270
// linear whitespace. It is then terminated by either another CRLF"
282271
// or -- for the final boundary.
283-
byte[] boundary = new byte[_boundaryBytes.Length];
272+
byte[] boundary = new byte[_boundary.BoundaryBytes.Length];
284273
read = _innerStream.Read(boundary, 0, boundary.Length);
285274
Debug.Assert(read == boundary.Length); // It should have all been buffered
286275
var remainder = await _innerStream.ReadLineAsync(lengthLimit: 100, cancellationToken: cancellationToken); // Whitespace may exceed the buffer.
@@ -300,18 +289,44 @@ public override async Task<int> ReadAsync(byte[] buffer, int offset, int count,
300289
return UpdatePosition(read);
301290
}
302291

303-
// Does Segment1 contain all of segment2, or does it end with the start of segment2?
292+
// Does segment1 contain all of matchBytes, or does it end with the start of matchBytes?
304293
// 1: AAAAABBBBBCCCCC
305294
// 2: BBBBB
306295
// Or:
307296
// 1: AAAAABBB
308297
// 2: BBBBB
309-
private static bool SubMatch(ArraySegment<byte> segment1, byte[] matchBytes, out int matchOffset, out int matchCount)
298+
private bool SubMatch(ArraySegment<byte> segment1, byte[] matchBytes, out int matchOffset, out int matchCount)
310299
{
300+
// clear matchCount to zero
301+
matchCount = 0;
302+
303+
// case 1: does segment1 fully contain matchBytes?
304+
{
305+
var matchBytesLengthMinusOne = matchBytes.Length - 1;
306+
var matchBytesLastByte = matchBytes[matchBytesLengthMinusOne];
307+
var segmentEndMinusMatchBytesLength = segment1.Offset + segment1.Count - matchBytes.Length;
308+
309+
matchOffset = segment1.Offset;
310+
while (matchOffset < segmentEndMinusMatchBytesLength)
311+
{
312+
var lookaheadTailChar = segment1.Array[matchOffset + matchBytesLengthMinusOne];
313+
if (lookaheadTailChar == matchBytesLastByte &&
314+
CompareBuffers(segment1.Array, matchOffset, matchBytes, 0, matchBytesLengthMinusOne) == 0)
315+
{
316+
matchCount = matchBytes.Length;
317+
return true;
318+
}
319+
matchOffset += _boundary.GetSkipValue(lookaheadTailChar);
320+
}
321+
}
322+
323+
// case 2: does segment1 end with the start of matchBytes?
324+
var segmentEnd = segment1.Offset + segment1.Count;
325+
311326
matchCount = 0;
312-
for (matchOffset = segment1.Offset; matchOffset < segment1.Offset + segment1.Count; matchOffset++)
327+
for (; matchOffset < segmentEnd; matchOffset++)
313328
{
314-
int countLimit = segment1.Offset - matchOffset + segment1.Count;
329+
var countLimit = segmentEnd - matchOffset;
315330
for (matchCount = 0; matchCount < matchBytes.Length && matchCount < countLimit; matchCount++)
316331
{
317332
if (matchBytes[matchCount] != segment1.Array[matchOffset + matchCount])
@@ -327,5 +342,17 @@ private static bool SubMatch(ArraySegment<byte> segment1, byte[] matchBytes, out
327342
}
328343
return matchCount > 0;
329344
}
345+
346+
private static int CompareBuffers(byte[] buffer1, int offset1, byte[] buffer2, int offset2, int count)
347+
{
348+
for (; count-- > 0; offset1++, offset2++)
349+
{
350+
if (buffer1[offset1] != buffer2[offset2])
351+
{
352+
return buffer1[offset1] - buffer2[offset2];
353+
}
354+
}
355+
return 0;
356+
}
330357
}
331358
}

test/Microsoft.AspNetCore.WebUtilities.Tests/MultipartReaderTests.cs

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
// Copyright (c) .NET Foundation. All rights reserved.
22
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
33

4+
using System;
45
using System.IO;
56
using System.Text;
67
using System.Threading.Tasks;
@@ -74,11 +75,29 @@ public class MultipartReaderTests
7475
"\r\n" +
7576
"--9051914041544843365972754266--\r\n";
7677

78+
private const string TwoPartBodyIncompleteBuffer =
79+
"--9051914041544843365972754266\r\n" +
80+
"Content-Disposition: form-data; name=\"text\"\r\n" +
81+
"\r\n" +
82+
"text default\r\n" +
83+
"--9051914041544843365972754266\r\n" +
84+
"Content-Disposition: form-data; name=\"file1\"; filename=\"a.txt\"\r\n" +
85+
"Content-Type: text/plain\r\n" +
86+
"\r\n" +
87+
"Content of a.txt.\r\n" +
88+
"\r\n" +
89+
"--9051914041544843365";
90+
7791
private static MemoryStream MakeStream(string text)
7892
{
7993
return new MemoryStream(Encoding.UTF8.GetBytes(text));
8094
}
8195

96+
private static string GetString(byte[] buffer, int count)
97+
{
98+
return Encoding.ASCII.GetString(buffer, 0, count);
99+
}
100+
82101
[Fact]
83102
public async Task MutipartReader_ReadSinglePartBody_Success()
84103
{
@@ -217,6 +236,47 @@ public async Task MutipartReader_ThreePartBody_Success()
217236
Assert.Null(await reader.ReadNextSectionAsync());
218237
}
219238

239+
[Fact]
240+
public void MutipartReader_BufferSizeMustBeLargerThanBoundary_Throws()
241+
{
242+
var stream = MakeStream(ThreePartBody);
243+
Assert.Throws<ArgumentOutOfRangeException>(() =>
244+
{
245+
var reader = new MultipartReader(Boundary, stream, 5);
246+
});
247+
}
248+
249+
[Fact]
250+
public async Task MutipartReader_TwoPartBodyIncompleteBuffer_TwoSectionsReadSuccessfullyThirdSectionThrows()
251+
{
252+
var stream = MakeStream(TwoPartBodyIncompleteBuffer);
253+
var reader = new MultipartReader(Boundary, stream);
254+
var buffer = new byte[128];
255+
256+
//first section can be read successfully
257+
var section = await reader.ReadNextSectionAsync();
258+
Assert.NotNull(section);
259+
Assert.Equal(1, section.Headers.Count);
260+
Assert.Equal("form-data; name=\"text\"", section.Headers["Content-Disposition"][0]);
261+
var read = section.Body.Read(buffer, 0, buffer.Length);
262+
Assert.Equal("text default", GetString(buffer, read));
263+
264+
//second section can be read successfully (even though the bottom boundary is truncated)
265+
section = await reader.ReadNextSectionAsync();
266+
Assert.NotNull(section);
267+
Assert.Equal(2, section.Headers.Count);
268+
Assert.Equal("form-data; name=\"file1\"; filename=\"a.txt\"", section.Headers["Content-Disposition"][0]);
269+
Assert.Equal("text/plain", section.Headers["Content-Type"][0]);
270+
read = section.Body.Read(buffer, 0, buffer.Length);
271+
Assert.Equal("Content of a.txt.\r\n", GetString(buffer, read));
272+
273+
await Assert.ThrowsAsync<IOException>(async () =>
274+
{
275+
// we'll be unable to ensure enough bytes are buffered to even contain a final boundary
276+
section = await reader.ReadNextSectionAsync();
277+
});
278+
}
279+
220280
[Fact]
221281
public async Task MutipartReader_ReadInvalidUtf8Header_ReplacementCharacters()
222282
{

0 commit comments

Comments
 (0)