
Commit f3f7266

Better logic for handling unicode and chunking (elastic#870)
Chunking was looking at the character length of each serialized string but not accounting for its byte length. Closes elastic#716
1 parent bafe659 commit f3f7266
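For context: in Python, a unicode string's character count can be smaller than its UTF-8 byte count, so summing len(serialized_action) underestimates the size of the bulk request body whenever documents contain non-ASCII text. A minimal standalone illustration (plain json.dumps stands in for the library's serializer here, which is an assumption made for the example):

    # -*- coding: utf-8 -*-
    import json

    doc = {'some': u'datá'}
    serialized = json.dumps(doc, ensure_ascii=False)

    print(len(serialized))                  # 16 characters
    print(len(serialized.encode('utf-8')))  # 17 bytes: 'á' is two bytes in UTF-8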

2 files changed: 14 additions, 3 deletions


elasticsearch/helpers/__init__.py

Lines changed: 3 additions & 2 deletions
@@ -58,11 +58,12 @@ def _chunk_actions(actions, chunk_size, max_chunk_bytes, serializer):
     for action, data in actions:
         raw_data, raw_action = data, action
         action = serializer.dumps(action)
-        cur_size = len(action) + 1
+        # +1 to account for the trailing new line character
+        cur_size = len(action.encode('utf-8')) + 1
 
         if data is not None:
             data = serializer.dumps(data)
-            cur_size += len(data) + 1
+            cur_size += len(data.encode('utf-8')) + 1
 
         # full chunk, send it and start a new one
         if bulk_actions and (size + cur_size > max_chunk_bytes or action_count == chunk_size):
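The corrected accounting counts encoded bytes and adds one byte per line for the trailing newline that the bulk request body needs. A standalone sketch of the same computation (json.dumps with ensure_ascii=False stands in for the configured serializer, which is an assumption for the example):

    # -*- coding: utf-8 -*-
    import json

    def action_byte_size(action, data=None):
        # serialize, then count UTF-8 bytes; +1 per line for the trailing newline
        size = len(json.dumps(action, ensure_ascii=False).encode('utf-8')) + 1
        if data is not None:
            size += len(json.dumps(data, ensure_ascii=False).encode('utf-8')) + 1
        return size

    print(action_byte_size({'index': {}}, {'some': u'datá', 'i': 0}))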

test_elasticsearch/test_helpers.py

Lines changed: 11 additions & 1 deletion
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 import mock
 import time
 import threading
@@ -46,14 +47,23 @@ def test_chunk_sent_from_different_threads(self, _process_bulk_chunk):
 class TestChunkActions(TestCase):
     def setUp(self):
         super(TestChunkActions, self).setUp()
-        self.actions = [({'index': {}}, {'some': 'data', 'i': i}) for i in range(100)]
+        self.actions = [({'index': {}}, {'some': u'datá', 'i': i}) for i in range(100)]
 
     def test_chunks_are_chopped_by_byte_size(self):
         self.assertEquals(100, len(list(helpers._chunk_actions(self.actions, 100000, 1, JSONSerializer()))))
 
     def test_chunks_are_chopped_by_chunk_size(self):
         self.assertEquals(10, len(list(helpers._chunk_actions(self.actions, 10, 99999999, JSONSerializer()))))
 
+    def test_chunks_are_chopped_by_byte_size_properly(self):
+        max_byte_size = 170
+        chunks = list(helpers._chunk_actions(self.actions, 100000, max_byte_size, JSONSerializer()))
+        self.assertEquals(25, len(chunks))
+        for chunk_data, chunk_actions in chunks:
+            chunk = u''.join(chunk_actions)
+            chunk = chunk if isinstance(chunk, str) else chunk.encode('utf-8')
+            self.assertLessEqual(len(chunk), max_byte_size)
+
 class TestExpandActions(TestCase):
     def test_string_actions_are_marked_as_simple_inserts(self):
         self.assertEquals(('{"index":{}}', "whatever"), helpers.expand_action('whatever'))
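Downstream, this byte limit is what the public bulk helpers expose as max_chunk_bytes, which after this change is compared against the UTF-8 encoded size of each chunk rather than its character length. A hedged usage sketch (the local cluster, index name, and document shape are placeholders, not taken from the commit):

    from elasticsearch import Elasticsearch, helpers

    es = Elasticsearch()  # assumes a cluster reachable on localhost:9200

    actions = ({'_index': 'test-index', '_source': {'some': u'datá', 'i': i}}
               for i in range(100))

    # a chunk is flushed once it holds chunk_size actions or would exceed
    # max_chunk_bytes, now measured in encoded bytes
    helpers.bulk(es, actions, chunk_size=500, max_chunk_bytes=10 * 1024 * 1024)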
