Skip to content

Commit 29132eb

Browse files
committed
Fixed django#17788 -- Added batch_size argument to qs.bulk_create()
The qs.bulk_create() method did not work with large batches together with SQLite3. This commit adds a way to split the bulk into smaller batches. The default batch size is unlimited except for SQLite3 where the batch size is limited to 999 SQL parameters per batch. Thanks to everybody who participated in the discussions at Trac.
1 parent fcad6c4 commit 29132eb

File tree

8 files changed

+110
-38
lines changed

8 files changed

+110
-38
lines changed

django/db/backends/__init__.py

Lines changed: 19 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -475,6 +475,14 @@ def autoinc_sql(self, table, column):
475475
"""
476476
return None
477477

478+
def bulk_batch_size(self, fields, objs):
479+
"""
480+
Returns the maximum allowed batch size for the backend. The fields
481+
are the fields going to be inserted in the batch, the objs contains
482+
all the objects to be inserted.
483+
"""
484+
return len(objs)
485+
478486
def cache_key_culling_sql(self):
479487
"""
480488
Returns a SQL query that retrieves the first cache key greater than the
@@ -522,6 +530,17 @@ def deferrable_sql(self):
522530
"""
523531
return ''
524532

533+
def distinct_sql(self, fields):
534+
"""
535+
Returns an SQL DISTINCT clause which removes duplicate rows from the
536+
result set. If any fields are given, only the given fields are being
537+
checked for duplicates.
538+
"""
539+
if fields:
540+
raise NotImplementedError('DISTINCT ON fields is not supported by this database backend')
541+
else:
542+
return 'DISTINCT'
543+
525544
def drop_foreignkey_sql(self):
526545
"""
527546
Returns the SQL command that drops a foreign key.
@@ -577,17 +596,6 @@ def fulltext_search_sql(self, field_name):
577596
"""
578597
raise NotImplementedError('Full-text search is not implemented for this database backend')
579598

580-
def distinct_sql(self, fields):
581-
"""
582-
Returns an SQL DISTINCT clause which removes duplicate rows from the
583-
result set. If any fields are given, only the given fields are being
584-
checked for duplicates.
585-
"""
586-
if fields:
587-
raise NotImplementedError('DISTINCT ON fields is not supported by this database backend')
588-
else:
589-
return 'DISTINCT'
590-
591599
def last_executed_query(self, cursor, sql, params):
592600
"""
593601
Returns a string of the query last executed by the given cursor, with

django/db/backends/sqlite3/base.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ class DatabaseFeatures(BaseDatabaseFeatures):
8585
supports_1000_query_parameters = False
8686
supports_mixed_date_datetime_comparisons = False
8787
has_bulk_insert = True
88-
can_combine_inserts_with_and_without_auto_increment_pk = True
88+
can_combine_inserts_with_and_without_auto_increment_pk = False
8989

9090
@cached_property
9191
def supports_stddev(self):
@@ -107,6 +107,13 @@ def supports_stddev(self):
107107
return has_support
108108

109109
class DatabaseOperations(BaseDatabaseOperations):
110+
def bulk_batch_size(self, fields, objs):
111+
"""
112+
SQLite has a compile-time default (SQLITE_LIMIT_VARIABLE_NUMBER) of
113+
999 variables per query.
114+
"""
115+
return (999 // len(fields)) if len(fields) > 0 else len(objs)
116+
110117
def date_extract_sql(self, lookup_type, field_name):
111118
# sqlite doesn't support extract, so we fake it with the user-defined
112119
# function django_extract that's registered in connect(). Note that

django/db/models/query.py

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -388,7 +388,7 @@ def create(self, **kwargs):
388388
obj.save(force_insert=True, using=self.db)
389389
return obj
390390

391-
def bulk_create(self, objs):
391+
def bulk_create(self, objs, batch_size=None):
392392
"""
393393
Inserts each of the instances into the database. This does *not* call
394394
save() on each of the instances, does not send any pre/post save
@@ -401,8 +401,10 @@ def bulk_create(self, objs):
401401
# this could be implemented if you didn't have an autoincrement pk,
402402
# and 2) you could do it by doing O(n) normal inserts into the parent
403403
# tables to get the primary keys back, and then doing a single bulk
404-
# insert into the childmost table. We're punting on these for now
405-
# because they are relatively rare cases.
404+
# insert into the childmost table. Some databases might allow doing
405+
# this by using RETURNING clause for the insert query. We're punting
406+
# on these for now because they are relatively rare cases.
407+
assert batch_size is None or batch_size > 0
406408
if self.model._meta.parents:
407409
raise ValueError("Can't bulk create an inherited model")
408410
if not objs:
@@ -418,13 +420,14 @@ def bulk_create(self, objs):
418420
try:
419421
if (connection.features.can_combine_inserts_with_and_without_auto_increment_pk
420422
and self.model._meta.has_auto_field):
421-
self.model._base_manager._insert(objs, fields=fields, using=self.db)
423+
self._batched_insert(objs, fields, batch_size)
422424
else:
423425
objs_with_pk, objs_without_pk = partition(lambda o: o.pk is None, objs)
424426
if objs_with_pk:
425-
self.model._base_manager._insert(objs_with_pk, fields=fields, using=self.db)
427+
self._batched_insert(objs_with_pk, fields, batch_size)
426428
if objs_without_pk:
427-
self.model._base_manager._insert(objs_without_pk, fields=[f for f in fields if not isinstance(f, AutoField)], using=self.db)
429+
fields= [f for f in fields if not isinstance(f, AutoField)]
430+
self._batched_insert(objs_without_pk, fields, batch_size)
428431
if forced_managed:
429432
transaction.commit(using=self.db)
430433
else:
@@ -860,6 +863,20 @@ def db(self):
860863
###################
861864
# PRIVATE METHODS #
862865
###################
866+
def _batched_insert(self, objs, fields, batch_size):
867+
"""
868+
A little helper method for bulk_insert to insert the bulk one batch
869+
at a time. Inserts recursively a batch from the front of the bulk and
870+
then _batched_insert() the remaining objects again.
871+
"""
872+
if not objs:
873+
return
874+
ops = connections[self.db].ops
875+
batch_size = (batch_size or max(ops.bulk_batch_size(fields, objs), 1))
876+
for batch in [objs[i:i+batch_size]
877+
for i in range(0, len(objs), batch_size)]:
878+
self.model._base_manager._insert(batch, fields=fields,
879+
using=self.db)
863880

864881
def _clone(self, klass=None, setup=False, **kwargs):
865882
if klass is None:

docs/ref/models/querysets.txt

Lines changed: 6 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1350,7 +1350,7 @@ has a side effect on your data. For more, see `Safe methods`_ in the HTTP spec.
13501350
bulk_create
13511351
~~~~~~~~~~~
13521352

1353-
.. method:: bulk_create(objs)
1353+
.. method:: bulk_create(objs, batch_size=None)
13541354

13551355
.. versionadded:: 1.4
13561356

@@ -1372,20 +1372,12 @@ This has a number of caveats though:
13721372
* If the model's primary key is an :class:`~django.db.models.AutoField` it
13731373
does not retrieve and set the primary key attribute, as ``save()`` does.
13741374

1375-
.. admonition:: Limits of SQLite
1375+
The ``batch_size`` parameter controls how many objects are created in a single
1376+
query. The default is to create all objects in one batch, except for SQLite
1377+
where the default is such that at most 999 variables per query are used.
13761378

1377-
SQLite sets a limit on the number of parameters per SQL statement. The
1378-
maximum is defined by the SQLITE_MAX_VARIABLE_NUMBER_ compilation option,
1379-
which defaults to 999. For instance, if your model has 8 fields (including
1380-
the primary key), you cannot create more than 999 // 8 = 124 instances at
1381-
a time. If you exceed this limit, you'll get an exception::
1382-
1383-
django.db.utils.DatabaseError: too many SQL variables
1384-
1385-
If your application's performance requirements exceed SQLite's limits, you
1386-
should switch to another database engine, such as PostgreSQL.
1387-
1388-
.. _SQLITE_MAX_VARIABLE_NUMBER: http://sqlite.org/limits.html#max_variable_number
1379+
.. versionadded:: 1.5
1380+
The ``batch_size`` parameter was added in version 1.5.
13891381

13901382
count
13911383
~~~~~

docs/releases/1.5.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,11 @@ Django 1.5 also includes several smaller improvements worth noting:
106106
* The :ref:`receiver <connecting-receiver-functions>` decorator is now able to
107107
connect to more than one signal by supplying a list of signals.
108108

109+
* :meth:`QuerySet.bulk_create()
110+
<django.db.models.query.QuerySet.bulk_create>` now has a batch_size
111+
argument. By default the batch_size is unlimited except for SQLite where
112+
a single batch is limited so that 999 parameters per query aren't exceeded.
113+
109114
Backwards incompatible changes in 1.5
110115
=====================================
111116

tests/regressiontests/bulk_create/models.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,4 +18,8 @@ class Pizzeria(Restaurant):
1818
pass
1919

2020
class State(models.Model):
21-
two_letter_code = models.CharField(max_length=2, primary_key=True)
21+
two_letter_code = models.CharField(max_length=2, primary_key=True)
22+
23+
class TwoFields(models.Model):
24+
f1 = models.IntegerField(unique=True)
25+
f2 = models.IntegerField(unique=True)

tests/regressiontests/bulk_create/tests.py

Lines changed: 43 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,11 @@
22

33
from operator import attrgetter
44

5-
from django.test import TestCase, skipIfDBFeature, skipUnlessDBFeature
5+
from django.db import connection
6+
from django.test import TestCase, skipIfDBFeature
7+
from django.test.utils import override_settings
68

7-
from .models import Country, Restaurant, Pizzeria, State
9+
from .models import Country, Restaurant, Pizzeria, State, TwoFields
810

911

1012
class BulkCreateTests(TestCase):
@@ -27,7 +29,6 @@ def test_simple(self):
2729
self.assertEqual(created, [])
2830
self.assertEqual(Country.objects.count(), 4)
2931

30-
@skipUnlessDBFeature("has_bulk_insert")
3132
def test_efficiency(self):
3233
with self.assertNumQueries(1):
3334
Country.objects.bulk_create(self.data)
@@ -69,3 +70,42 @@ def test_zero_as_autoval(self):
6970
invalid_country = Country(id=0, name='Poland', iso_two_letter='PL')
7071
with self.assertRaises(ValueError):
7172
Country.objects.bulk_create([valid_country, invalid_country])
73+
74+
def test_large_batch(self):
75+
with override_settings(DEBUG=True):
76+
connection.queries = []
77+
TwoFields.objects.bulk_create([
78+
TwoFields(f1=i, f2=i+1) for i in range(0, 1001)
79+
])
80+
self.assertTrue(len(connection.queries) < 10)
81+
self.assertEqual(TwoFields.objects.count(), 1001)
82+
self.assertEqual(
83+
TwoFields.objects.filter(f1__gte=450, f1__lte=550).count(),
84+
101)
85+
self.assertEqual(TwoFields.objects.filter(f2__gte=901).count(), 101)
86+
87+
def test_large_batch_mixed(self):
88+
"""
89+
Test inserting a large batch with objects having primary key set
90+
mixed together with objects without PK set.
91+
"""
92+
with override_settings(DEBUG=True):
93+
connection.queries = []
94+
TwoFields.objects.bulk_create([
95+
TwoFields(id=i if i % 2 == 0 else None, f1=i, f2=i+1)
96+
for i in range(100000, 101000)])
97+
self.assertTrue(len(connection.queries) < 10)
98+
self.assertEqual(TwoFields.objects.count(), 1000)
99+
# We can't assume much about the ID's created, except that the above
100+
# created IDs must exist.
101+
id_range = range(100000, 101000, 2)
102+
self.assertEqual(TwoFields.objects.filter(id__in=id_range).count(), 500)
103+
self.assertEqual(TwoFields.objects.exclude(id__in=id_range).count(), 500)
104+
105+
def test_explicit_batch_size(self):
106+
objs = [TwoFields(f1=i, f2=i) for i in range(0, 100)]
107+
with self.assertNumQueries(2):
108+
TwoFields.objects.bulk_create(objs, 50)
109+
TwoFields.objects.all().delete()
110+
with self.assertNumQueries(1):
111+
TwoFields.objects.bulk_create(objs, len(objs))

tests/regressiontests/queries/tests.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1879,8 +1879,7 @@ def test_ticket14244(self):
18791879
# Test that the "in" lookup works with lists of 1000 items or more.
18801880
Number.objects.all().delete()
18811881
numbers = range(2500)
1882-
for num in numbers:
1883-
_ = Number.objects.create(num=num)
1882+
Number.objects.bulk_create(Number(num=num) for num in numbers)
18841883
self.assertEqual(
18851884
Number.objects.filter(num__in=numbers[:1000]).count(),
18861885
1000

0 commit comments

Comments
 (0)