|
|
| DATABASE_HOST = os.environ.get("DATABASE_HOST") |
| |
|
| DATABASE_PORT = os.environ.get("DATABASE_PORT") |
| |
|
| DATABASE_NAME = os.environ.get("DATABASE_NAME") |
| |
|
| DATABASE_USER = os.environ.get("DATABASE_USER") |
| |
|
| DATABASE_PASSWORD = os.environ.get("DATABASE_PASSWORD") |
| |
|
string | TEST_TABLE_NAME_STR = "psycopg2_insert_time_test" |
| |
| | SETUP_SQL |
| |
| | CLEANUP_SQL |
| |
|
string | COL_FNAME = "first_name" |
| |
|
string | COL_LNAME = "last_name" |
| |
|
string | COL_EMAIL = "email" |
| |
|
string | COL_GENDER = "gender" |
| |
|
string | COL_IP = "ip_address" |
| |
|
string | COL_SSN = "ssn" |
| |
|
string | COL_CMPNYNAME = "company_name" |
| |
|
string | COL_RACE = "race" |
| |
|
string | COL_DPMT = "department" |
| |
| list | fixture_columns |
| |
| list | experiment_row_counts |
| |
|
| tester = BulkInsertionTester(rec_count) |
| |
| list | rates |
| |
|
| frame = pandas.DataFrame(rates).set_index('count') |
| |
Script to profile execution times for multiple bulk insertion methods
Findings from this script align to the conclusions made in the following
tutorial: https://trvrm.github.io/bulk-psycopg2-inserts.html
Conclusion:
Using insertion script with select + unnest option is the most efficient way
of bulk insertion. Using unnest to load multiple rows simultaneously has the
following advantages:
- It is significantly faster than a regular insert loop, especially when
inserting thousands of rows. Also aster with psycopg2.extra.
execute_values and inserting list of values with a normal single insertion
script with cursor.mogrify.
- The benefits of using unnest() increase at least up to 50,000 rows
- It still allows us to write straightforward parameterised and safe SQL
with no (injection prone) string concatenations - we use psycopg2.sql
module to generate safe sql scripts.
But using `unnest` requires explicit type conversion incase of NULL values.
A little less efficient (but still order of magnitude faster than interative
approach) but more pythonic option is to use `execute_values`. `execute_values`
require a template of placeholders to be specified beforehand which is why
we should explicitly specify default values for the missing columns during
the source to destionation table mappings.
psycopg2 API resources: http://initd.org/psycopg/docs/extras.html#fast-exec
| def ody_migr_benchmarks.pg_prepare_insert_script_cursor_mogrify |
( |
|
_tbl_name, |
|
|
|
_columns, |
|
|
|
_cursor, |
|
|
|
_values_list |
|
) |
| |
Prepare insertion script
A little arduous but reasonably safe sql with placeholders and
cursor.mogrify formatted columns and values for bulk insertion.
Example script:
INSERT INTO psycopg2_insert_time_test ("first_name", "last_name",
"email", "gender", "ip_address", "ssn", "company_name", "race",
"department")
VALUES
('254.70.235.138','Product Management','Noir','Marlyn','Livefish',
'Female','mnoir0@bizjournals.com','Guamanian','651-05-8691'),
('151.11.81.28','Sales','Doxey','Berty','Skaboo','Male',
'bdoxey1@amazon.com','Malaysian','658-15-9923')
...
Definition at line 120 of file ody_migr_benchmarks.py.
120 def pg_prepare_insert_script_cursor_mogrify(_tbl_name,
124 """Prepare insertion script 126 A little arduous but reasonably safe sql with placeholders and 127 cursor.mogrify formatted columns and values for bulk insertion. 130 INSERT INTO psycopg2_insert_time_test ("first_name", "last_name", 131 "email", "gender", "ip_address", "ssn", "company_name", "race", 134 ('254.70.235.138','Product Management','Noir','Marlyn','Livefish', 135 'Female','mnoir0@bizjournals.com','Guamanian','651-05-8691'), 136 ('151.11.81.28','Sales','Doxey','Berty','Skaboo','Male', 137 'bdoxey1@amazon.com','Malaysian','658-15-9923') 140 col_names = sql.SQL(
", ").join(map(sql.Identifier, _columns))
141 placeholders =
",".join([
"%s"] * len(fixture_columns))
143 args_str =
','.join(_cursor.mogrify(
"({})".format(
144 placeholders), x).decode(
'utf-8')
for x
in _values_list)
146 return sql.SQL(
"INSERT INTO {} ({}) VALUES {}").format(
| def ody_migr_benchmarks.pg_prepare_insert_script_executemany_dict |
( |
|
_tbl_name, |
|
|
|
_columns |
|
) |
| |
Prepare insertion script
Uses named placeholders to insert list of records (dictonary)
Example script:
INSERT INTO psycopg2_insert_time_test ("first_name", "last_name", "email",
"gender", "ip_address", "ssn", "company_name", "race", "department")
VALUES (%(first_name)s, %(last_name)s, %(email)s, %(gender)s,
%(ip_address)s, %(ssn)s, %(company_name)s, %(race)s, %(department)s)
`values` attribute of cursor.executemany() must be a list of named dict
Definition at line 153 of file ody_migr_benchmarks.py.
153 def pg_prepare_insert_script_executemany_dict(_tbl_name, _columns):
154 """Prepare insertion script 156 Uses named placeholders to insert list of records (dictonary) 159 INSERT INTO psycopg2_insert_time_test ("first_name", "last_name", "email", 160 "gender", "ip_address", "ssn", "company_name", "race", "department") 161 VALUES (%(first_name)s, %(last_name)s, %(email)s, %(gender)s, 162 %(ip_address)s, %(ssn)s, %(company_name)s, %(race)s, %(department)s) 164 `values` attribute of cursor.executemany() must be a list of named dict 166 return pg_handler.pg_prepare_insert_single_dict(_tbl_name, _columns)
| def ody_migr_benchmarks.pg_prepare_insert_script_executemany_list |
( |
|
_tbl_name, |
|
|
|
_columns |
|
) |
| |
Prepare insertion script
Uses named placeholders to insert list of records (list)
Example script:
INSERT INTO "psycopg2_insert_time_test" ("first_name", "last_name",
"email", "gender", "ip_address", "ssn", "company_name", "race",
"department") VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
`values` attribute of cursor.executemany() must be a list of multiple
records - list of list (each record values must have same order
as _columns)
Definition at line 169 of file ody_migr_benchmarks.py.
169 def pg_prepare_insert_script_executemany_list(_tbl_name, _columns):
170 """Prepare insertion script 172 Uses named placeholders to insert list of records (list) 175 INSERT INTO "psycopg2_insert_time_test" ("first_name", "last_name", 176 "email", "gender", "ip_address", "ssn", "company_name", "race", 177 "department") VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s) 180 `values` attribute of cursor.executemany() must be a list of multiple 181 records - list of list (each record values must have same order 184 return pg_handler.pg_prepare_insert_single_list(_tbl_name, _columns)