Skip to content

Commit 591c99a

Browse files
committed
Lowercase tables and simplify prepare_db scripts
1 parent baeb9e5 commit 591c99a

File tree

4 files changed

+112
-178
lines changed

4 files changed

+112
-178
lines changed

dev/prepare_db.pql

Lines changed: 79 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -6,34 +6,18 @@ func run_sql(code) {
66

77
// Cleanup
88
func cleanup() {
9-
run_sql("DROP TABLE IF EXISTS Rating")
9+
run_sql("DROP TABLE IF EXISTS rating")
1010
run_sql("DROP TABLE IF EXISTS tmp_rating")
11-
run_sql("DROP TABLE IF EXISTS Rating_del1")
12-
run_sql("DROP TABLE IF EXISTS Rating_update1")
13-
run_sql("DROP TABLE IF EXISTS Rating_update001p")
14-
run_sql("DROP TABLE IF EXISTS Rating_update1p")
15-
run_sql("DROP TABLE IF EXISTS Rating_del1p")
16-
run_sql("DROP TABLE IF EXISTS Rating_update50p")
11+
run_sql("DROP TABLE IF EXISTS rating_del1")
12+
run_sql("DROP TABLE IF EXISTS rating_update1")
13+
run_sql("DROP TABLE IF EXISTS rating_update001p")
14+
run_sql("DROP TABLE IF EXISTS rating_update1p")
15+
run_sql("DROP TABLE IF EXISTS rating_del1p")
16+
run_sql("DROP TABLE IF EXISTS rating_update50p")
1717
commit()
1818
}
1919

20-
func cleanup_double_quote() {
21-
run_sql("DROP TABLE IF EXISTS \"Rating\"")
22-
run_sql("DROP TABLE IF EXISTS \"tmp_rating\"")
23-
run_sql("DROP TABLE IF EXISTS \"Rating_del1\"")
24-
run_sql("DROP TABLE IF EXISTS \"Rating_update1\"")
25-
run_sql("DROP TABLE IF EXISTS \"Rating_update001p\"")
26-
run_sql("DROP TABLE IF EXISTS \"Rating_update1p\"")
27-
run_sql("DROP TABLE IF EXISTS \"Rating_del1p\"")
28-
run_sql("DROP TABLE IF EXISTS \"Rating_update50p\"")
29-
commit()
30-
}
31-
32-
if (db_type == "postgres" or db_type == "redshift") {
33-
cleanup_double_quote()
34-
} else {
35-
cleanup()
36-
}
20+
cleanup()
3721

3822
// Import CSV
3923
if (db_type == "snowflake") {
@@ -46,17 +30,17 @@ if (db_type == "snowflake") {
4630

4731
table tmp_rating {
4832
id: int
49-
movieId: int
33+
movieid: int
5034
rating: float
5135
timestamp: int
5236
}
5337

5438
run_sql("COPY INTO tmp_rating FROM '@~/ratings.csv.gz' file_format=(skip_header=1)")
5539

56-
table Rating {
40+
table rating {
5741
id: int
58-
userId: int
59-
movieId: int
42+
userid: int
43+
movieid: int
6044
rating: float
6145
timestamp: int
6246
}
@@ -85,7 +69,7 @@ if (db_type == "snowflake") {
8569
IGNOREHEADER 1;
8670
""")
8771

88-
table Rating {
72+
table rating {
8973
id: int // explicit id, to avoid identity type
9074
userid: int
9175
movieid: int
@@ -97,69 +81,91 @@ if (db_type == "snowflake") {
9781
INSERT INTO rating(id, userid, movieid, rating, timestamp)
9882
SELECT row_number() over (order by userid, movieid, timestamp) AS id, userid, movieid, rating, timestamp FROM tmp_rating
9983
""")
84+
} else if (db_type == "mssql") {
85+
run_sql("drop table if exists tmp_rating")
86+
run_sql("create table tmp_rating(userid int, movieid int, rating float, timestamp int)")
87+
table tmp_rating {...}
88+
print "Loading ratings CSV"
89+
run_sql("BULK INSERT tmp_rating from 'ml-25m/ratings.csv' with (fieldterminator = ',', rowterminator = '0x0a', FIRSTROW = 2);")
90+
print "Populating actual table"
91+
rating += tmp_rating
92+
commit()
10093
} else {
10194
print "Importing ratings CSV"
10295

103-
table Rating {
104-
userId: int
105-
movieId: int
96+
table rating {
97+
userid: int
98+
movieid: int
10699
rating: float
107100
timestamp: int
108101
}
109-
import_csv(Rating, 'ml-25m/ratings.csv', true)
110-
Rating.add_index("id")
102+
import_csv(rating, 'ml-25m/ratings.csv', true)
103+
rating.add_index("id", true)
104+
rating.add_index("timestamp")
105+
run_sql("CREATE INDEX index_rating_id_timestamp ON rating (id, timestamp)")
111106
}
112107

113108
run_sql("DROP TABLE IF EXISTS tmp_rating")
114109
commit()
115110

116-
middle = count(Rating) /~ 2
111+
middle = count(rating) /~ 2
117112

118113
// Code notes:
119114
// - We use 'const table' to avoid updating the ids
120115

121-
// Rating_del1 = Delete middle row
122-
print "Create Rating_del1"
123-
const table Rating_del1 = Rating
124-
Rating_del1.add_index("id")
125-
Rating_del1[middle..(middle+1)] delete [true]
126-
assert count(Rating) == count(Rating_del1) + 1
127-
128-
// Rating_update1 = Update middle row
129-
print "Create Rating_update1"
130-
const table Rating_update1 = Rating
131-
Rating_update1.add_index("id")
132-
Rating_update1[middle..(middle+1)] update {timestamp: timestamp + 1}
133-
134-
// Rating_<>p = Percentile of rows changed
135-
print "Create percentile tables"
136-
const table Rating_update001p = Rating
137-
const table Rating_update1p = Rating
138-
const table Rating_del1p = Rating
139-
const table Rating_update50p = Rating
140-
141-
Rating_update001p.add_index("id")
142-
Rating_update1p.add_index("id")
143-
Rating_del1p.add_index("id")
144-
Rating_update50p.add_index("id")
116+
print "Create tables"
117+
const table rating_del1 = rating
118+
const table rating_update1 = rating
119+
const table rating_update001p = rating
120+
const table rating_update1p = rating
121+
const table rating_del1p = rating
122+
const table rating_update50p = rating
123+
124+
print "Create indexes"
125+
if (db_type != "redshift" and db_type != "snowflake") {  // skip index creation on redshift and snowflake; both exclusions must hold, so this needs `and`
126+
rating_del1.add_index("id", true)
127+
rating_del1.add_index("timestamp")
128+
run_sql("CREATE INDEX index_rating_del1_id_timestamp ON rating_del1 (id, timestamp)")
129+
rating_update1.add_index("id", true)
130+
rating_update1.add_index("timestamp")
131+
run_sql("CREATE INDEX index_rating_update1_id_timestamp ON rating_update1 (id, timestamp)")
132+
rating_update001p.add_index("id", true)
133+
rating_update001p.add_index("timestamp")
134+
run_sql("CREATE INDEX index_rating_update001p_id_timestamp ON rating_update001p (id, timestamp)")
135+
rating_update1p.add_index("id", true)
136+
rating_update1p.add_index("timestamp")
137+
run_sql("CREATE INDEX index_rating_update1p_id_timestamp ON rating_update1p (id, timestamp)")
138+
rating_del1p.add_index("id", true)
139+
rating_del1p.add_index("timestamp")
140+
run_sql("CREATE INDEX index_rating_del1p_id_timestamp ON rating_del1p (id, timestamp)")
141+
rating_update50p.add_index("id", true)
142+
rating_update50p.add_index("timestamp")
143+
run_sql("CREATE INDEX index_rating_update50p_id_timestamp ON rating_update50p (id, timestamp)")
144+
commit()
145+
}
146+
147+
print "Alter tables"
148+
rating_del1[middle..(middle+1)] delete [true]
149+
assert count(rating) == count(rating_del1) + 1
150+
rating_update1[middle..(middle+1)] update {timestamp: timestamp + 1}
145151

146152
if (db_type == "postgres" or db_type == "redshift") {
147-
run_sql('UPDATE "Rating_update001p" SET "timestamp" = ("timestamp" + 1) WHERE random() < 0.0001')
148-
run_sql('UPDATE "Rating_update1p" SET "timestamp" = ("timestamp" + 1) WHERE random() < 0.01')
149-
run_sql('DELETE FROM "Rating_del1p" WHERE random() < 0.01')
150-
run_sql('UPDATE "Rating_update50p" SET "timestamp" = ("timestamp" + 1) WHERE random() < 0.5')
151-
} else if (db_type == "mysql") {
152-
run_sql('UPDATE Rating_update001p SET timestamp = (timestamp + 1) WHERE rand() < 0.0001')
153-
run_sql('UPDATE Rating_update1p SET timestamp = (timestamp + 1) WHERE rand() < 0.01')
154-
run_sql('DELETE FROM Rating_del1p WHERE rand() < 0.01')
155-
run_sql('UPDATE Rating_update50p SET timestamp = (timestamp + 1) WHERE rand() < 0.5')
153+
run_sql('UPDATE rating_update001p SET timestamp = (timestamp + 1) WHERE random() < 0.0001')
154+
run_sql('UPDATE rating_update1p SET timestamp = (timestamp + 1) WHERE random() < 0.01')
155+
run_sql('DELETE FROM rating_del1p WHERE random() < 0.01')
156+
run_sql('UPDATE rating_update50p SET timestamp = (timestamp + 1) WHERE random() < 0.5')
157+
} else if (db_type == "mysql" or db_type == "mssql") {
158+
run_sql('UPDATE rating_update001p SET timestamp = (timestamp + 1) WHERE rand() < 0.0001')
159+
run_sql('UPDATE rating_update1p SET timestamp = (timestamp + 1) WHERE rand() < 0.01')
160+
run_sql('DELETE FROM rating_del1p WHERE rand() < 0.01')
161+
run_sql('UPDATE rating_update50p SET timestamp = (timestamp + 1) WHERE rand() < 0.5')
156162
} else if (db_type == "snowflake") {
157-
run_sql('UPDATE Rating_update001p SET timestamp = (timestamp + 1) WHERE uniform(0::float, 1, random()) < 0.0001')
158-
run_sql('UPDATE Rating_update1p SET timestamp = (timestamp + 1) WHERE uniform(0::float, 1, random()) < 0.01')
159-
run_sql('DELETE FROM Rating_del1p WHERE uniform(0::float, 1, random()) < 0.01')
160-
run_sql('UPDATE Rating_update50p SET timestamp = (timestamp + 1) WHERE uniform(0::float, 1, random()) < 0.5')
163+
run_sql('UPDATE rating_update001p SET timestamp = (timestamp + 1) WHERE uniform(0::float, 1, random()) < 0.0001')
164+
run_sql('UPDATE rating_update1p SET timestamp = (timestamp + 1) WHERE uniform(0::float, 1, random()) < 0.01')
165+
run_sql('DELETE FROM rating_del1p WHERE uniform(0::float, 1, random()) < 0.01')
166+
run_sql('UPDATE rating_update50p SET timestamp = (timestamp + 1) WHERE uniform(0::float, 1, random()) < 0.5')
161167
} else {
162168
print "Unsupported database: " + db_type
163169
}
164170

165-
commit()
171+
commit()

dev/prepare_db_bigquery.pql

Lines changed: 30 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -4,23 +4,23 @@ func run_sql(code) {
44
}
55

66
// Cleaning
7-
run_sql("DROP TABLE IF EXISTS xdiff.Rating")
7+
run_sql("DROP TABLE IF EXISTS xdiff.rating")
88
run_sql("DROP TABLE IF EXISTS xdiff.tmp_rating")
9-
run_sql("DROP TABLE IF EXISTS xdiff.Rating_del1")
10-
run_sql("DROP TABLE IF EXISTS xdiff.Rating_update1")
11-
run_sql("DROP TABLE IF EXISTS xdiff.Rating_update001p")
12-
run_sql("DROP TABLE IF EXISTS xdiff.Rating_update1p")
13-
run_sql("DROP TABLE IF EXISTS xdiff.Rating_del1p")
14-
run_sql("DROP TABLE IF EXISTS xdiff.Rating_update50p")
9+
run_sql("DROP TABLE IF EXISTS xdiff.rating_del1")
10+
run_sql("DROP TABLE IF EXISTS xdiff.rating_update1")
11+
run_sql("DROP TABLE IF EXISTS xdiff.rating_update001p")
12+
run_sql("DROP TABLE IF EXISTS xdiff.rating_update1p")
13+
run_sql("DROP TABLE IF EXISTS xdiff.rating_del1p")
14+
run_sql("DROP TABLE IF EXISTS xdiff.rating_update50p")
1515

1616
// Import CSV
1717
print "Importing the CSV through the Python script"
1818
PY("0", "import _bq_import_csv")
1919

20-
// run_sql("ALTER TABLE `datafold-dev-2.xdiff.Rating` ADD COLUMN id int")
21-
// run_sql("UPDATE `datafold-dev-2.xdiff.Rating` SET id = cast(GENERATE_UUID() as bytes) WHERE True")
20+
// run_sql("ALTER TABLE `datafold-dev-2.xdiff.rating` ADD COLUMN id int")
21+
// run_sql("UPDATE `datafold-dev-2.xdiff.rating` SET id = cast(GENERATE_UUID() as bytes) WHERE True")
2222

23-
table xdiff.Rating {
23+
table xdiff.rating {
2424
id: int // explicit id, to avoid identity type
2525
userid: int
2626
movieid: int
@@ -29,36 +29,36 @@ table xdiff.Rating {
2929
}
3030

3131
run_sql("""
32-
INSERT INTO xdiff.Rating(id, userid, movieid, rating, timestamp)
32+
INSERT INTO xdiff.rating(id, userid, movieid, rating, timestamp)
3333
SELECT row_number() over (order by userid, movieid, timestamp) AS id, userid, movieid, rating, timestamp FROM xdiff.tmp_rating
3434
""")
3535

36-
Rating = xdiff.Rating
36+
rating = xdiff.rating
3737

38-
middle = count(Rating) /~ 2
38+
middle = count(rating) /~ 2
3939

4040
// Code notes:
4141
// - We use 'const table' to avoid updating the ids
4242

43-
// Rating_del1 = Delete middle row
44-
run_sql("CREATE TABLE xdiff.Rating_del1 AS (SELECT * FROM xdiff.Rating)")
45-
table xdiff.Rating_del1{...}
46-
xdiff.Rating_del1[middle..(middle+1)] delete [true]
47-
assert count(xdiff.Rating) == count(xdiff.Rating_del1) + 1
43+
// rating_del1 = Delete middle row
44+
run_sql("CREATE TABLE xdiff.rating_del1 AS (SELECT * FROM xdiff.rating)")
45+
table xdiff.rating_del1{...}
46+
xdiff.rating_del1[middle..(middle+1)] delete [true]
47+
assert count(xdiff.rating) == count(xdiff.rating_del1) + 1
4848

49-
// Rating_del1 = Update middle row
50-
run_sql("CREATE TABLE xdiff.Rating_update1 AS (SELECT * FROM xdiff.Rating)")
51-
table xdiff.Rating_update1{...}
52-
xdiff.Rating_update1[middle..(middle+1)] update {timestamp: timestamp + 1}
49+
// rating_update1 = Update middle row
50+
run_sql("CREATE TABLE xdiff.rating_update1 AS (SELECT * FROM xdiff.rating)")
51+
table xdiff.rating_update1{...}
52+
xdiff.rating_update1[middle..(middle+1)] update {timestamp: timestamp + 1}
5353

54-
run_sql("CREATE TABLE xdiff.Rating_update001p AS (SELECT * FROM xdiff.Rating)")
55-
run_sql("CREATE TABLE xdiff.Rating_update1p AS (SELECT * FROM xdiff.Rating)")
56-
run_sql("CREATE TABLE xdiff.Rating_del1p AS (SELECT * FROM xdiff.Rating)")
57-
run_sql("CREATE TABLE xdiff.Rating_update50p AS (SELECT * FROM xdiff.Rating)")
54+
run_sql("CREATE TABLE xdiff.rating_update001p AS (SELECT * FROM xdiff.rating)")
55+
run_sql("CREATE TABLE xdiff.rating_update1p AS (SELECT * FROM xdiff.rating)")
56+
run_sql("CREATE TABLE xdiff.rating_del1p AS (SELECT * FROM xdiff.rating)")
57+
run_sql("CREATE TABLE xdiff.rating_update50p AS (SELECT * FROM xdiff.rating)")
5858

59-
run_sql('UPDATE xdiff.Rating_update001p SET timestamp = (timestamp + 1) WHERE rand() < 0.0001')
60-
run_sql('UPDATE xdiff.Rating_update1p SET timestamp = (timestamp + 1) WHERE rand() < 0.01')
61-
run_sql('DELETE FROM xdiff.Rating_del1p WHERE rand() < 0.01')
62-
run_sql('UPDATE xdiff.Rating_update50p SET timestamp = (timestamp + 1) WHERE rand() < 0.5')
59+
run_sql('UPDATE xdiff.rating_update001p SET timestamp = (timestamp + 1) WHERE rand() < 0.0001')
60+
run_sql('UPDATE xdiff.rating_update1p SET timestamp = (timestamp + 1) WHERE rand() < 0.01')
61+
run_sql('DELETE FROM xdiff.rating_del1p WHERE rand() < 0.01')
62+
run_sql('UPDATE xdiff.rating_update50p SET timestamp = (timestamp + 1) WHERE rand() < 0.5')
6363

6464
// commit()

dev/prepare_db_mssql.pql

Lines changed: 0 additions & 72 deletions
This file was deleted.

setup_testenv.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ unzip ml-25m.zip -d dev/
2626

2727
4. Setup databases
2828

29-
(note: bigquery and mssql have their own setup scripts)
29+
(note: bigquery has its own setup script)
3030

3131
```
3232
preql -f dev/prepare_db postgres://<uri>
@@ -35,9 +35,9 @@ preql -f dev/prepare_db mysql://<uri>
3535
3636
preql -f dev/prepare_db snowflake://<uri>
3737
38-
preql -f dev/prepare_db_bigquery bigquery:///<project>
38+
preql -f dev/prepare_db mssql://<uri>
3939
40-
preql -f dev/prepare_db_mssql mssql://<uri>
40+
preql -f dev/prepare_db_bigquery bigquery:///<project>
4141
4242
4343
etc.

0 commit comments

Comments
 (0)