@@ -6,14 +6,15 @@ func run_sql(code) {
66
77// Cleanup: drop the rating tables this script creates (base, temp, and derived copies) so a re-run starts clean
88func cleanup() {
9- run_sql("DROP TABLE IF EXISTS Rating ")
9+ run_sql("DROP TABLE IF EXISTS rating ")
1010 run_sql("DROP TABLE IF EXISTS tmp_rating")
11- run_sql("DROP TABLE IF EXISTS Rating_del1")
12- run_sql("DROP TABLE IF EXISTS Rating_update1")
13- run_sql("DROP TABLE IF EXISTS Rating_update001p")
14- run_sql("DROP TABLE IF EXISTS Rating_update1p")
15- run_sql("DROP TABLE IF EXISTS Rating_del1p")
16- run_sql("DROP TABLE IF EXISTS Rating_update50p")
11+ run_sql("DROP TABLE IF EXISTS rating_del1")
12+ run_sql("DROP TABLE IF EXISTS rating_update1")
13+ run_sql("DROP TABLE IF EXISTS rating_update001p")
14+ run_sql("DROP TABLE IF EXISTS rating_update1p")
15+ run_sql("DROP TABLE IF EXISTS rating_del1p")
16+ run_sql("DROP TABLE IF EXISTS rating_update50p")
17+ commit()
1718}
1819
1920cleanup()
@@ -29,17 +30,17 @@ if (db_type == "snowflake") {
2930
3031 table tmp_rating {
3132 id: int
32- movieId : int
33+ movieid : int
3334 rating: float
3435 timestamp: int
3536 }
3637
3738 run_sql("COPY INTO tmp_rating FROM '@~/ratings.csv.gz' file_format=(skip_header=1)")
3839
39- table Rating {
40+ table rating {
4041 id: int
41- userId : int
42- movieId : int
42+ userid : int
43+ movieid : int
4344 rating: float
4445 timestamp: int
4546 }
@@ -68,7 +69,7 @@ if (db_type == "snowflake") {
6869 IGNOREHEADER 1;
6970 """)
7071
71- table Rating {
72+ table rating {
7273 id: int // explicit id, to avoid identity type
7374 userid: int
7475 movieid: int
@@ -80,69 +81,91 @@ if (db_type == "snowflake") {
8081 INSERT INTO rating(id, userid, movieid, rating, timestamp)
8182 SELECT row_number() over (order by userid, movieid, timestamp) AS id, userid, movieid, rating, timestamp FROM tmp_rating
8283 """)
84+ } else if (db_type == "mssql") {
85+ run_sql("drop table if exists tmp_rating")
86+ run_sql("create table tmp_rating(userid int, movieid int, rating float, timestamp int)")
87+ table tmp_rating {...}
88+ print "Loading ratings CSV"
89+ run_sql("BULK INSERT tmp_rating from 'ml-25m/ratings.csv' with (fieldterminator = ',', rowterminator = '0x0a', FIRSTROW = 2);")
90+ print "Populating actual table"
91+ rating += tmp_rating // NOTE(review): no `table rating {...}` declaration is visible in this branch, unlike the other branches - confirm `rating` exists before this append
92+ commit()
8393} else {
8494 print "Importing ratings CSV"
8595
86- table Rating {
87- userId : int
88- movieId : int
96+ table rating {
97+ userid : int
98+ movieid : int
8999 rating: float
90100 timestamp: int
91101 }
92- import_csv(Rating, 'ml-25m/ratings.csv', true)
93- Rating.add_index("id")
102+ import_csv(rating, 'ml-25m/ratings.csv', true)
103+ rating.add_index("id", true)
104+ rating.add_index("timestamp")
105+ run_sql("CREATE INDEX index_rating_id_timestamp ON rating (id, timestamp)")
94106}
95107
96108run_sql("DROP TABLE IF EXISTS tmp_rating")
97109commit()
98110
99- middle = count(Rating ) /~ 2
111+ middle = count(rating ) /~ 2
100112
101113// Code notes:
102114// - We use 'const table' to avoid updating the ids
103115
104- // Rating_del1 = Delete middle row
105- print "Create Rating_del1"
106- const table Rating_del1 = Rating
107- Rating_del1.add_index("id")
108- Rating_del1[middle..(middle+1)] delete [true]
109- assert count(Rating) == count(Rating_del1) + 1
110-
111- // Rating_update1 = Update middle row
112- print "Create Rating_update1"
113- const table Rating_update1 = Rating
114- Rating_update1.add_index("id")
115- Rating_update1[middle..(middle+1)] update {timestamp: timestamp + 1}
116-
117- // Rating_<>p = Percentile of rows changed
118- print "Create percentile tables"
119- const table Rating_update001p = Rating
120- const table Rating_update1p = Rating
121- const table Rating_del1p = Rating
122- const table Rating_update50p = Rating
123-
124- Rating_update001p.add_index("id")
125- Rating_update1p.add_index("id")
126- Rating_del1p.add_index("id")
127- Rating_update50p.add_index("id")
116+ print "Create tables"
117+ const table rating_del1 = rating
118+ const table rating_update1 = rating
119+ const table rating_update001p = rating
120+ const table rating_update1p = rating
121+ const table rating_del1p = rating
122+ const table rating_update50p = rating
123+
124+ print "Create indexes"
125+ if (db_type != "redshift" and db_type != "snowflake") { // skip index creation on redshift and snowflake; the previous `or` was true for every db_type, so nothing was ever skipped
126+ rating_del1.add_index("id", true)
127+ rating_del1.add_index("timestamp")
128+ run_sql("CREATE INDEX index_rating_del1_id_timestamp ON rating_del1 (id, timestamp)")
129+ rating_update1.add_index("id", true)
130+ rating_update1.add_index("timestamp")
131+ run_sql("CREATE INDEX index_rating_update1_id_timestamp ON rating_update1 (id, timestamp)")
132+ rating_update001p.add_index("id", true)
133+ rating_update001p.add_index("timestamp")
134+ run_sql("CREATE INDEX index_rating_update001p_id_timestamp ON rating_update001p (id, timestamp)")
135+ rating_update1p.add_index("id", true)
136+ rating_update1p.add_index("timestamp")
137+ run_sql("CREATE INDEX index_rating_update1p_id_timestamp ON rating_update1p (id, timestamp)")
138+ rating_del1p.add_index("id", true)
139+ rating_del1p.add_index("timestamp")
140+ run_sql("CREATE INDEX index_rating_del1p_id_timestamp ON rating_del1p (id, timestamp)")
141+ rating_update50p.add_index("id", true)
142+ rating_update50p.add_index("timestamp")
143+ run_sql("CREATE INDEX index_rating_update50p_id_timestamp ON rating_update50p (id, timestamp)")
144+ commit()
145+ }
146+
147+ print "Alter tables"
148+ rating_del1[middle..(middle+1)] delete [true]
149+ assert count(rating) == count(rating_del1) + 1
150+ rating_update1[middle..(middle+1)] update {timestamp: timestamp + 1}
128151
129152if (db_type == "postgres" or db_type == "redshift") {
130- run_sql('UPDATE "Rating_update001p" SET " timestamp" = (" timestamp" + 1) WHERE random() < 0.0001')
131- run_sql('UPDATE "Rating_update1p" SET " timestamp" = (" timestamp" + 1) WHERE random() < 0.01')
132- run_sql('DELETE FROM "Rating_del1p" WHERE random() < 0.01')
133- run_sql('UPDATE "Rating_update50p" SET " timestamp" = (" timestamp" + 1) WHERE random() < 0.5')
134- } else if (db_type == "mysql") {
135- run_sql('UPDATE Rating_update001p SET timestamp = (timestamp + 1) WHERE rand() < 0.0001')
136- run_sql('UPDATE Rating_update1p SET timestamp = (timestamp + 1) WHERE rand() < 0.01')
137- run_sql('DELETE FROM Rating_del1p WHERE rand() < 0.01')
138- run_sql('UPDATE Rating_update50p SET timestamp = (timestamp + 1) WHERE rand() < 0.5')
153+ run_sql('UPDATE rating_update001p SET timestamp = (timestamp + 1) WHERE random() < 0.0001')
154+ run_sql('UPDATE rating_update1p SET timestamp = (timestamp + 1) WHERE random() < 0.01')
155+ run_sql('DELETE FROM rating_del1p WHERE random() < 0.01')
156+ run_sql('UPDATE rating_update50p SET timestamp = (timestamp + 1) WHERE random() < 0.5')
157+ } else if (db_type == "mysql" or db_type == "mssql" ) { // NOTE(review): T-SQL RAND() without a seed appears to be evaluated once per statement, so on mssql these filters may match all rows or none - confirm and consider a per-row source such as NEWID()
158+ run_sql('UPDATE rating_update001p SET timestamp = (timestamp + 1) WHERE rand() < 0.0001')
159+ run_sql('UPDATE rating_update1p SET timestamp = (timestamp + 1) WHERE rand() < 0.01')
160+ run_sql('DELETE FROM rating_del1p WHERE rand() < 0.01')
161+ run_sql('UPDATE rating_update50p SET timestamp = (timestamp + 1) WHERE rand() < 0.5')
139162} else if (db_type == "snowflake") {
140- run_sql('UPDATE Rating_update001p SET timestamp = (timestamp + 1) WHERE uniform(0::float, 1, random()) < 0.0001')
141- run_sql('UPDATE Rating_update1p SET timestamp = (timestamp + 1) WHERE uniform(0::float, 1, random()) < 0.01')
142- run_sql('DELETE FROM Rating_del1p WHERE uniform(0::float, 1, random()) < 0.01')
143- run_sql('UPDATE Rating_update50p SET timestamp = (timestamp + 1) WHERE uniform(0::float, 1, random()) < 0.5')
163+ run_sql('UPDATE rating_update001p SET timestamp = (timestamp + 1) WHERE uniform(0::float, 1, random()) < 0.0001')
164+ run_sql('UPDATE rating_update1p SET timestamp = (timestamp + 1) WHERE uniform(0::float, 1, random()) < 0.01')
165+ run_sql('DELETE FROM rating_del1p WHERE uniform(0::float, 1, random()) < 0.01')
166+ run_sql('UPDATE rating_update50p SET timestamp = (timestamp + 1) WHERE uniform(0::float, 1, random()) < 0.5')
144167} else {
145168 print "Unsupported database: " + db_type
146169}
147170
148- commit()
171+ commit()
0 commit comments