@@ -6,34 +6,18 @@ func run_sql(code) {
66
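77// Cleanup: drop the rating tables left over from a previous run (DROP TABLE IF EXISTS, so missing tables are not an error)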
88func cleanup() {
9- run_sql("DROP TABLE IF EXISTS Rating ")
9+ run_sql("DROP TABLE IF EXISTS rating ")
1010 run_sql("DROP TABLE IF EXISTS tmp_rating")
11- run_sql("DROP TABLE IF EXISTS Rating_del1 ")
12- run_sql("DROP TABLE IF EXISTS Rating_update1 ")
13- run_sql("DROP TABLE IF EXISTS Rating_update001p ")
14- run_sql("DROP TABLE IF EXISTS Rating_update1p ")
15- run_sql("DROP TABLE IF EXISTS Rating_del1p ")
16- run_sql("DROP TABLE IF EXISTS Rating_update50p ")
11+ run_sql("DROP TABLE IF EXISTS rating_del1 ")
12+ run_sql("DROP TABLE IF EXISTS rating_update1 ")
13+ run_sql("DROP TABLE IF EXISTS rating_update001p ")
14+ run_sql("DROP TABLE IF EXISTS rating_update1p ")
15+ run_sql("DROP TABLE IF EXISTS rating_del1p ")
16+ run_sql("DROP TABLE IF EXISTS rating_update50p ")
1717 commit()
1818}
1919
20- func cleanup_double_quote() {
21- run_sql("DROP TABLE IF EXISTS \"Rating\"")
22- run_sql("DROP TABLE IF EXISTS \"tmp_rating\"")
23- run_sql("DROP TABLE IF EXISTS \"Rating_del1\"")
24- run_sql("DROP TABLE IF EXISTS \"Rating_update1\"")
25- run_sql("DROP TABLE IF EXISTS \"Rating_update001p\"")
26- run_sql("DROP TABLE IF EXISTS \"Rating_update1p\"")
27- run_sql("DROP TABLE IF EXISTS \"Rating_del1p\"")
28- run_sql("DROP TABLE IF EXISTS \"Rating_update50p\"")
29- commit()
30- }
31-
32- if (db_type == "postgres" or db_type == "redshift") {
33- cleanup_double_quote()
34- } else {
35- cleanup()
36- }
20+ cleanup()
3721
3822// Import CSV
3923if (db_type == "snowflake") {
@@ -46,17 +30,17 @@ if (db_type == "snowflake") {
4630
4731 table tmp_rating {
4832 id: int
49- movieId : int
33+ movieid : int
5034 rating: float
5135 timestamp: int
5236 }
5337
5438 run_sql("COPY INTO tmp_rating FROM '@~/ratings.csv.gz' file_format=(skip_header=1)")
5539
56- table Rating {
40+ table rating {
5741 id: int
58- userId : int
59- movieId : int
42+ userid : int
43+ movieid : int
6044 rating: float
6145 timestamp: int
6246 }
@@ -85,7 +69,7 @@ if (db_type == "snowflake") {
8569 IGNOREHEADER 1;
8670 """)
8771
88- table Rating {
72+ table rating {
8973 id: int // explicit id, to avoid identity type
9074 userid: int
9175 movieid: int
@@ -97,69 +81,91 @@ if (db_type == "snowflake") {
9781 INSERT INTO rating(id, userid, movieid, rating, timestamp)
9882 SELECT row_number() over (order by userid, movieid, timestamp) AS id, userid, movieid, rating, timestamp FROM tmp_rating
9983 """)
84+ } else if (db_type == "mssql") {
85+ run_sql("drop table if exists tmp_rating")
86+ run_sql("create table tmp_rating(userid int, movieid int, rating float, timestamp int)")
87+ table tmp_rating {...}
88+ print "Loading ratings CSV"
89+ run_sql("BULK INSERT tmp_rating from 'ml-25m/ratings.csv' with (fieldterminator = ',', rowterminator = '0x0a', FIRSTROW = 2);")
90+ print "Populating actual table"
91+ rating += tmp_rating
92+ commit()
10093} else {
10194 print "Importing ratings CSV"
10295
103- table Rating {
104- userId : int
105- movieId : int
96+ table rating {
97+ userid : int
98+ movieid : int
10699 rating: float
107100 timestamp: int
108101 }
109- import_csv(Rating, 'ml-25m/ratings.csv', true)
110- Rating.add_index("id")
102+ import_csv(rating, 'ml-25m/ratings.csv', true)
103+ rating.add_index("id", true)
104+ rating.add_index("timestamp")
105+ run_sql("CREATE INDEX index_rating_id_timestamp ON rating (id, timestamp)")
111106}
112107
113108run_sql("DROP TABLE IF EXISTS tmp_rating")
114109commit()
115110
116- middle = count(Rating ) /~ 2
111+ middle = count(rating ) /~ 2
117112
118113// Code notes:
119114// - We use 'const table' to avoid updating the ids
120115
121- // Rating_del1 = Delete middle row
122- print "Create Rating_del1"
123- const table Rating_del1 = Rating
124- Rating_del1.add_index("id")
125- Rating_del1[middle..(middle+1)] delete [true]
126- assert count(Rating) == count(Rating_del1) + 1
127-
128- // Rating_update1 = Update middle row
129- print "Create Rating_update1"
130- const table Rating_update1 = Rating
131- Rating_update1.add_index("id")
132- Rating_update1[middle..(middle+1)] update {timestamp: timestamp + 1}
133-
134- // Rating_<>p = Percentile of rows changed
135- print "Create percentile tables"
136- const table Rating_update001p = Rating
137- const table Rating_update1p = Rating
138- const table Rating_del1p = Rating
139- const table Rating_update50p = Rating
140-
141- Rating_update001p.add_index("id")
142- Rating_update1p.add_index("id")
143- Rating_del1p.add_index("id")
144- Rating_update50p.add_index("id")
116+ print "Create tables"
117+ const table rating_del1 = rating
118+ const table rating_update1 = rating
119+ const table rating_update001p = rating
120+ const table rating_update1p = rating
121+ const table rating_del1p = rating
122+ const table rating_update50p = rating
123+
124+ print "Create indexes"
125+ if (db_type != "redshift" and db_type != "snowflake") {  // index creation is skipped on Redshift and Snowflake; both conditions must hold, hence 'and'
126+ rating_del1.add_index("id", true)
127+ rating_del1.add_index("timestamp")
128+ run_sql("CREATE INDEX index_rating_del1_id_timestamp ON rating_del1 (id, timestamp)")
129+ rating_update1.add_index("id", true)
130+ rating_update1.add_index("timestamp")
131+ run_sql("CREATE INDEX index_rating_update1_id_timestamp ON rating_update1 (id, timestamp)")
132+ rating_update001p.add_index("id", true)
133+ rating_update001p.add_index("timestamp")
134+ run_sql("CREATE INDEX index_rating_update001p_id_timestamp ON rating_update001p (id, timestamp)")
135+ rating_update1p.add_index("id", true)
136+ rating_update1p.add_index("timestamp")
137+ run_sql("CREATE INDEX index_rating_update1p_id_timestamp ON rating_update1p (id, timestamp)")
138+ rating_del1p.add_index("id", true)
139+ rating_del1p.add_index("timestamp")
140+ run_sql("CREATE INDEX index_rating_del1p_id_timestamp ON rating_del1p (id, timestamp)")
141+ rating_update50p.add_index("id", true)
142+ rating_update50p.add_index("timestamp")
143+ run_sql("CREATE INDEX index_rating_update50p_id_timestamp ON rating_update50p (id, timestamp)")
144+ commit()
145+ }
146+
147+ print "Alter tables"
148+ rating_del1[middle..(middle+1)] delete [true]
149+ assert count(rating) == count(rating_del1) + 1
150+ rating_update1[middle..(middle+1)] update {timestamp: timestamp + 1}
145151
146152if (db_type == "postgres" or db_type == "redshift") {
147- run_sql('UPDATE "Rating_update001p" SET " timestamp" = (" timestamp" + 1) WHERE random() < 0.0001')
148- run_sql('UPDATE "Rating_update1p" SET " timestamp" = (" timestamp" + 1) WHERE random() < 0.01')
149- run_sql('DELETE FROM "Rating_del1p" WHERE random() < 0.01')
150- run_sql('UPDATE "Rating_update50p" SET " timestamp" = (" timestamp" + 1) WHERE random() < 0.5')
151- } else if (db_type == "mysql") {
152- run_sql('UPDATE Rating_update001p SET timestamp = (timestamp + 1) WHERE rand() < 0.0001')
153- run_sql('UPDATE Rating_update1p SET timestamp = (timestamp + 1) WHERE rand() < 0.01')
154- run_sql('DELETE FROM Rating_del1p WHERE rand() < 0.01')
155- run_sql('UPDATE Rating_update50p SET timestamp = (timestamp + 1) WHERE rand() < 0.5')
153+ run_sql('UPDATE rating_update001p SET timestamp = (timestamp + 1) WHERE random() < 0.0001')
154+ run_sql('UPDATE rating_update1p SET timestamp = (timestamp + 1) WHERE random() < 0.01')
155+ run_sql('DELETE FROM rating_del1p WHERE random() < 0.01')
156+ run_sql('UPDATE rating_update50p SET timestamp = (timestamp + 1) WHERE random() < 0.5')
157+ } else if (db_type == "mysql") {
158+ run_sql('UPDATE rating_update001p SET timestamp = (timestamp + 1) WHERE rand() < 0.0001')
159+ run_sql('UPDATE rating_update1p SET timestamp = (timestamp + 1) WHERE rand() < 0.01')
160+ run_sql('DELETE FROM rating_del1p WHERE rand() < 0.01')
161+ run_sql('UPDATE rating_update50p SET timestamp = (timestamp + 1) WHERE rand() < 0.5')
+ } else if (db_type == "mssql") {
+ // SQL Server evaluates an unseeded rand() once per statement, so 'WHERE rand() < p' would hit all rows or none.
+ // Seeding with checksum(newid()) gives each row its own value, so roughly a fraction p of the rows is affected.
+ run_sql('UPDATE rating_update001p SET timestamp = (timestamp + 1) WHERE rand(checksum(newid())) < 0.0001')
+ run_sql('UPDATE rating_update1p SET timestamp = (timestamp + 1) WHERE rand(checksum(newid())) < 0.01')
+ run_sql('DELETE FROM rating_del1p WHERE rand(checksum(newid())) < 0.01')
+ run_sql('UPDATE rating_update50p SET timestamp = (timestamp + 1) WHERE rand(checksum(newid())) < 0.5')
156162} else if (db_type == "snowflake") {
157- run_sql('UPDATE Rating_update001p SET timestamp = (timestamp + 1) WHERE uniform(0::float, 1, random()) < 0.0001')
158- run_sql('UPDATE Rating_update1p SET timestamp = (timestamp + 1) WHERE uniform(0::float, 1, random()) < 0.01')
159- run_sql('DELETE FROM Rating_del1p WHERE uniform(0::float, 1, random()) < 0.01')
160- run_sql('UPDATE Rating_update50p SET timestamp = (timestamp + 1) WHERE uniform(0::float, 1, random()) < 0.5')
163+ run_sql('UPDATE rating_update001p SET timestamp = (timestamp + 1) WHERE uniform(0::float, 1, random()) < 0.0001')
164+ run_sql('UPDATE rating_update1p SET timestamp = (timestamp + 1) WHERE uniform(0::float, 1, random()) < 0.01')
165+ run_sql('DELETE FROM rating_del1p WHERE uniform(0::float, 1, random()) < 0.01')
166+ run_sql('UPDATE rating_update50p SET timestamp = (timestamp + 1) WHERE uniform(0::float, 1, random()) < 0.5')
161167} else {
162168 print "Unsupported database: " + db_type
163169}
164170
165- commit()
171+ commit()
0 commit comments