Skip to content

Commit 5cc0bd9

Browse files
authored
Merge pull request #1 from avojak/deduplicate-games
Deduplicate games
2 parents 57b49e3 + f2c4596 commit 5cc0bd9

File tree

7 files changed

+115
-30
lines changed

7 files changed

+115
-30
lines changed

README.md

Lines changed: 28 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
<img src="https://img.shields.io/badge/size-37%20MB-blue"></img>
2-
<img src="https://img.shields.io/badge/compressed%20size-13%20MB-blue"></img>
1+
<img src="https://img.shields.io/badge/size-35%20MB-blue"></img>
2+
<img src="https://img.shields.io/badge/compressed%20size-14%20MB-blue"></img>
33

44
# SQLite Libretro DB
55

@@ -10,7 +10,8 @@ the same content* from the Libretro RetroArch database in a single SQLite databa
1010

1111
***Important note:*** The conversion tool here also does some basic deconfliction when there are multiple records for the same ROM MD5 checksum.
1212
The underlying assumption is that if two ROMs have the same checksum, they're the same, and the metadata should be merged in favor of non-null
13-
values. The primary use-case is for client applications to be able to query the database by MD5 checksum of a ROM file, so keep in mind that this mindset informed the database schema and how the utility decides which data is duplicated.
13+
values. The primary use-case is for client applications to be able to query the database by MD5 checksum of a ROM file, so keep in mind that
14+
this mindset informed the database schema and how the utility decides which data is duplicated.
1415

1516
## Usage
1617

@@ -37,8 +38,10 @@ build/libretrodb.sqlite.tgz
3738
| ------ | --------- |
3839
| id | INTEGER PRIMARY KEY |
3940
| serial_id | TEXT |
40-
| rom_id | INTEGER |
4141
| developer_id | INTEGER |
42+
| publisher_id | INTEGER |
43+
| rating_id | INTEGER |
44+
| users | INTEGER |
4245
| franchise_id | INTEGER |
4346
| release_year | INTEGER |
4447
| release_month | INTEGER |
@@ -54,6 +57,7 @@ build/libretrodb.sqlite.tgz
5457
| Column | Data Type |
5558
| ------ | --------- |
5659
| id | INTEGER PRIMARY KEY |
60+
| serial_id | TEXT |
5761
| name | TEXT |
5862
| md5 | TEXT |
5963

@@ -64,6 +68,20 @@ build/libretrodb.sqlite.tgz
6468
| id | INTEGER PRIMARY KEY |
6569
| name | TEXT |
6670

71+
### `publishers`
72+
73+
| Column | Data Type |
74+
| ------ | --------- |
75+
| id | INTEGER PRIMARY KEY |
76+
| name | TEXT |
77+
78+
### `ratings`
79+
80+
| Column | Data Type |
81+
| ------ | --------- |
82+
| id | INTEGER PRIMARY KEY |
83+
| name | TEXT |
84+
6785
### `franchises`
6886

6987
| Column | Data Type |
@@ -108,7 +126,10 @@ SELECT games.serial_id,
108126
games.release_year,
109127
games.release_month,
110128
games.display_name,
129+
games.users,
111130
developers.name as developer_name,
131+
publishers.name as publisher_name,
132+
ratings.name as rating_name,
112133
franchises.name as franchise_name,
113134
regions.name as region_name,
114135
genres.name as genre_name,
@@ -119,11 +140,13 @@ SELECT games.serial_id,
119140
FROM games
120141
LEFT JOIN developers ON games.developer_id = developers.id
121142
LEFT JOIN franchises ON games.franchise_id = franchises.id
143+
LEFT JOIN publishers ON games.publisher_id = publishers.id
144+
LEFT JOIN ratings ON games.rating_id = ratings.id
122145
LEFT JOIN genres ON games.genre_id = genres.id
123146
LEFT JOIN platforms ON games.platform_id = platforms.id
124147
LEFT JOIN manufacturers ON platforms.manufacturer_id = manufacturers.id
125148
LEFT JOIN regions ON games.region_id = regions.id
126-
INNER JOIN roms ON games.rom_id = roms.id
149+
INNER JOIN roms ON games.serial_id = roms.serial_id
127150
WHERE roms.md5 = '27F322F5CD535297AB21BC4A41CBFC12';
128151
```
129152

main.py

Lines changed: 68 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -34,20 +34,23 @@ def __init__(self, name, manufacturer_id):
3434

3535
class ROM:
3636

37-
def __init__(self, name, md5):
37+
def __init__(self, serial, name, md5):
3838
self.id = None
39+
self.serial = serial
3940
self.name = name
4041
self.md5 = md5
4142

4243
class Game:
4344

44-
def __init__(self, display_name, full_name, serial, rom, developer_id, franchise_id, release_year, release_month, region_id, genre_id, platform_id):
45+
def __init__(self, display_name, full_name, serial, developer_id, publisher_id, rating_id, users, franchise_id, release_year, release_month, region_id, genre_id, platform_id):
4546
self.id = None
4647
self.display_name = display_name
4748
self.full_name = full_name
4849
self.serial = serial
49-
self.rom = rom
5050
self.developer_id = developer_id
51+
self.publisher_id = publisher_id
52+
self.rating_id = rating_id
53+
self.users = users
5154
self.franchise_id = franchise_id
5255
self.release_year = release_year
5356
self.release_month = release_month
@@ -59,7 +62,6 @@ def __init__(self, display_name, full_name, serial, rom, developer_id, franchise
5962
Merge other game into self by deferring to non-null fields.
6063
"""
6164
def join(self, other):
62-
# Join the top-level fields
6365
if self.display_name is None and other.display_name is not None:
6466
self.display_name = other.display_name
6567
if self.full_name is None and other.full_name is not None:
@@ -68,6 +70,12 @@ def join(self, other):
6870
self.serial = other.serial
6971
if self.developer_id is None and other.developer_id is not None:
7072
self.developer_id = other.developer_id
73+
if self.publisher_id is None and other.publisher_id is not None:
74+
self.publisher_id = other.publisher_id
75+
if self.rating_id is None and other.rating_id is not None:
76+
self.rating_id = other.rating_id
77+
if self.users is None and other.users is not None:
78+
self.users = other.users
7179
if self.franchise_id is None and other.franchise_id is not None:
7280
self.franchise_id = other.franchise_id
7381
if self.release_year is None and other.release_year is not None:
@@ -80,13 +88,6 @@ def join(self, other):
8088
self.genre_id = other.genre_id
8189
if self.platform_id is None and other.platform_id is not None:
8290
self.platform_id = other.platform_id
83-
# Join the ROM
84-
if self.rom is None and other.rom is not None:
85-
self.rom = other.rom
86-
if self.rom.name is None and other.rom.name is not None:
87-
self.rom.name = other.rom.name
88-
if self.rom.md5 is None and other.rom.md5 is not None:
89-
self.rom.md5 = other.rom.md5
9091

9192
class Converter:
9293

@@ -97,12 +98,15 @@ def __init__(self, rdb_dir, output_file, libretrodb_tool):
9798
self.libretrodb_tool = self._validate_libretrodb_tool(libretrodb_tool)
9899
# Create storage for parsed data prior to insertion in the database
99100
self.developers = dict()
101+
self.publishers = dict()
102+
self.ratings = dict()
100103
self.franchises = dict()
101104
self.regions = dict()
102105
self.genres = dict()
103106
self.platforms = dict()
104107
self.manufacturers = dict()
105108
self.games = dict()
109+
self.roms = dict()
106110

107111
"""
108112
Ensure that the provided directory of .rdb files exists.
@@ -174,12 +178,15 @@ def run(self):
174178

175179
# Insert data into the database
176180
self._insert_developers(cursor)
181+
self._insert_publishers(cursor)
182+
self._insert_ratings(cursor)
177183
self._insert_franchises(cursor)
178184
self._insert_genres(cursor)
179185
self._insert_manufacturers(cursor)
180186
self._insert_platforms(cursor)
181187
self._insert_regions(cursor)
182188
self._insert_games(cursor)
189+
self._insert_roms(cursor)
183190

184191
# Commit changes to the database
185192
connection.commit()
@@ -234,6 +241,9 @@ def _parse_line(self, json_str, platform_id):
234241
serial = self._get_json_value(json_obj, 'serial')
235242
md5 = self._get_json_value(json_obj, 'md5')
236243
developer = self._get_json_value(json_obj, 'developer')
244+
publisher = self._get_json_value(json_obj, 'publisher')
245+
rating = self._get_json_value(json_obj, 'esrb_rating')
246+
users = self._get_json_value(json_obj, 'users')
237247
franchise = self._get_json_value(json_obj, 'franchise')
238248
release_year = self._get_json_value(json_obj, 'releaseyear')
239249
release_month = self._get_json_value(json_obj, 'releasemonth')
@@ -253,6 +263,10 @@ def _parse_line(self, json_str, platform_id):
253263
# Save potentially common references to developers, franchises, regions and genres, and assign an ID
254264
if developer is not None and developer not in self.developers:
255265
self.developers[developer] = len(self.developers) + 1
266+
if publisher is not None and publisher not in self.publishers:
267+
self.publishers[publisher] = len(self.publishers) + 1
268+
if rating is not None and rating not in self.ratings:
269+
self.ratings[rating] = len(self.ratings) + 1
256270
if franchise is not None and franchise not in self.franchises:
257271
self.franchises[franchise] = len(self.franchises) + 1
258272
if region is not None and region not in self.regions:
@@ -261,20 +275,25 @@ def _parse_line(self, json_str, platform_id):
261275
self.genres[genre] = len(self.genres) + 1
262276

263277
developer_id = self.developers[developer] if developer is not None else None
278+
publisher_id = self.publishers[publisher] if publisher is not None else None
279+
rating_id = self.ratings[rating] if rating is not None else None
264280
franchise_id = self.franchises[franchise] if franchise is not None else None
265281
region_id = self.regions[region] if region is not None else None
266282
genre_id = self.genres[genre] if genre is not None else None
267283

268284
# Build the ROM and Game objects. Every record yields its own ROM, while games are deduplicated by serial, so several ROMs may map to one game.
269-
rom = ROM(rom_name, md5)
270-
game = Game(display_name, full_name, serial, rom, developer_id, franchise_id, release_year, release_month, region_id, genre_id, platform_id)
271-
if md5 in self.games:
272-
self.games[md5].join(game)
285+
rom = ROM(serial, rom_name, md5)
286+
rom_id = len(self.roms) + 1
287+
rom.id = rom_id
288+
self.roms[rom_id] = rom
289+
290+
game = Game(display_name, full_name, serial, developer_id, publisher_id, rating_id, users, franchise_id, release_year, release_month, region_id, genre_id, platform_id)
291+
if serial in self.games:
292+
self.games[serial].join(game)
273293
else:
274294
id = len(self.games) + 1
275-
self.games[md5] = game
276-
self.games[md5].id = id
277-
self.games[md5].rom.id = id
295+
game.id = id
296+
self.games[serial] = game
278297

279298
"""
280299
Insert the manufacturers into the database.
@@ -302,6 +321,24 @@ def _insert_developers(self, cursor):
302321
cursor.execute(self._load_sql("./sql/insert_developer.sql"), (id, name))
303322
self.logger.success("Inserted {} developers into database".format(len(self.developers)))
304323

324+
"""
325+
Insert the publishers into the database.
326+
"""
327+
def _insert_publishers(self, cursor):
328+
for key,value in self.publishers.items():
329+
(id, name) = (value, key)
330+
cursor.execute(self._load_sql("./sql/insert_publisher.sql"), (id, name))
331+
self.logger.success("Inserted {} publishers into database".format(len(self.publishers)))
332+
333+
"""
334+
Insert the ratings into the database.
335+
"""
336+
def _insert_ratings(self, cursor):
337+
for key,value in self.ratings.items():
338+
(id, name) = (value, key)
339+
cursor.execute(self._load_sql("./sql/insert_rating.sql"), (id, name))
340+
self.logger.success("Inserted {} ratings into database".format(len(self.ratings)))
341+
305342
"""
306343
Insert the franchises into the database.
307344
"""
@@ -330,17 +367,18 @@ def _insert_genres(self, cursor):
330367
self.logger.success("Inserted {} genres into database".format(len(self.genres)))
331368

332369
"""
333-
Insert the ROMs and games into the database.
370+
Insert the games into the database.
334371
"""
335372
def _insert_games(self, cursor):
336373
for key,value in self.games.items():
337374
game = value
338-
cursor.execute(self._load_sql("./sql/insert_rom.sql"), (game.rom.id, game.rom.name, game.rom.md5))
339375
cursor.execute(self._load_sql("./sql/insert_game.sql"), (
340376
game.id,
341377
game.serial,
342-
game.rom.id,
343378
game.developer_id,
379+
game.publisher_id,
380+
game.rating_id,
381+
game.users,
344382
game.franchise_id,
345383
game.release_year,
346384
game.release_month,
@@ -351,6 +389,15 @@ def _insert_games(self, cursor):
351389
game.platform_id))
352390
self.logger.success("Inserted {} games into database".format(len(self.games)))
353391

392+
"""
393+
Insert the ROMs into the database.
394+
"""
395+
def _insert_roms(self, cursor):
396+
for key,value in self.roms.items():
397+
rom = value
398+
cursor.execute(self._load_sql("./sql/insert_rom.sql"), (rom.id, rom.serial, rom.name, rom.md5))
399+
self.logger.success("Inserted {} ROMs into database".format(len(self.roms)))
400+
354401
def _get_json_value(self, json_obj, key):
355402
return json_obj[key] if key in json_obj else None
356403

sql/create_tables.sql

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
CREATE TABLE games (
22
id INTEGER PRIMARY KEY,
33
serial_id TEXT,
4-
rom_id INTEGER,
54
developer_id INTEGER,
5+
publisher_id INTEGER,
6+
rating_id INTEGER,
7+
users INTEGER,
68
franchise_id INTEGER,
79
release_year INTEGER,
810
release_month INTEGER,
@@ -15,13 +17,22 @@ CREATE TABLE games (
1517
);
1618
CREATE TABLE roms (
1719
id INTEGER PRIMARY KEY,
20+
serial_id TEXT,
1821
name TEXT,
1922
md5 TEXT
2023
);
2124
CREATE TABLE developers (
2225
id INTEGER PRIMARY KEY,
2326
name TEXT
2427
);
28+
CREATE TABLE publishers (
29+
id INTEGER PRIMARY KEY,
30+
name TEXT
31+
);
32+
CREATE TABLE ratings (
33+
id INTEGER PRIMARY KEY,
34+
name TEXT
35+
);
2536
CREATE TABLE franchises (
2637
id INTEGER PRIMARY KEY,
2738
name TEXT

sql/insert_game.sql

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
INSERT INTO games(
22
id,
33
serial_id,
4-
rom_id,
54
developer_id,
5+
publisher_id,
6+
rating_id,
7+
users,
68
franchise_id,
79
release_year,
810
release_month,
@@ -12,4 +14,4 @@ INSERT INTO games(
1214
full_name,
1315
-- boxart_url,
1416
platform_id
15-
) VALUES(?,?,?,?,?,?,?,?,?,?,?,?)
17+
) VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?)

sql/insert_publisher.sql

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
INSERT INTO publishers(id,name) VALUES(?,?)

sql/insert_rating.sql

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
INSERT INTO ratings(id,name) VALUES(?,?)

sql/insert_rom.sql

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
INSERT INTO roms(id,name,md5) VALUES(?,?,?)
1+
INSERT INTO roms(id,serial_id,name,md5) VALUES(?,?,?,?)

0 commit comments

Comments
 (0)