Skip to content

Commit b52f1f9

Browse files
authored
Merge pull request #5 from Wikia/football-metadata
Extract even more data from football.wikia.com
2 parents a7e55b9 + 15166a1 commit b52f1f9

26 files changed

+575
-9424
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,3 +109,6 @@ redisgraph/data/
109109
# graphs
110110
*.png
111111
*.svg
112+
113+
# cache
114+
grapher/grapher/scripts/.cache/*.json

Makefile

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# Open an interactive redis-cli session on port 56379, i.e. the port that
# docker-compose.yml publishes on the local interface for the database container.
# Declared phony because the target does not produce a file called "redis".
.PHONY: redis
redis:
	redis-cli -p 56379

docker-compose.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
version: '2'
22
services:
3-
sphinx:
3+
db:
44
image: redislabs/redisgraph:1.0.8
55
ports:
66
- "127.0.0.1:56379:6379" # bind to local interface only!

grapher/.pylintrc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
[MESSAGES CONTROL]
2-
disable=fixme,too-few-public-methods,useless-super-delegation,useless-object-inheritance
2+
disable=fixme,pointless-string-statement,too-few-public-methods,useless-super-delegation,useless-object-inheritance,wrong-import-order

grapher/grapher/graph/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
"""
2+
Graph storage abstraction layer
3+
"""
4+
from .redis import RedisGraph

grapher/grapher/graph/base.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
"""
2+
Base class
3+
"""
4+
import logging
5+
6+
7+
class BaseGraph(object):
    """
    A container of model objects (the nodes) and the relations (the edges)
    that they declare between each other.

    This class only collects the models, saving them is left to subclasses.
    """
    def __init__(self):
        # each graph class logs under its own class name
        self.logger = logging.getLogger(type(self).__name__)
        # models are kept in the order they were added
        self.models = []

    def add(self, model):
        """
        Register a model as a part of this graph

        :type model grapher.models.BaseModel
        """
        self.models.append(model)

    def store(self, graph_name):
        """
        Save the graph under a given name, needs to be provided by a subclass

        :type graph_name str
        :raises NotImplementedError: always, when not overridden
        """
        raise NotImplementedError('store methods needs to be implemented')

grapher/grapher/graph/redis.py

Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
"""
2+
Redis storage
3+
"""
4+
import redis
5+
from redisgraph import Node, Edge, Graph
6+
from .base import BaseGraph
7+
8+
9+
class RedisGraph(BaseGraph):
    """
    Handles storing a collection of models in RedisGraph

    Models collected via add() are converted to redisgraph Node and Edge objects.
    The resulting graph can be either dumped as a redis-cli command (dump)
    or committed to a Redis server (store).
    """
    def __init__(self, host, port=6379):
        """
        :type host str
        :type port int
        """
        super(RedisGraph, self).__init__()

        self.host = host
        self.port = port

        self.logger.info('Using redis: %s:%d', host, port)

    @staticmethod
    def encode_properties(properties):
        """
        Return a copy of properties with double quotes escaped in string values.
        Values that are not strings are passed through untouched.

        :type properties dict
        :rtype: dict
        """
        # redisgraph library does not encode quotes
        # (John_Faxe_Jensen:Person{name:\"John \"Faxe\" Jensen\",birthDate:1965,height:1.78})
        return {
            key: value.replace('"', '\\"') if isinstance(value, str) else value
            for key, value in properties.items()
        }

    @classmethod
    def model_to_node(cls, model):
        """
        Build a graph node for a given model. The node is aliased with the model's
        node name and carries the model's name and all its properties.

        :type model grapher.models.BaseModel
        :rtype: Node
        """
        # "name" is always set, model properties can override it
        properties = dict(name=model.get_name())
        properties.update(model.properties)

        return Node(
            alias=model.get_node_name(),
            properties=cls.encode_properties(properties),
        )

    @classmethod
    def model_to_edges(cls, model):
        """
        Yield an edge for each relation of a given model (this is a generator).

        :type model grapher.models.BaseModel
        :rtype: collections.Iterable[Edge]
        """
        for (relation, target, properties) in model.get_all_relations():
            # Edge(john, 'visited', japan, properties={'purpose': 'pleasure'})
            yield Edge(
                src_node=Node(alias=model.get_node_name()),
                relation=relation,
                dest_node=Node(alias=target),
                properties=cls.encode_properties(properties) if properties else None
            )

    def _get_graph(self, graph_name):
        """
        Build a redisgraph Graph object with nodes and edges of all collected models.
        Nothing is sent to Redis here, see store().

        :type graph_name str
        :rtype: Graph
        """
        # https://github.com/RedisLabs/redisgraph-py#example-using-the-python-client
        redis_graph = Graph(
            name=graph_name,
            redis_con=redis.Redis(self.host, self.port)
        )

        # add all nodes
        for model in self.models:
            redis_graph.add_node(self.model_to_node(model))

        # and now add edges
        for model in self.models:
            for edge in self.model_to_edges(model):
                try:
                    # add target node if needed
                    # we may want to refer to a node that was not indexed above
                    # e.g. English player in a Spanish club
                    if edge.dest_node.alias not in redis_graph.nodes:
                        node = Node(
                            alias=edge.dest_node.alias,
                            # the alias is "Name:Type", keep the name part only
                            properties={'name': str(edge.dest_node.alias).split(':')[0]}
                        )
                        redis_graph.add_node(node)
                        self.logger.info('Adding missing node: %s', edge.dest_node.alias)

                    redis_graph.add_edge(edge)
                except KeyError:
                    # graph can be not complete, some nodes can be missing despite the relation
                    # report the offending model via the logger instead of printing to stdout
                    self.logger.error('add_edge failed for %r', model, exc_info=True)

        return redis_graph

    def dump(self, graph_name):
        """
        Return a redisgraph command that would create a graph

        :type graph_name str
        :rtype: str
        :raises ValueError: when there are no nodes and no edges to dump
        """
        redis_graph = self._get_graph(graph_name)

        # https://oss.redislabs.com/redisgraph/#with-redis-cli
        # based on redisgraph/client.py (commit function)
        items = [str(node) for node in redis_graph.nodes.values()]
        items.extend(str(edge) for edge in redis_graph.edges)

        if not items:
            # an empty "CREATE" clause is not a valid query
            raise ValueError('Cannot dump an empty graph: {}'.format(graph_name))

        # joining the items leaves no trailing comma to discard
        query = ','.join(items)

        # The query is wrapped in a double-quoted redis-cli argument below.
        # Backslashes need to be escaped first, otherwise quotes that were already
        # escaped by encode_properties (\") would end up as \\" and terminate
        # the argument too early.
        query = query.replace('\\', '\\\\').replace('"', '\\"')

        return 'GRAPH.QUERY {name} "CREATE {graph}"'.format(name=graph_name, graph=query)

    def store(self, graph_name):
        """
        Store the graph in Redis. A graph that is already stored under
        the same name is deleted first.

        :type graph_name str
        """
        redis_graph = self._get_graph(graph_name)

        # and save it
        self.logger.info('Committing graph with %d nodes and %s edges',
                         len(redis_graph.nodes), len(redis_graph.edges))

        try:
            redis_graph.delete()
        except redis.exceptions.ResponseError as ex:
            # Graph was not found in database.
            self.logger.info(ex)

        redis_graph.commit()

        # ask Redis to save the dataset (the command name is sent as-is)
        redis_graph.redis_con.execute_command('SAvE')
        self.logger.info('Committed and saved')

grapher/grapher/models/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,4 @@
22
Schema.org representations
33
"""
44
from .base import BaseModel
5-
from .person import PersonModel
5+
from .football import PersonModel, SportsTeamModel

grapher/grapher/models/base.py

Lines changed: 35 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
Base model
33
"""
44
import json
5+
import re
56
from collections import OrderedDict
67

78

@@ -10,9 +11,11 @@ class BaseModel(object):
1011
Base schema.org model for keeping metadata
1112
"""
1213
def __init__(self, model_type, name):
14+
assert name is not None, 'name of a model cannot be None'
15+
1316
self.type = model_type
1417
self.name = name
15-
self.properties = OrderedDict()
18+
self.properties = OrderedDict(name=name)
1619
self.relations = list()
1720

1821
def get_type(self):
@@ -27,11 +30,36 @@ def get_name(self):
2730
"""
2831
return self.name
2932

30-
def get_node_name(self):
33+
@staticmethod
34+
def encode_name(name):
3135
"""
36+
:type name str
3237
:rtype: str
3338
"""
34-
return '{}:{}'.format(self.get_type(), self.get_name())
39+
# remove UTF characters
40+
name = name.encode('ascii', 'ignore').decode('ascii')
41+
42+
# Must begin with an alphabetic letter
43+
# Can contain numbers, but not as the first character
44+
# Cannot contain symbols (an exception to this rule is using underscore)
45+
#
46+
# https://neo4j.com/docs/cypher-manual/current/syntax/naming/
47+
name = re.sub(r'^\d+', '', name) # remove digits from the beginning of the string
48+
return re.sub(r'[^a-z0-9]+', '_', name, flags=re.IGNORECASE).strip('_')
49+
50+
def get_node_name(self):
    """
    Build the node name of this model that is used in Cypher queries.
    It is made of the encoded name and the encoded type, e.g. "Foo:Type".

    :rtype: str|None
    :return: None when the model has no name
    """
    name = self.get_name()

    if name is None:
        return None

    node = self.encode_name(name)
    label = self.encode_name(self.get_type())

    return '%s:%s' % (node, label)
3563

3664
def add_property(self, key, value):
3765
"""
@@ -54,6 +82,10 @@ def add_relation(self, relation, target, properties=None):
5482
:type target str
5583
:type properties dict
5684
"""
85+
# remove None values from properties
86+
if properties:
87+
properties = {k: v for k, v in properties.items() if v is not None}
88+
5789
self.relations.append((relation, target, properties))
5890

5991
def get_relation_targets(self, relation_type):

grapher/grapher/models/football.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
"""
2+
Football (and other sports) related models
3+
"""
4+
from . import BaseModel
5+
6+
7+
class PersonModel(BaseModel):
    """
    Model of a person, its type is "Person"

    https://schema.org/Person
    """
    def __init__(self, name):
        """
        :type name str
        """
        super(PersonModel, self).__init__('Person', name)
14+
15+
16+
class SportsTeamModel(BaseModel):
    """
    Model of a sports team (e.g. a football club), its type is "SportsTeam"

    https://schema.org/SportsTeam
    """
    def __init__(self, name):
        """
        :type name str
        """
        super(SportsTeamModel, self).__init__('SportsTeam', name)

0 commit comments

Comments
 (0)