Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

Commit f10c31d

Browse files
authored
Merge pull request #63 from datafold/docs
Documentation at readthedocs
2 parents 31736f6 + e08726a commit f10c31d

File tree

9 files changed

+408
-22
lines changed

9 files changed

+408
-22
lines changed

data_diff/database.py

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,16 @@ def select_table_schema(self, path: DbPath) -> str:
132132
"Provide SQL for selecting the table schema as (name, type, date_prec, num_prec)"
133133
...
134134

135+
@abstractmethod
136+
def query_table_schema(self, path: DbPath) -> Dict[str, ColType]:
137+
"Query the schema of the table at 'path', and return {column: type}"
138+
...
139+
140+
@abstractmethod
141+
def parse_table_name(self, name: str) -> DbPath:
142+
"Parse the given table name into a DbPath"
143+
...
144+
135145
@abstractmethod
136146
def close(self):
137147
"Close connection(s) to the database instance. Querying will stop functioning."
@@ -157,13 +167,16 @@ class Database(AbstractDatabase):
157167
"""Base abstract class for databases.
158168
159169
Used for providing connection code and implementation specific SQL utilities.
170+
171+
Instantiated using :meth:`~data_diff.connect_to_uri`
160172
"""
161173

162174
DATETIME_TYPES = NotImplemented
163175
default_schema = NotImplemented
164176

165177
def query(self, sql_ast: SqlOrStr, res_type: type):
166-
"Query the given SQL AST, and attempt to convert the result to type 'res_type'"
178+
"Query the given SQL code/AST, and attempt to convert the result to type 'res_type'"
179+
167180
compiler = Compiler(self)
168181
sql_code = compiler.compile(sql_ast)
169182
logger.debug("Running SQL (%s): %s", type(self).__name__, sql_code)
@@ -224,9 +237,9 @@ def query_table_schema(self, path: DbPath) -> Dict[str, ColType]:
224237
# Return a dict of form {name: type} after canonization
225238
return {row[0]: self._parse_type(*row[1:]) for row in rows}
226239

227-
@lru_cache()
228-
def get_table_schema(self, path: DbPath) -> Dict[str, ColType]:
229-
return self.query_table_schema(path)
240+
# @lru_cache()
241+
# def get_table_schema(self, path: DbPath) -> Dict[str, ColType]:
242+
# return self.query_table_schema(path)
230243

231244
def _normalize_table_path(self, path: DbPath) -> DbPath:
232245
if len(path) == 1:
@@ -707,6 +720,12 @@ def connect_to_uri(db_uri: str, thread_count: Optional[int] = 1) -> Database:
707720
thread_count determines the max number of worker threads per database,
708721
if relevant. None means no limit.
709722
723+
Parameters:
724+
db_uri (str): The URI for the database to connect to
725+
thread_count (int, optional): Size of the threadpool. Ignored by cloud databases. (default: 1)
726+
727+
Note: For non-cloud databases, a low thread-pool size may be a performance bottleneck.
728+
710729
Supported databases:
711730
- postgres
712731
- mysql

data_diff/diff_tables.py

Lines changed: 29 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -58,26 +58,33 @@ def __len__(self):
5858

5959
@dataclass(frozen=False)
6060
class TableSegment:
61-
"""Signifies a segment of rows (and selected columns) within a table"""
61+
"""Signifies a segment of rows (and selected columns) within a table
62+
63+
Parameters:
64+
database (Database): Database instance. See :meth:`connect_to_uri`
65+
table_path (:data:`DbPath`): Path to table in form of a tuple. e.g. `('my_dataset', 'table_name')`
66+
key_column (str): Name of the key column, which uniquely identifies each row (usually id)
67+
update_column (str, optional): Name of updated column, which signals that rows changed (usually updated_at or last_update)
68+
extra_columns (Tuple[str, ...], optional): Extra columns to compare
69+
min_key (:data:`DbKey`, optional): Lowest key_column value, used to restrict the segment
70+
max_key (:data:`DbKey`, optional): Highest key_column value, used to restrict the segment
71+
min_update (:data:`DbTime`, optional): Lowest update_column value, used to restrict the segment
72+
max_update (:data:`DbTime`, optional): Highest update_column value, used to restrict the segment
73+
74+
"""
6275

6376
# Location of table
6477
database: Database
6578
table_path: DbPath
6679

67-
# Name of the key column, which uniquely identifies each row (usually id)
80+
# Columns
6881
key_column: str
69-
70-
# Name of updated column, which signals that rows changed (usually updated_at or last_update)
7182
update_column: str = None
72-
73-
# Extra columns to compare
7483
extra_columns: Tuple[str, ...] = ()
7584

76-
# Start/end key_column values, used to restrict the segment
85+
# Restrict the segment
7786
min_key: DbKey = None
7887
max_key: DbKey = None
79-
80-
# Start/end update_column values, used to restrict the segment
8188
min_update: DbTime = None
8289
max_update: DbTime = None
8390

@@ -251,19 +258,18 @@ class TableDiffer:
251258
bisection search recursively to find the differences efficiently.
252259
253260
Works best for comparing tables that are mostly the same, with minor discrepancies.
261+
262+
Parameters:
263+
bisection_factor (int): Into how many segments to bisect per iteration.
264+
bisection_threshold (int): When should we stop bisecting and compare locally (in row count).
265+
threaded (bool): Enable/disable threaded diffing. Needed to take advantage of database threads.
266+
max_threadpool_size (int): Maximum size of each threadpool. ``None`` means auto. Only relevant when `threaded` is ``True``.
267+
There may be many pools, so number of actual threads can be a lot higher.
254268
"""
255269

256-
# Into how many segments to bisect per iteration
257270
bisection_factor: int = DEFAULT_BISECTION_FACTOR
258-
259-
# When should we stop bisecting and compare locally (in row count)
260271
bisection_threshold: int = DEFAULT_BISECTION_THRESHOLD
261-
262-
# Enable/disable threaded diffing. Needed to take advantage of database threads.
263272
threaded: bool = True
264-
265-
# Maximum size of each threadpool. None = auto. Only relevant when threaded is True.
266-
# There may be many pools, so number of actual threads can be a lot higher.
267273
max_threadpool_size: Optional[int] = 1
268274

269275
# Enable/disable debug prints
@@ -274,7 +280,12 @@ class TableDiffer:
274280
def diff_tables(self, table1: TableSegment, table2: TableSegment) -> DiffResult:
275281
"""Diff the given tables.
276282
277-
Returned value is an iterator that yield pair-tuples, representing the diff. Items can be either
283+
Parameters:
284+
table1 (TableSegment): The "before" table to compare. Or: source table
285+
table2 (TableSegment): The "after" table to compare. Or: target table
286+
287+
Returns:
288+
An iterator that yields pair-tuples, representing the diff. Items can be either
278289
('+', columns) for items in table1 but not in table2
279290
('-', columns) for items in table2 but not in table1
280291
Where `columns` is a tuple of values for the involved columns, i.e. (id, ...extra)

docs/Makefile

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# Minimal makefile for Sphinx documentation
2+
#
3+
4+
# You can set these variables from the command line.
5+
SPHINXOPTS =
6+
SPHINXBUILD = sphinx-build
7+
SPHINXPROJ = data-diff
8+
SOURCEDIR = .
9+
BUILDDIR = _build
10+
11+
# Put it first so that "make" without argument is like "make help".
12+
help:
13+
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14+
15+
.PHONY: help Makefile
16+
17+
# Catch-all target: route all unknown targets to Sphinx using the new
18+
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19+
%: Makefile
20+
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

docs/conf.py

Lines changed: 188 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,188 @@
1+
#!/usr/bin/env python3
2+
# -*- coding: utf-8 -*-
3+
#
4+
# Documentation build configuration file, created by
5+
# sphinx-quickstart on Sun Aug 16 13:09:41 2020.
6+
#
7+
# This file is execfile()d with the current directory set to its
8+
# containing dir.
9+
#
10+
# Note that not all possible configuration values are present in this
11+
# autogenerated file.
12+
#
13+
# All configuration values have a default; values that are commented out
14+
# serve to show the default.
15+
16+
# If extensions (or modules to document with autodoc) are in another directory,
17+
# add these directories to sys.path here. If the directory is relative to the
18+
# documentation root, use os.path.abspath to make it absolute, like shown here.
19+
#
20+
import os
21+
import sys
22+
23+
sys.path.insert(0, os.path.abspath(".."))
24+
sys.path.append(os.path.abspath("./_ext"))
25+
autodoc_member_order = "bysource"
26+
27+
28+
# -- General configuration ------------------------------------------------
29+
30+
# If your documentation needs a minimal Sphinx version, state it here.
31+
#
32+
# needs_sphinx = '1.0'
33+
34+
# Add any Sphinx extension module names here, as strings. They can be
35+
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
36+
# ones.
37+
extensions = [
38+
"sphinx.ext.autodoc",
39+
"sphinx.ext.napoleon",
40+
"sphinx.ext.coverage",
41+
"recommonmark",
42+
"sphinx_markdown_tables",
43+
"sphinx_copybutton",
44+
# 'sphinx_gallery.gen_gallery'
45+
]
46+
47+
# Add any paths that contain templates here, relative to this directory.
48+
templates_path = ["_templates"]
49+
50+
# The suffix(es) of source filenames.
51+
# You can specify multiple suffixes as a list of strings:
52+
#
53+
# source_suffix = ['.rst', '.md']
54+
source_suffix = {".rst": "restructuredtext", ".md": "markdown"}
55+
56+
57+
# The master toctree document.
58+
master_doc = "index"
59+
60+
# General information about the project.
61+
project = "data-diff"
62+
copyright = "Datafold"
63+
author = "Erez Shinan"
64+
65+
# The version info for the project you're documenting, acts as replacement for
66+
# |version| and |release|, also used in various other places throughout the
67+
# built documents.
68+
#
69+
# The short X.Y version.
70+
version = ""
71+
# The full version, including alpha/beta/rc tags.
72+
release = ""
73+
74+
# The language for content autogenerated by Sphinx. Refer to documentation
75+
# for a list of supported languages.
76+
#
77+
# This is also used if you do content translation via gettext catalogs.
78+
# Usually you set "language" from the command line for these cases.
79+
language = None
80+
81+
# List of patterns, relative to source directory, that match files and
82+
# directories to ignore when looking for source files.
83+
# These patterns also affect html_static_path and html_extra_path
84+
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
85+
86+
# The name of the Pygments (syntax highlighting) style to use.
87+
pygments_style = "sphinx"
88+
89+
# If true, `todo` and `todoList` produce output, else they produce nothing.
90+
todo_include_todos = False
91+
92+
93+
# -- Options for HTML output ----------------------------------------------
94+
95+
# The theme to use for HTML and HTML Help pages. See the documentation for
96+
# a list of builtin themes.
97+
#
98+
html_theme = "sphinx_rtd_theme"
99+
100+
# Theme options are theme-specific and customize the look and feel of a theme
101+
# further. For a list of options available for each theme, see the
102+
# documentation.
103+
#
104+
# html_theme_options = {}
105+
106+
# Add any paths that contain custom static files (such as style sheets) here,
107+
# relative to this directory. They are copied after the builtin static files,
108+
# so a file named "default.css" will overwrite the builtin "default.css".
109+
html_static_path = ["_static"]
110+
111+
html_css_files = [
112+
"custom.css",
113+
]
114+
115+
# Custom sidebar templates, must be a dictionary that maps document names
116+
# to template names.
117+
#
118+
# This is required for the alabaster theme
119+
# refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars
120+
html_sidebars = {
121+
"**": [
122+
"relations.html", # needs 'show_related': True theme option to display
123+
"searchbox.html",
124+
]
125+
}
126+
127+
128+
# -- Options for HTMLHelp output ------------------------------------------
129+
130+
# Output file base name for HTML help builder.
131+
htmlhelp_basename = "datadiffdoc"
132+
133+
134+
# -- Options for LaTeX output ---------------------------------------------
135+
136+
latex_elements = {
137+
# The paper size ('letterpaper' or 'a4paper').
138+
#
139+
# 'papersize': 'letterpaper',
140+
# The font size ('10pt', '11pt' or '12pt').
141+
#
142+
# 'pointsize': '10pt',
143+
# Additional stuff for the LaTeX preamble.
144+
#
145+
# 'preamble': '',
146+
# Latex figure (float) alignment
147+
#
148+
# 'figure_align': 'htbp',
149+
}
150+
151+
# Grouping the document tree into LaTeX files. List of tuples
152+
# (source start file, target name, title,
153+
# author, documentclass [howto, manual, or own class]).
154+
latex_documents = [
155+
(master_doc, "Datadiff.tex", "Datadiff Documentation", "Erez Shinan", "manual"),
156+
]
157+
158+
159+
# -- Options for manual page output ---------------------------------------
160+
161+
# One entry per manual page. List of tuples
162+
# (source start file, name, description, authors, manual section).
163+
man_pages = [(master_doc, "Datadiff", "Datadiff Documentation", [author], 1)]
164+
165+
166+
# -- Options for Texinfo output -------------------------------------------
167+
168+
# Grouping the document tree into Texinfo files. List of tuples
169+
# (source start file, target name, title, author,
170+
# dir menu entry, description, category)
171+
texinfo_documents = [
172+
(
173+
master_doc,
174+
"Datadiff",
175+
"Datadiff Documentation",
176+
author,
177+
"Datadiff",
178+
"One line description of project.",
179+
"Miscellaneous",
180+
),
181+
]
182+
183+
# -- Sphinx gallery config -------------------------------------------
184+
185+
# sphinx_gallery_conf = {
186+
# 'examples_dirs': ['../examples'],
187+
# 'gallery_dirs': ['examples'],
188+
# }

0 commit comments

Comments
 (0)