Skip to content

The goal of PythonUniverse is to collect and organize Python skills in one place. We've collected some Python techniques, Python modules, and some libraries for data science. Each topic includes a code snippet and further explanation.

License

Notifications You must be signed in to change notification settings

windsuzu/PythonUniverse

Folders and files

NameName
Last commit message
Last commit date

Latest commit

Β 

History

60 Commits
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 

Repository files navigation

Table of contents

Must KnowClassesFunctions
CollectionsItertoolsFunctools
StringIntSetTuple
ConditionalFor-LoopTry-ExceptDesignIpython
Built-ins
NumpyPandasMatplotlib (Pyplot)
Seaborn

Must Know

List & Dict & Set Comprehensions

[(i, j) for i in range(3) for j in range(3) if i > j] # [(1, 0), (2, 0), (2, 1)]

Lambda Functions

li = [1, 2, 3] li = [*map(lambda x: x * 10, li)] #li = [10, 20, 30]

Map

num1 = [100, 1, 20] num2 = [19, 4, 94] num3 = [40, 6, 30] [*map(lambda x, y, z: max(x, y, z), num1, num2, num3)] # [100, 6, 94]

Filter

names = ['Liam', 'Olivia', 'Noah', 'Emma', 'Oliver', 'Ava'] choice = filter(lambda x: x.startswith('O'), names) print(*choice, sep=', ') # Olivia, Oliver

Zip

a = [1, 2, 3] b = [4, 5, 6] c = [*zip(a, b)] # [(1, 4), (2, 5), (3, 6)] a, b = zip(*c) # a=(1, 2, 3), b=(4, 5, 6)

*args & **kwargs

Defining Functions with *arg and **kwarg

def example(a, *arg, b=0, **kwarg): print(a) # 1 print(arg) # (2, 3) print(b) # 1 print(kwarg) # {'x': 'a', 'y': [1, 2, 3]} example(1, 2, 3, b=1, x='a', y=[1, 2, 3])

Calling Functions with *arg and **kwarg

def func(greet, time, name): print(greet, time, name) func(*["Good", "Morning"], **{"name": "Jay"}) # Good Morning Jay

Unpack Variables

Unpacking Iterable

a, b, *_ = [1, 2, 3, 4, 5] # 1, 2, [3, 4, 5]

Unpacking Generator

first, *amid, last = map(lambda x: x**2, range(1, 10000)) first # 1 last # 99980001

Unpacking in For-loop

sales = [("Pencil", 0.22, 1500), ("Notebook", 1.30, 550)] for product, *_ in sales: print(product) # Pencil, Notebook

Unpacking Function

def compute(i): return i, i ** 2, i ** 3, i ** 4, i ** 5 num, power, cube, *_ = compute(3) power # 9 cube # 27

Combining Dicts

number = {"one": 1, "two": 2} letter = {"a": "A", "b": "B"} combine = {**number, **letter} combine # {'one': 1, 'two': 2, 'a': 'A', 'b': 'B'}

Generator (map, filter, zip)

def square_it(value): for i in range(value): yield i**2 li = square_it(10_000_000) [i for i in li if i < 50] # [0, 1, 4, 9, 16, 25, 36, 49]

Closure & Decorator

from functools import wraps


def count_decorator(count):
    """Build a decorator that runs the wrapped function ``count`` times per call.

    This is the "decorator with an argument" pattern: the outer function
    captures ``count`` in a closure and returns the real decorator.
    The wrapper prints the function name and call arguments once, then calls
    the function ``count`` times; return values of the wrapped function are
    discarded (the wrapper returns ``None``).
    """

    def decorator(orig_func):
        @wraps(orig_func)  # keep orig_func's __name__ / __doc__ on the wrapper
        def wrapper(*args, **kwargs):
            print(f"func name: {orig_func.__name__}")
            print(f"func args: {args}, {kwargs}")
            for _ in range(count):  # use the decorator argument
                orig_func(*args, **kwargs)

        return wrapper

    return decorator  # return the actual decorator


@count_decorator(2)
def greet(msg):
    print(msg)


greet("hello")
# func name: greet
# func args: ('hello',), {}
# hello
# hello

Context Manager

import os
from contextlib import contextmanager


@contextmanager
def enterFolder(folderName):
    """Temporarily change the working directory to ``folderName``.

    The previous working directory is restored when the ``with`` block exits,
    including when the body of the block raises an exception.
    """
    home = os.getcwd()
    os.chdir(folderName)
    try:
        yield
    finally:
        # Without try/finally an exception in the with-body would skip the
        # restore and leave the process inside `folderName`.
        os.chdir(home)


if __name__ == "__main__":
    # Requires an existing "folder1" directory below the current one.
    with enterFolder('folder1'), open('example1.txt', 'w') as f:
        f.write('file1')

Magic Method

class BinaryInt(str):
    """String subclass that stores an integer as its binary digits."""

    def __new__(cls, val):
        # Format spec is "b" with no space: "{val: b}" would reserve a sign
        # column and produce " 10" instead of "10".
        return str.__new__(cls, f"{val:b}")

    def __add__(self, val):
        """Add the integer ``val`` and return the sum as a plain binary string."""
        val += int(self, 2)  # parse our own digits back as base 2
        return f"{val:b}"


a = BinaryInt(2)
print(a)      # 10
print(a + 4)  # 110

Metaclasses

class Meta(type): def __new__(mtcls, name, bases, attrs): if name != "Base" and "must_to_do" not in attrs: raise TypeError("Bad Class: must_to_do() is needed") return super().__new__(mtcls, name, bases, attrs) class Base(metaclass=Meta): def server_func(self): return self.must_to_do() class Derived(Base): ... # TypeError: Bad Class: must_to_do() is needed

Threading & Multiprocessing

import concurrent.futures with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor: futures = [executor.submit(load_url, url, 60) for url in URLS] for future in concurrent.futures.as_completed(futures): result = future.result() print(len(result)) with concurrent.futures.ProcessPoolExecutor() as executor: results = executor.map(load_url, URLS, [60] * len(URLS), chunksize=4) for result in results: print(len(result))

Classes

self (class instance)

class Person: def __init__(self, name): self.name = name def say(self): return f"I'm {self.name}" p = Person("Jay") p.say() == Person.say(p) # True

variables (class & instance)

class Employee: num_emp = 0 # Class variable def __init__(self, pay): self.pay = pay # Instance variable Employee.num_emp += 1 e1 = Employee(100) e2 = Employee(200) e1.num_emp # 2 Employee.num_emp # 2 e1.pay # 100 Employee.pay # AttributeError: type object 'Employee' has no attribute 'pay'

method vs. classmethod vs. staticmethod

class Person: def __init__(self, name, age): self.name = name self.age = age @staticmethod def splitPersonString(string, split_sign="-"): return string.split(split_sign) @classmethod def fromString(cls, cls_str): return cls(*cls.splitPersonString(cls_str, ", ")) p1 = Person.fromString("Jay, 99") p1.name # Jay p1.age # 99

_ (private) vs. __ (name mangling)

class Dog: _weight = 5 # private variable def __bark(self): # name mangling function print("bark") dog = Dog() dog._weight # 5 dog.__bark() # AttributeError: 'Dog' object has no attribute '__bark' dog._Dog__bark() # bark

@property (getter, setter)

from hashlib import md5


class User:
    """Show a computed read-only attribute and a write-only attribute via @property.

    ``fullname`` is derived from the two name fields. ``password`` can be
    assigned but not read back; assigning stores a hex digest in
    ``password_hash``.
    """

    def __init__(self, first_name, last_name, password):
        self.first_name = first_name
        self.last_name = last_name
        self.password = password  # routed through the setter below

    @property
    def fullname(self):
        return f"{self.first_name} {self.last_name}"

    @property
    def password(self):
        raise AttributeError("password is not readable.")

    @password.setter
    def password(self, password):
        # Hash the value that was passed in (md5 needs bytes, so encode first).
        # NOTE: md5 only keeps the example short; it is not suitable for
        # storing real passwords.
        self.password_hash = md5(password.encode()).hexdigest()


user = User("Mimi", "Wang", "0000")
user.fullname       # Mimi Wang
user.password_hash  # 4a7d1ed414474e4033ac29ccb8653d9b
try:
    user.password
except AttributeError as err:
    print(err)      # password is not readable.

LEGB (local, enclosing, global, builtins)

min([1, 2, 31]) # builtins min min = "global min" def outer(): # we can do "global min" here to change global min = "enclosing min" def inner(): # we can do "nonlocal min" here to change enclosing min = "local min"

Abstract class

from abc import ABC, abstractmethod class Base(ABC, object): @property @abstractmethod def foo(self): ... @abstractmethod def do(self): ...

Dataclasses

from dataclasses import InitVar, dataclass, field from typing import List @dataclass class InventoryItem: name: str unit_price: float = field(default=0.0) quantity_on_hand: int = field(default=0, repr=False) parts: List[str] = field(default_factory=list) parts_number: InitVar[int] = 0 def __post_init__(self, parts_number): self.parts.extend([f"part{i}" for i in range(1, parts_number + 1)]) item = InventoryItem("product", parts_number=2) # InventoryItem(name='product', unit_price=0.0, parts=['part1', 'part2'])

Classes in Dynamic Language

def getClass(x): if x == 1: for i in range(11): class Example: a = i return Example cls = getClass(1) cls.b = "123" print(cls.a, cls.b) # 10 123

Functions

Enclosing function

def add_with_b(b): def add(a): return a + b return add add4 = add_with_b(4) add4(3) # 7 add4(7) # 11

Attrs

class Cat: def __repr__(self): return f"({self.name}: {self.age})" listOfCats = [] attrs = [{"name": "meow1", "age": 5}, {"name": "meow2", "age": 10}] for attr in attrs: cat = Cat() for key, val in attr.items(): setattr(cat, key, val) listOfCats.append(cat) print(listOfCats) # [(meow1: 5), (meow2: 10)]

Functions in Dynamic Language

for i in range(100): def say(): print(i) def returnFunc(a): if a < 100: def mul(b): print(a * b) return mul else: def add(b): print(a + b) return add

Collections

defaultdict

from collections import defaultdict d = defaultdict(list) d["a"] = [1, 2, 3] d["b"].append(4) d["c"].extend([5, 6]) # defaultdict(<class 'list'>, {'a': [1, 2, 3], 'b': [4], 'c': [5, 6]})

OrderedDict

from collections import OrderedDict location = ["C", "B", "A"] population = [32, 46, 12] d = OrderedDict({l: p for l, p in zip(location, population)}) # OrderedDict([('C', 32), ('B', 46), ('A', 12)]) d["D"] = 44 # OrderedDict([('C', 32), ('B', 46), ('A', 12), ('D', 44)]) d.popitem(last=False) # OrderedDict([('B', 46), ('A', 12), ('D', 44)]) d.move_to_end("D", last=False) # OrderedDict([('D', 44), ('B', 46), ('A', 12)])

Counter

from collections import Counter c = Counter(cats=4, dogs=8) # Counter({'dogs': 8, 'cats': 4}) c.update(birds=10) # Counter({'birds': 10, 'dogs': 8, 'cats': 4}) c = c - Counter({"birds": 5}) # Counter({'dogs': 8, 'birds': 5, 'cats': 4}) c.most_common(2) # [('dogs', 8), ('birds', 5)]

namedtuple

from collections import namedtuple Dog = namedtuple("Dog", "name, age") d1 = Dog("funny", 4) features = ["happy", 3] d2 = Dog._make(features) # Dog(name='happy', age=3) d2._asdict() # {'name': 'happy', 'age': 3} (a regular dict since Python 3.8; an OrderedDict before that)

deque

from collections import deque li = [40, 30, 50, 46, 39, 44] d = deque(li[:2]) # Let's compute the moving average with range=3 d.appendleft(0) s = sum(d) for elem in li[2:]: s += elem - d.popleft() d.append(elem) print(s / 3) # 40.0, 42.0, 45.0, 43.0

Itertools

Infinite iterators

count

from itertools import count gen = count(2.5, 0.5) for x in gen: print(x) # 2.5, 3.0, 3.5, 4.0, ... non-stop

cycle

from itertools import cycle gen = cycle([1, 2, 3]) for x in gen: print(x) # 1, 2, 3, 1, 2, ... non-stop

repeat

from itertools import repeat class Cat: ... gen = repeat(Cat(), 2) for cat in gen: print(cat) # <__main__.Cat object at 0x0000019AC1C5D348> # <__main__.Cat object at 0x0000019AC1C5D348>

Iterators terminating on the shortest input sequence

accumulate

import operator from itertools import accumulate gen = accumulate([1, 2, 3, 4]) list(gen) # [1, 3, 6, 10] gen = accumulate([1, 2, 3, 4], func=operator.mul) list(gen) # [1, 2, 6, 24]

chain

from itertools import chain gen = chain([1, 2], [3, 4]) list(gen) # [1, 2, 3, 4] gen = chain("AB", "CD") list(gen) # [A, B, C, D]

compress

from itertools import compress gen = compress([1, 2, 3], [1, 0, 1]) gen = compress([1, 2, 3], [True, False, True]) # same list(gen) # [1, 3]

filterfalse

from itertools import filterfalse gen = filterfalse(lambda x: x%2 == 0, [1, 2, 3]) list(gen) # [1, 3]

groupby

from itertools import groupby gen = groupby("AABBCCCAA") # default func = lambda x: x for k, g in gen: print(k, list(g)) # A [A, A] # B [B, B] # C [C, C, C] # A [A, A] gen = groupby([1, 2, 3, 4], lambda x: x // 3) for k, g in gen: print(k, list(g)) # 0 [1, 2] # 1 [3, 4] gen = groupby([("A", 100), ("B", 200), ("C", 600)], lambda x: x[1] > 500) for k, g in gen: print(k, list(g)) # False [(A, 100), (B, 200)] # True [(C, 600)]

islice

gen = islice([1, 2, 3], 2) # equals to A[:2] list(gen) # [1, 2] gen = islice("ABCD", 2, 4) # equals to A[2:4] list(gen) # [C, D] gen = islice("ABCD", 0, None, 2) # equals to A[::2] list(gen) # [A, C]

starmap

from itertools import starmap # with only one argument gen = starmap(lambda x: x.lower(), "ABCD") list(gen) # [a, b, c, d] # with 2 arguments gen = starmap(lambda x, y: x + y, [(1, 2), (3, 4)]) list(gen) # [3, 7] # with different size of arguments gen = starmap(lambda *keys: sum(keys) / len(keys), [[3, 8, 3], [4, 2]]) list(gen) # [4.6666667, 3.0]

takewhile

from itertools import takewhile gen = takewhile(lambda x: x < 2, [1, 2, 3, 2, 1]) list(gen) # [1] gen = takewhile(lambda x: x.isupper(), "ABCdefgHIJ") list(gen) # [A, B, C]

dropwhile

gen = dropwhile(lambda x: x < 2, [1, 2, 3, 2, 1]) list(gen) # [2, 3, 2, 1] gen = dropwhile(lambda x: x.isupper(), "ABCdefgHIJ") list(gen) # [d, e, f, g, H, I, J]

zip_longest

from itertools import zip_longest gen = zip_longest("ABC", ("X", "Y")) list(gen) # [('A', 'X'), ('B', 'Y'), ('C', None)] gen = zip_longest("ABC", [1, 2], fillvalue=-1) list(gen) # [('A', 1), ('B', 2), ('C', -1)]

Combinatoric iterators

product

from itertools import product gen = product("AB", "CD") list(gen) # [AC, AD, BC, BD] gen = product("AB", repeat=2) list(gen) # [AA, AB, BA, BB] gen = product("AB", "CD", repeat=2) list(gen) # [ACAC, ACAD, ACBC, ACBD, # ADAC, ADAD, ADBC, ADBD, # BCAC, BCAD, BCBC, BCBD, # BDAC, BDAD, BDBC, BDBD]

permutations

gen = permutations("ABC") # same as r=3 list(gen) # [ABC, ACB, BAC, BCA, CAB, CBA] gen = permutations("ABC", r=2) list(gen) # [AB, AC, BA, BC, CA, CB] gen = permutations("ABC", r=1) list(gen) # [A, B, C]

combinations

gen = combinations("ABC", 1) list(gen) # [A, B, C] gen = combinations("ABC", 2) list(gen) # [AB, AC, BC] gen = combinations("ABC", 3) list(gen) # [ABC]

combinations_with_replacement

gen = combinations_with_replacement("ABC", 1) list(gen) # [A, B, C] gen = combinations_with_replacement("ABC", 2) list(gen) # [AA, AB, AC,  # BB, BC,  # CC] gen = combinations_with_replacement("ABC", 3) list(gen) # [AAA, AAB, AAC, ABB, ABC, ACC, # BBB, BBC, BCC, # CCC]

Functools

Reduce

from functools import reduce reduce(lambda x, y: x - y, [1, 2, 3, 4, 5], 100) # 85

String

f-string

first_name = "Kain" last_name = "Mccarthy" print(f"Hi, I'm {first_name} {last_name}.") # Hi, I'm Kain Mccarthy. pi = 3.14159265359 print(f"{pi:.2f}") # 3.14 d = {"name": "Shelly"} print(f"She is {d['name']}") # She is Shelly i = 1000000 print(f"{i:,}") # 1,000,000 # Ref: # * https://youtu.be/nghuHvKLhJA # * https://blog.louie.lu/2017/08/08/outdate-python-string-format-and-fstring/

Int

Underscore Placeholders

a = 100_000_000 b = 10_000_000 c = 1_0_0 print(f"{a+b+c:,}") # 110,000,100 # Ref: # * https://youtu.be/C-gEQdGVXbk&t=140

Set

Search

long_list = [i for i in range(100_000_000)] long_set = set(long_list) %%time 100_000_000 in long_list # False # Wall time: 1.26 s %%time 100_000_000 in long_set # False # Wall time: 0 ns # Ref: # * https://stackoverflow.com/questions/2831212/python-sets-vs-lists/17945009 # * https://youtu.be/r3R3h5ly_8g?t=1010

Tuple

Swap

a, b = 1, 2 a # 1 b # 2 a, b = b, a a # 2 b # 1 # Ref: # * https://youtu.be/VBokjWj_cEA?list=LL&t=445

Condition

Ternary operator

if x < 1: x += 1 else: x -= 1 # equivalent to: x = (x + 1) if (x < 1) else (x - 1) # Ref: # * https://www.youtube.com/watch?v=C-gEQdGVXbk&t=34s

For-Loop

Enumerate

arr = ["a", "b", "c"] for index, element in enumerate(arr): print(index, element) # 0 a # 1 b # 2 c for index, element in enumerate(arr, start=3): print(index, element) # 3 a # 4 b # 5 c # Ref # * https://youtu.be/VBokjWj_cEA?list=LL&t=190

For-Else

for text in "to be or not to be".split(): if text.strip().startswith("o"): print(f"Found it! `{text}`") break else: print("Not found") # Found it! `or` # Ref: # * https://www.youtube.com/watch?v=Dh-0lAyc3Bc

Try-Except

TEEF

try: print(1/1) except Exception as e: print(e) else: print("Safe") # executed when except didn't happen finally: print("Done") # Always executed # 1.0 # Safe # Done # Ref: # * https://youtu.be/VBokjWj_cEA?list=LL&t=1331

Design

Annotation

def func(a: str, b: int = 3) -> str: return a*b func.__annotations__ # {'a': <class 'str'>, 'b': <class 'int'>, 'return': <class 'str'>} func("hi") # hihihi func("hi", 5) # hihihihihi
def func(a: "str longer than 5", b: 1+2 = 3) -> "str longer b times": return a*b func.__annotations__ # {'a': 'str longer than 5', 'b': 3, 'return': 'str longer b times'} func("hi") # hihihi func("ohayou", 2) # ohayouohayou

Ref

Typing

from typing import Any, Dict, Iterable, List, Union def func(a: List[int], b: Union[str, int], c: Dict[str, int], d: Iterable, e: Any): print(len(a)) print(f"{b} can be str or int.") print(f"{c['something']} will return int.") for i in d: print(i) print(f"{type(e)} can be any type.") # Ref: # * https://myapollo.com.tw/zh-tw/python-typing-module/

Pass and Ellipsis

# Style 1 def my_abstract_method(self): pass # Style 2 def my_abstract_method(self): ... # Style 3 def my_abstract_method(self): """  This function is ...  """ # Ref: # * https://stackoverflow.com/questions/55274977/when-is-the-usage-of-the-python-ellipsis-to-be-preferred-over-pass # * https://stackoverflow.com/questions/772124/what-does-the-ellipsis-object-do

IPython

VSCode Python Interactive window

#%% 1+1 # 2 # Ref: # * https://code.visualstudio.com/docs/python/jupyter-support-py

Time Measure

One Line

%time sleep(0.3) # Wall time: 310 ms %timeit sleep(0.3) # 311 ms ± 2.06 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

Multiple Lines

%%time for i in range(10): sleep(0.1) # Wall time: 1.09 s %%timeit for i in range(10): sleep(0.1) # 1.09 s ± 2.07 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

Ref

Memory Measure

Installation

!pip install -U memory_profiler %load_ext memory_profiler

One Line

%memit [i for i in range(1000)] # peak memory: 51.31 MiB, increment: 0.36 MiB

Multiple Lines

%%memit l = [] for x in range(10000): l.append(x*2) # peak memory: 52.76 MiB, increment: 0.70 MiB

Ref

Modules

pathlib

sub_folder = Path("subfolder/subfolder") sub_folder.mkdir(parents=True, exist_ok=True) file_ = sub_folder / Path("test.txt") file_.touch() file_.write_text("Hello") file_.read_text() file_.unlink() Path("subfolder/subfolder").rmdir()

Numpy

Create Array or Matrix

np.array([[1, 2], [3, 4], [5, 6]]) # create from list np.zeros((3, 3)) # create filled with 0's np.ones((2, 4, 4)) # create filled with 1's np.empty((5, 2)) # create with speed np.arange(2, 10, 3) # create array from range (start, end, step_size) np.linspace(5, 50, 20) # create a linear space (start, end, num_elements) # create from random generator rng = np.random.default_rng(seed=42) rng.random((2, 4)) rng.normal(3, 2.5, size=(2, 4)) # sample from N(3, 6.25) rng.integers(low=2, high=10, size=(10, 2)) # random integer matrix

Basic Operations

Sort and Concatenate

np.sort(a, axis=None) np.sort(a, axis=-1)[::-1] a.sort() a[::-1].sort() np.concatenate((a, b), axis=None) np.concatenate((a, b), axis=2)

Element-wise

# Element-wise arithmetic and comparison on two 1-D arrays of equal length.
a = np.arange(5)           # [0, 1, 2, 3, 4]
b = np.ones(5, dtype=int)  # [1, 1, 1, 1, 1]

a + b       # [1 2 3 4 5]
a - b       # [-1 0 1 2 3]
a ** 2      # [ 0 1 4 9 16]  (`**` is power; `^` would be bitwise XOR)
a * 10      # [ 0 10 20 30 40]
a > 2       # [False False False True True]
np.sqrt(a)  # [0. , 1. , 1.41421356, 1.73205081, 2. ]
a*b         # [0 1 2 3 4]
a@b         # 10  (dot product)

All (None) Column-wise (0), Row-wise (1)

A = np.random.default_rng(42).random((2, 4)) # [[0.77395605, 0.43887844, 0.85859792, 0.69736803], # [0.09417735, 0.97562235, 0.7611397 , 0.78606431]]) A.max() # 0.97562235 A.max(axis=0) # [0.77395605, 0.97562235, 0.85859792, 0.78606431] A.max(axis=1) # [0.85859792, 0.97562235] A.mean() # 0.6732255180088094 A.mean(axis=0) # [0.4340667 , 0.7072504 , 0.80986881, 0.74171617] A.mean(axis=1) # [0.69220011, 0.65425093]

Indexing and Slicing

# Index and slicing arrays x[1, 3] == x[1][3] y[1:5:2, ::3] # Indexing arrays x[np.array([0, 1, 2, -1, -2])] y[np.array([1, 2, 3]), 1:4:2] y[np.array([1, 2]), np.array([-1, -1])] # Masking arrays x[x>5] x[(x%2==0) | (x>7)] y[[True]*3 + [False] + [True] + [False], 2::2] # Ellipsis syntax x[-1, ..., 3] # same as x[-1, :, 3] x[:3, ...] # same as x[0:3, :, :] and x[0:3] and x[:3] x[::2, ..., np.array([0, 2])] # same as x[0:5:2, :, np.array([0, 2])]

Shape Manipulation

A = np.array([[[1, 2, 3], [4, 5, 6]], [[4, 6, 8], [2, 1, 6]]]) A.shape # (2, 2, 3) A = A.reshape(3, 2, 2) # (3, 2, 2) A = A[np.newaxis, ...] # (1, 3, 2, 2) A = np.expand_dims(A, axis=4) # (1, 3, 2, 2, 1) A = A.flatten() # (12,) A = A.reshape(2, -1, 2) # (2, 3, 2)

Copying

# shallow copy: values will change on every variable a = np.arange(10).reshape(5, 2) b = a.view() c = a.reshape(-1) d = a[:3, :1] # deep copy: copy and create an entirely new array a = np.arange(10000000) b = a[:100].copy() del a

Broadcasting

# scalar broadcasting a = np.array([1, 2, 3]) a * 3 # [3, 6, 9] # general broadcasting a = np.ones( (8, 1, 6, 1)) b = np.zeros( (7, 1, 5)) (a*b).shape # 8, 7, 6, 5 # outer product a = np.arange(4)[:, np.newaxis] # (4, 1) b = np.array([1, 2, 3]) # (3,) a + b # (4, 3) # [0] + [1, 2, 3] = [1 2 3] # [1] [2 3 4] # [2] [3 4 5] # [3] [4 5 6]

Pandas

Creation and Viewing

# Create Series pd.Series([1, 2, 3, 4, 5]) pd.Series(np.arange(1, 6), index=list("abcde")) pd.Series({"a": 100, "b": 50, "c": 120}) pd.Series("hi", index=list("12345")) # Create DataFrame pd.DataFrame({ "col_1": [1, 2, 3, 4, 5], "col_2": np.arange(1, 6), "col_3": pd.Series(np.arange(1, 7), index=list("abc123")), }, index=list("abcde")) pd.DataFrame( [ {"a": 1, "b": 2}, {"b": 10, "c": 5}, {"a": 55, "b": 489, "c": 32, "d": 590}, ], index=["first", "second", "third"], columns=list("ab") ) pd.DataFrame( np.arange(10).reshape(2, 5), # [[0,1,2,3,4], [5,6,7,8,9]] index=pd.date_range("20200101", periods=2), columns=list("abcde")) # Viewing df.head(2) df.tail(3) df.index df.columns df.to_numpy() df.sort_index() df.sort_values("col_name")

Selection

Single Column Multiple Columns Continuous Columns All Columns
Single Row df.loc[row, column] or
df.at[row, column]
df.loc[row, [column, column]] df.loc[row, column:column] df.loc[row]
Multiple Rows df.loc[[row, row], column] df.loc[[row, row], [column, column]] df.loc[[row, row], column:column] df.loc[[row, row]]
Continuous Rows df.loc[row:row, column] df.loc[row:row, [column, column]] df.loc[row:row, column:column] df[row:row]
All Rows df[column] df[[column, column]] or
df.loc[:, [column, column]]
df.loc[:, column:column] df
df["col1"] df[["col1", "col2"]] df["row1":"row5"] df.loc["row1", "col1"] # df.iloc[0, 0] df.at["row1", "col1"] # df.iat[0, 0] df.loc["row1", ["col1", "col2"]] # df.iloc[0, [0, 1]] df.loc["row1", "col1":"col5"] # df.iloc[0, 0:5] (label slices include the end label) df.loc[["row1", "row2"]] # df.iloc[[0, 1]] df.loc["row1":"row5", "col1"] # df.iloc[0:5, 0] (label slices include the end label) df[(df["col1"] > 18)] df[(df > 6) & (df < 25)] df[df["col1"].isin([10, 15, 0])]
  • df.iloc is same as df.loc but using position.
  • df.iat is same as df.at but using position.
  • Details 🔥

Setting, Deleting, and Handling

# Modify columns
df["col1"] += 10
df.loc[:, "col1"] = "bar"
df.loc[:, ["col1", "col3"]] = np.arange(12).reshape(6, 2)

# Modify single element
df.loc["row1", "col1"] = 0
df.iloc[0, 0] = 1

# Modify by boolean indexing
df[df < 100] = -df

# Append
df["total"] = df.sum(axis=1).to_numpy()
df["gt"] = df["total"] > 50000
df["foo"] = "bar"

# Insert
df.insert(0, "col0", df["col2"][:2])  # col_index, col_name, values

# Delete column
del df["total"]
df.drop(columns=["foo"], inplace=True)  # same as `df.drop(["foo"], axis=1)`
gt50000 = df.pop("gt")  # pop the "gt" column created in the Append step

# Delete row
df.drop(["e", "d"], inplace=True)

# Handle NaN
miss_df.dropna(how='any')
miss_df.fillna(value=10000000)

Operations and Apply Functions

# Arithmetic df + df2 df - df.iloc[0] 1 / df # Numpy np.sqrt(df) np.max(df, axis=1) # Built-in df.mean() df.max(axis=1) # Apply df.apply(np.cumsum, axis=1) df.apply(lambda x: x.sum() / x.size) # x is each column (a Series) # Series s.value_counts() s.str.upper() s.str.split("-").str.get(0)

Concat and Merge

# Concat rows pd.concat([df[:3], df.iloc[7:, :2]]) # Merge two DataFrame pd.merge(df, df2, on="name", how="right")

Grouping and Categorical Data Type

# Groupby
df.groupby("col_A").sum()
df.groupby(["col_A", "col_B"]).max()

# Categorical - discrete
df["grade"] = df["grade"].astype("category")
# Assigning to `.cat.categories` is no longer supported in pandas 2.x;
# rename_categories returns a new Series, so assign it back.
df["grade"] = df["grade"].cat.rename_categories(["Bad", "Good", "Excellent"])
df.sort_values(by="grade")
df.groupby("grade").size()

# Categorical - continuous
df["grade-labels"] = pd.cut(df["score"], bins=range(0, 120, 20), labels=list("EDCBA"))

Other Pandas Tricks

# Rename Columns df.columns = ["col_one", "col_two"] df = df.add_prefix("Xx_") df = df.add_suffix("_xX") df.columns = df.columns.str.replace("Xx", "Oo") df.columns = df.columns.str.replace("xX", "oO") # Reverse Row or Column Order df.loc[::-1].reset_index(drop=True) # reverse rows df.loc[:, ::-1] # reverse columns # Split DataFrame into 2 random subsets sub1 = df.sample(frac=0.75, random_state=42) sub2 = df.drop(sub1.index) sub1.index = sub1.index.sort_values() sub2.index = sub2.index.sort_values() # Filter by Category (or Largest Category) df[df.genre.isin(["A", "D"])] df[~df.genre.isin(["A", "D"])] df[df.genre.isin(df.genre.value_counts().nlargest(1).index)] # Split String into Multiple Columns df[["first", "last"]] = df["name"].str.split(' ', expand=True) df["city"] = df["location"].str.split(", ", expand=True)[0] # Change Display Options (Not Change Data) pd.set_option("display.float_format", "${:.2f}".format) pd.reset_option("display.float_format") # Style a DataFrame style = {"Date": "{:%Y/%m/%d}", "Value": "${:d}", "Volume": "{:,}"} df.style.format(style) \ .hide_index() \ .highlight_max("Value", color="red") \ .highlight_min("Value", color="green") \ .bar("Area", color="orange", align="zero") \ .background_gradient(subset="Volume", cmap="Greens") \ .set_caption("Random Chart")

Matplotlib (Pyplot)

Basic (Single Plot)

import matplotlib.pyplot as plt # with this magic function, we can skip `plt.show()` %matplotlib inline plt.plot(np.sin(np.linspace(0, 10, 100)), "*-b", lw=2, markersize=5, label="sin(x)") plt.plot(np.log(np.arange(100)), c="g", ls="--", marker=".", lw=2, markersize=5, label="log(x)") plt.xlabel("X here") plt.ylabel("Y here") plt.title("sin(x) and log(x)") plt.grid() plt.legend() plt.text(x=70, y=-1, s="hahahaha") plt.annotate("wow \nmax", xy=(16, 1), xytext=(40, 0.9), arrowprops={"facecolor": "orange", "shrink": 0.05}) plt.annotate("wow \nmax again", xy=(78, 1), xytext=(95, 0.9), arrowprops={"facecolor": "red", "shrink": 0.05})

Multiple Figures and Axes

# Object-oriented style fig1, ax = plt.subplots() ax.plot(...) fig2, axs = plt.subplots(2, 1) axs[0].plot(...) axs[1].plot(...) # Pyplot style plt.figure(1) plt.title("Figure 1") plt.figure(2) plt.subplot(311) plt.title("Figure 2") plt.subplot(323) plt.subplot(324) plt.subplot(337) plt.subplot(338) plt.subplot(339)

Line Plots and Filling Area

years = [1.1, 1.3, 1.5, 2.0, 2.2, ...] salary = [39343.00, 46205.00, 37731.00, 43525.00, 39891.00, ...] salary_mean = np.mean(salary) # Line Plots plt.plot(years, salary, marker="o", markersize=5, lw=2, ls="-", ) # Filling Areas plt.fill_between(years, salary, salary_mean, where=(salary > salary_mean), alpha=.4, color="green", edgecolor="black", interpolate=True, label="On Average" )

Time Series

import matplotlib.dates as mdates dates = np.arange(np.datetime64("2021-01-01"), np.datetime64("2021-01-22")) prices = np.random.default_rng(42).normal(500, 30, len(dates)) plt.gca().xaxis.set_major_formatter(mdates.DateFormatter("%a, %d %m")) plt.gca().xaxis.set_major_locator(mdates.DayLocator(interval=7)) plt.gca().xaxis.set_minor_locator(mdates.DayLocator()) plt.plot_date(dates, prices, ls="solid", c="orange", marker="^", markersize=10) plt.grid() plt.tight_layout()

Scatter Plots

temperature = [14.2, 16.4, 11.9, 15.2, ...] ice_cream_sales = [215, 325, 185, 332, ...] colors = np.array(ice_cream_sales) / np.linalg.norm(ice_cream_sales) plt.scatter(temperature, ice_cream_sales, s=ice_cream_sales, # set the size according to the prices of the ice cream c=colors, # set the colors according to the prices of the ice cream cmap="Greens", # preferred color type edgecolor="black", # the edge color of points lw=0.5, # the edge width of points alpha=.75, ) plt.xlabel("temperature") plt.ylabel("ice cream price") plt.yscale("log") # use log scale on y-axis to handle outliers cbar = plt.colorbar() cbar.set_label("Expensive") plt.tight_layout()

Bar Charts

# Bar Charts ages = [25, 26, 27, 28, 29, ...] salary_all = [38496, 42000, 46752, 49320, 53200, ...] index = np.arange(len(ages)) width = 0.25 plt.bar(index - width, salary_all, width=0.25, label="All Devs") plt.bar(index, salary_py, width=0.25, label="Python") plt.bar(index + width, salary_js, width=0.25, label="JavaScript") plt.xticks(ticks=index, labels=ages) plt.title("Median Salary (USD) by Age") plt.xlabel("Ages") plt.ylabel("Median Salary (USD)") plt.legend() plt.tight_layout() # Horizontal Bar Charts language = ['JavaScript', 'HTML/CSS', 'SQL', 'Python', ...] popularity = [59219, 55466, 47544, 36443, ...] plt.barh(language, popularity) plt.title("Most Popular Languages") plt.xlabel("Number of People Who Use") plt.tight_layout()

Pie Charts

grade = ["A", "B", "C", "D", "E"] number = [10, 18, 23, 8, 5] explode = [0.1, 0, 0, 0, 0] plt.pie(number, labels=grade, shadow=True, autopct="%1.1f%%", pctdistance=0.6, startangle=90, explode=explode ) plt.title("Test Grade") plt.tight_layout()

Histograms

height_stats = np.random.default_rng(42).normal(160, 15, 1000)
interval_bin = [120, 130, 140, 150, 160, 170, 180, 190, 200]
plt.hist(height_stats, bins=interval_bin, edgecolor="black", lw=1, density=True)

# Plot the probability density curve
import scipy.stats as ss

# Use the public `scipy.stats.gaussian_kde`; the `scipy.stats.kde`
# submodule path is deprecated.
density = ss.gaussian_kde(height_stats)
index = np.arange(120, 200)
plt.plot(index, density.evaluate(index), color="pink", lw=3, ls="--", label="Probability Density")

# Plot the mean line
plt.axvline(np.mean(height_stats), c="orange", lw=5, label="Height Mean")

plt.legend()
plt.title("Height Stats")
plt.xlabel("Heights")
plt.ylabel("Probability Density")
plt.tight_layout()

Stack Plots

years = [1950, 1960, 1970, 1980, 1990, 2000, 2010, 2018] population_by_continent = { 'africa': [228, 284, 365, 477, 631, 814, 1044, 1275], 'americas': [340, 425, 519, 619, 727, 840, 943, 1006], 'asia': [1394, 1686, 2120, 2625, 3202, 3714, 4169, 4560], 'europe': [220, 253, 276, 295, 310, 303, 294, 293], 'oceania': [12, 15, 19, 22, 26, 31, 36, 39], } y = population_by_continent.values() labels = population_by_continent.keys() colors = ["#96ceb4", "#ffeead", "#ff6f69", "#ffcc5c", "#88d8b0"] plt.style.use("seaborn") plt.stackplot(years, y, labels=labels, colors=colors) plt.legend(loc="upper left") plt.title("World Population") plt.xlabel("Year") plt.ylabel("Population (Millions)") plt.tight_layout()

Image

img = mpimg.imread("https://www.catster.com/wp-content/uploads/1970/01/Am-ShortHair-breed_getty1140883355-768x513.png") plt.imshow(img) # Applying pseudocolor schemes plt.imshow(img[..., 0], cmap="gray") plt.colorbar() # Flipping Photos Vertically or Horizontally plt.imshow(img[::-1]) # Reverse at the first axis == vertical flip plt.imshow(img[:, ::-1]) # Reverse at the second axis == horizontal flip

Styles, Colors, Colormaps

# Switch Style plt.style.use("seaborn-pastel") # Data x = np.random.default_rng(42).integers(0, 100, 100) y = (2*x+1) * np.random.default_rng(43).normal(5, 1, 100) regr = sklearn.linear_model.LinearRegression() regr.fit(x[:, np.newaxis], y[:, np.newaxis]) regr_line = regr.predict(x[:, np.newaxis]) # Plotting with fancy color and colormap plt.scatter(x, y, c=y, alpha=0.25, cmap="plasma") plt.plot(x, regr_line, color="darkviolet", alpha=0.5, lw=5, ls="-", label="regression line") plt.title("Linear Regression Test") plt.xlabel("X") plt.ylabel("y") plt.legend() plt.colorbar()

Seaborn

Basic (Seaborn)

x = np.array(range(1, 5)) y = x**2 df = pd.DataFrame(zip(x, y), columns=["col_1", "col_2"]) # Plotting with data parameter def plot(): sns.lineplot(x="col_1", y="col_2", data=df) # Seaborn Styles sns.set_style("white") # Scaling the plots sns.set_context("paper", font_scale=1.5) # Changing the figure Size plt.figure(figsize=(8, 4)) # width, height  # Using Seaborn with Matplotlib plt.subplot(211) plt.title("Square X") plot() # Seaborn Styles Context Manager with sns.axes_style("darkgrid"): plt.subplot(212) plot() plt.tight_layout()

Color Palette

# Sequential Palette palette = sns.color_palette("YlGn") sns.palplot(palette) plt.title("YlGn Colormap (Sequential)") # Diverging Palette palette = sns.color_palette("coolwarm") sns.palplot(palette) plt.title("coolwarm Colormap (Diverging)") # Qualitative Palette palette = sns.color_palette("Pastel2") sns.palplot(palette) plt.title("Pastel2 Colormap (Qualitative)")



Multiple Plots

Using Matplotlib

data = sns.load_dataset("iris") plt.figure(figsize=(11, 3)) plt.subplot(121) sns.lineplot(x="sepal_length", y="sepal_width", data=data) plt.subplot(122) sns.lineplot(x="petal_length", y="petal_width", data=data)

Using Seaborn

FacetGrid

grid = sns.FacetGrid(data, col="species") grid.map(plt.plot, "sepal_width")

PairGrid

x_vars = ["sepal_length", "sepal_width", "petal_length", "petal_width"] y_vars = ["species"] grid = sns.PairGrid(data, x_vars=x_vars, y_vars=y_vars) grid.map(sns.barplot)

About

The goal of PythonUniverse is to collect and organize Python skills in one place. We've collected some Python techniques, Python modules, and some libraries for data science. Each topic includes a code snippet and further explanation.

Topics

Resources

License

Stars

Watchers

Forks

Releases

No releases published

Packages

No packages published