# Class 11: Code Profiling

In this class, we will use code profiling to investigate problems with parallelizing NLP pipelines.

## spaCy NLP Pipeline

Run the following snippet, comparing `n_process=1` versus `n_process=4`.

```python
import spacy
from spacy.tokens import Doc
from nltk.tokenize import TweetTokenizer


class MyTokenizer:
    """Custom tokenizer that splits text with NLTK's TweetTokenizer regex."""

    def __init__(self, vocab):
        self.vocab = vocab
        self.rgx = TweetTokenizer().WORD_RE

    def __call__(self, text):
        words = self.rgx.findall(text)
        return Doc(self.vocab, words=words)


# Build a blank English pipeline and plug in the custom tokenizer.
nlp = spacy.blank("en")
nlp.tokenizer = MyTokenizer(nlp.vocab)

doc = nlp("What's happened to me? he thought. It wasn't a dream.")
print([token.text for token in doc])

# Push 100,000 copies of the sentence through the pipeline.
outs = ["What's happened to me? he thought. It wasn't a dream."] * 100000
docs = nlp.pipe(outs, n_process=1, batch_size=10000)
docs = list(docs)
```
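To make the comparison concrete, you can time both settings. The `time_pipe` helper below is a minimal sketch added for illustration (it is not part of the original snippet); it reuses the `nlp` and `outs` objects defined above.

```python
import time


def time_pipe(nlp, texts, n_process, batch_size=10000):
    # Wall-clock time for one full pass over the texts.
    start = time.perf_counter()
    list(nlp.pipe(texts, n_process=n_process, batch_size=batch_size))
    return time.perf_counter() - start


for n in (1, 4):
    print(f"n_process={n}: {time_pipe(nlp, outs, n):.2f}s")
```

With this custom tokenizer, the multi-process run can come out slower than the single-process one; the profiling section below investigates why.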

## Profiling tokenization

Use the profiler to debug and understand the issue!

```python
import cProfile
import io
import pstats
from pstats import SortKey

import spacy
from spacy.tokens import Doc
from nltk.tokenize import TweetTokenizer


class MyTokenizer:
    def __init__(self, vocab):
        self.vocab = vocab
        self.rgx = TweetTokenizer().WORD_RE

    def __call__(self, text):
        words = self.rgx.findall(text)
        return Doc(self.vocab, words=words)


nlp = spacy.blank("en")
nlp.tokenizer = MyTokenizer(nlp.vocab)

texts = ["What's happened to me? he thought. It wasn't a dream."] * 100000


def test_fn():
    docs = nlp.pipe(texts, n_process=2, batch_size=1000)
    docs = list(docs)


# Run the pipeline under the profiler.
with cProfile.Profile() as pr:
    test_fn()

# Print the collected statistics, sorted by cumulative time.
s = io.StringIO()
stats = pstats.Stats(pr, stream=s).sort_stats(SortKey.CUMULATIVE)
stats.print_stats()
print(s.getvalue())
```
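Keep in mind that `cProfile` only records the process it runs in: with `n_process=2`, most of the tokenization happens in worker processes, so the parent's profile is dominated by serialization and inter-process communication rather than by the tokenizer itself. A useful follow-up (a sketch of ours, not part of the original exercise) is to profile the single-process path, where the tokenizer runs in the parent, and limit the report to the most expensive entries:

```python
# Profile the single-process path for comparison: with n_process=1
# everything runs in the parent process, so MyTokenizer.__call__
# appears directly in the report.
def test_fn_single():
    list(nlp.pipe(texts, n_process=1, batch_size=1000))


with cProfile.Profile() as pr_single:
    test_fn_single()

# print_stats(20) restricts the report to the top 20 entries.
pstats.Stats(pr_single).sort_stats(SortKey.CUMULATIVE).print_stats(20)
```

Comparing the two reports makes it easier to see which part of the multi-process run is pure overhead.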

**Note:** The code is also available as a gist.