Basic NLP with Python and NLTK
Bruni Francesco (@brunifrancesco)
Download the original IPython notebook @ https://github.com/brunifrancesco/nltk_base.git
Python
- Programming language
- Multi-paradigm
- Easy to learn
- Suitable for multiple needs
- Multiple implementations, a ton of useful libraries
Basic Python

import random

a_number = 1
a_string = "Python rocks!"
a_list = ["1", "2", "3"]
a_dict = {"film": "Pulp fiction", "francesco": "Python"}
print(a_dict.values())

a_dict_of_list = {"key": ["Carlito's way", "The godfather"], "francesco": 1}
print(len(a_dict_of_list["key"]))

a_tuple = ("Goodfellas", "Kill Bill",)
a_list.append(4)
Creating functions

def super_function(number):
    return number * 2

def factorial(n):
    if n == 0:
        return 1
    else:
        return n * factorial(n - 1)

double = lambda item: item * 2
predicate = lambda item: item > 3

assert super_function(3) == 6
assert factorial(3) == 6
assert double(3) == 6
assert list(filter(predicate, [1, 2, 5, 3])) == [5]
And much more
- Object-oriented paradigm --> classes, metaclasses, etc.
- Functional programming paradigm --> partials, closures, higher-order functions (see the sketch below)
- Scripting paradigm --> shell control, OS-related functions, etc.
- Async ops support --> asyncio
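As a quick illustration of the functional features listed above, here is a minimal sketch; the helper names are made up for the example.

from functools import partial, reduce

def multiply(a, b):
    return a * b

double = partial(multiply, 2)  # partial application: fix the first argument

def make_counter():
    count = 0
    def increment():  # closure: 'count' is captured from the enclosing scope
        nonlocal count
        count += 1
        return count
    return increment

counter = make_counter()
assert double(5) == 10
assert counter() == 1 and counter() == 2
assert reduce(multiply, [1, 2, 3, 4]) == 24  # higher-order function: reduce takes a function as argument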
Reading files

with open("file", "r") as input_file:
    data = input_file.read()

import csv

def read_csv():
    with open('data.csv', 'r') as francesco:
        data = csv.reader(francesco, delimiter=';')
        for element in data:
            print(element[1])

read_csv()
Make data talk

from collections import Counter
import statistics

splitted_chunks = data.split()
print("Data length: %s" % len(data))
print("Number of chunks: %s" % len(splitted_chunks))
print("Unique chunks: %s" % len(set(splitted_chunks)))
print("Avg length of chunks: %s" % statistics.mean(map(len, splitted_chunks)))
print("Std dev length of chunks: %s" % statistics.pstdev(map(len, splitted_chunks)))
print("Frequency distribution: %s" % sorted(
    filter(lambda item: item[1] > 5, Counter(splitted_chunks).items()),
    key=lambda item: item[1]))
NLTK
- tokenization
- stemming
- tagging (see the short example below)
- parsing
- semantic reasoning
- classification
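Tagging is not covered in the slides that follow; as a minimal sketch, nltk.pos_tag assigns a part-of-speech tag to each token. The download calls fetch the tokenizer and tagger data packages (package names as in recent NLTK releases).

import nltk

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

tokens = nltk.word_tokenize("NLTK makes tagging easy")
print(nltk.pos_tag(tokens))  # e.g. [('NLTK', 'NNP'), ('makes', 'VBZ'), ('tagging', 'VBG'), ('easy', 'JJ')]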
Tokenizing

from nltk import word_tokenize
tokens = word_tokenize(data)

from nltk.tokenize import TweetTokenizer

tokenizer = TweetTokenizer(strip_handles=True)
s1 = '@remy: This is waaaaayyyy too much for you!!!!!!'
tw_tokens = tokenizer.tokenize(s1)
print(tw_tokens)
Frequency distribution

from nltk import FreqDist

fdist1 = FreqDist(splitted_chunks)
most_common = fdist1.most_common(50)
fdist1.plot(50, cumulative=True)
fdist1.plot(10)
print("Max frequency key: %s" % fdist1.max())
print("Occurrences of 'Parlamento': %s" % fdist1["Parlamento"])
print("Frequency of 'Parlamento': %s" % fdist1.freq('Parlamento'))
Cleaning data

from nltk.corpus import stopwords
import string

words = stopwords.words('italian')

def remove_stopword(word):
    return word not in words

lowered_chunks = list(map(lambda item: item.lower(), splitted_chunks))
print("Chunks length: %s" % len(lowered_chunks))

# filter the lowercased chunks so they match the (lowercase) stopword list
clean_chunks = list(filter(remove_stopword, lowered_chunks))
print("Cleaned chunks (without stopwords) length: %s" % len(clean_chunks))

clean_chunks = list(filter(lambda chunk: chunk not in string.punctuation, clean_chunks))
print("Cleaned chunks (without punctuation and stopwords) length: %s" % len(clean_chunks))

from nltk import FreqDist
fdist1 = FreqDist(clean_chunks)
most_common = fdist1.most_common(50)
Stemming

from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

stemmer = PorterStemmer()
stemmer.stem("activities")

available_langs = SnowballStemmer.languages
sn_stemmer = SnowballStemmer("spanish", ignore_stopwords=True)
print(sn_stemmer.stem("ordenador"))

from nltk.stem.lancaster import LancasterStemmer
LancasterStemmer().stem("activities")
Custom ngrams finder

from nltk import RegexpParser

# method of a class that provides the CHUNK_RULE grammar and the
# __leaves, __stemmer and __evaluate_polarity_ngram helpers
def find_and_analyze_ngrams(self, tagged_sent):
    chunker = RegexpParser(CHUNK_RULE)
    tree = chunker.parse(tagged_sent)
    for item in self.__leaves(tree):
        if not item == tagged_sent:
            probable_ngram = ' '.join(self.__stemmer.stem(word.lower())
                                      for (word, pos) in item)
            if self.__evaluate_polarity_ngram(probable_ngram):
                yield probable_ngram
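The chunk grammar and the private helpers are not shown on the slide. As a self-contained sketch of the same idea (the grammar and the leaves helper below are illustrative assumptions, not the author's originals):

from nltk import RegexpParser, pos_tag, word_tokenize

# hypothetical grammar: group sequences of adjectives followed by nouns
CHUNK_RULE = "NGRAM: {<JJ>*<NN.*>+}"

def leaves(tree):
    # yield the leaves of every subtree labelled 'NGRAM'
    for subtree in tree.subtrees(filter=lambda t: t.label() == "NGRAM"):
        yield subtree.leaves()

tagged_sent = pos_tag(word_tokenize("The quick brown fox jumps over the lazy dog"))
tree = RegexpParser(CHUNK_RULE).parse(tagged_sent)
for leaf in leaves(tree):
    print(' '.join(word for word, pos in leaf))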
Classifying data

from nltk.classify import NaiveBayesClassifier

def __get_elements_for_classification(self, lfeats, train_number, classifying=True):
    train_feats = []
    test_feats = []
    for label, feats in lfeats.items():
        if classifying:
            train_feats.extend([(feat, label) for feat in feats])
        else:
            cutoff = int(train_number * len(feats) / 10)
            train_feats.extend([(feat, label) for feat in feats[:cutoff]])
            test_feats.extend([(feat, label) for feat in feats[cutoff:]])
    nb_classifier = NaiveBayesClassifier.train(train_feats)
    return train_feats, test_feats, nb_classifier
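The lfeats argument is expected to map each label to a list of feature dictionaries. A minimal, hypothetical usage sketch (the feature extractor and the toy data are made up for illustration):

from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy

def bag_of_words(words):
    # simplest possible feature extractor: mark each word as present
    return {word: True for word in words}

lfeats = {
    "pos": [bag_of_words(s.split()) for s in ["great movie", "really great film"]],
    "neg": [bag_of_words(s.split()) for s in ["terrible movie", "really terrible film"]],
}

train_feats = [(feat, label) for label, feats in lfeats.items() for feat in feats]
classifier = NaiveBayesClassifier.train(train_feats)
print(classifier.classify(bag_of_words("great film".split())))
print(accuracy(classifier, train_feats))  # evaluated on the training set, just to show the call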
Pointwise Mutual Information

PMI(X = x, Y = y) = log [ p(X = x, Y = y) / (p(X = x) p(Y = y)) ]
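As a quick worked example with made-up probabilities: if p(X = x, Y = y) = 0.1 and p(X = x) = p(Y = y) = 0.2, then PMI = log2(0.1 / (0.2 * 0.2)) = log2(2.5) ≈ 1.32; a positive value means the pair co-occurs more often than it would if the two words were independent.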
Measure PMI
- Read from CSV
- Preprocess data (tokenize, lower, remove stopwords, punctuation)
- Find frequency distribution for unigrams
- Find frequency distribution for bigrams
- Compute PMI via implemented function
- Let NLTK sort bigrams by PMI metric (see the sketch after write_data)
- Write result to CSV file
Read data

import nltk
from nltk.corpus import stopwords
import string
import random
from itertools import chain
import math
import csv
import time

def read_data():
    """ Read data line by line """
    with open('data.csv', 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        for row in reader:
            yield row
Preprocess

def preprocess(data):
    """
    Preprocess data, filtering out stopwords and punctuation and lowercasing all split tokens

    :param data: the string data to be processed
    """
    italian_stopwords = stopwords.words('italian')
    splitted_chunks = data.split()
    lowered_chunks = (item.lower() for item in splitted_chunks)
    chunks_without_punctuation = (chunk for chunk in lowered_chunks
                                  if chunk not in string.punctuation)
    chunks_without_stopwords = (chunk for chunk in chunks_without_punctuation
                                if chunk not in italian_stopwords)
    return list(chunks_without_stopwords)
Find N-Grams

FREQUENCY_THRESHOLD = 2

def find_bigrams(splitted_chunks):
    """
    Find bigrams and filter them by frequency threshold

    :param splitted_chunks: a list of chunks
    """
    bigrams = nltk.collocations.BigramCollocationFinder.from_words(splitted_chunks)
    bigrams.apply_freq_filter(FREQUENCY_THRESHOLD)
    return {bigram: freq for bigram, freq in bigrams.ngram_fd.items()}

def find_unigrams(splitted_chunks):
    """
    Find unigrams and filter them by frequency threshold

    :param splitted_chunks: a list of chunks
    """
    unigrams = nltk.FreqDist(splitted_chunks)
    return {unigram: freq for unigram, freq in unigrams.items()
            if freq > FREQUENCY_THRESHOLD - 1}
Compute PMI

def pmi(word1, word2, unigram_freq, bigram_freq):
    """
    Find PMI measure

    :param word1: the first word
    :param word2: the second word
    :param unigram_freq: the unigram frequency container
    :param bigram_freq: the bigram frequency container
    """
    prob_word1 = unigram_freq[word1] / sum(unigram_freq.values())
    prob_word2 = unigram_freq[word2] / sum(unigram_freq.values())
    prob_word1_word2 = bigram_freq[(word1, word2)] / sum(bigram_freq.values())
    # note the parentheses: PMI divides by the product of the unigram probabilities
    a = prob_word1_word2 / (prob_word1 * prob_word2)
    return round(math.log(a, 2), 2)
Write result to CSV

def write_data(result):
    """
    Write result to CSV file

    :param result: the list to be written to the csv file
    """
    with open("result.csv", "a") as output:
        writer = csv.writer(output, delimiter='*')
        for row in result:
            writer.writerow(row)
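The slides do not show the driver that ties these steps together. A hedged sketch of what it could look like, including the "let NLTK sort bigrams by PMI" step via BigramAssocMeasures; the overall structure (and the assumption that the text sits in the first CSV column) is mine, not the author's original script:

from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

def main():
    # read_data() yields CSV rows; assume the text of interest is in the first column
    raw_text = ' '.join(row[0] for row in read_data())
    chunks = preprocess(raw_text)

    unigrams = find_unigrams(chunks)
    bigrams = find_bigrams(chunks)

    # PMI via the implemented function (assumes both words survived the frequency filter)
    scored = [(w1, w2, pmi(w1, w2, unigrams, bigrams)) for (w1, w2) in bigrams]

    # PMI via NLTK: score_ngrams returns bigrams sorted by the chosen measure
    finder = BigramCollocationFinder.from_words(chunks)
    finder.apply_freq_filter(FREQUENCY_THRESHOLD)
    nltk_scored = finder.score_ngrams(BigramAssocMeasures().pmi)

    write_data(scored)

if __name__ == "__main__":
    main()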
Happy coding :)
