How to use langdetect - 10 common examples

To help you get started, we've selected ten langdetect examples drawn from popular ways the library is used in public projects.
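Every snippet below leans on the same small API surface, so here is a minimal sketch of it up front. The function and exception names come from langdetect itself; the sample strings and printed values are only illustrative, since detection is probabilistic.

from langdetect import detect, detect_langs
from langdetect.lang_detect_exception import LangDetectException

try:
    print(detect("This is clearly an English sentence."))        # 'en'
    print(detect_langs("This is clearly an English sentence."))  # e.g. [en:0.999997]
except LangDetectException:
    # raised when no features can be extracted, e.g. for empty
    # strings or strings that contain no letters at all
    print("could not detect a language")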


From github.com/miku/siskin (siskin/sources/lissa.py):
dois = [v.replace("http://dx.doi.org/", "") for v in source["identifiers"] if "doi.org" in v]

if len(dois) == 0:
    self.logger.warn("document without DOI")
elif len(dois) == 1:
    doc.update({"doi": dois[0]})
else:
    # In 08/2019, various DOI seem to work.
    self.logger.warn("document with multiple dois: %s", dois)
    doc.update({"doi": dois[0]})

if doc.get("language"):
    doc.update({"language": doc.get("language")})
else:
    if len(doc["abstract"]) > 20:
        result = langdetect.detect(doc["abstract"])
        doc["languages"] = [languages.get(alpha2=result).bibliographic]
        self.logger.debug("detected %s in abstract (%s)", doc["languages"], doc["abstract"][:40])

# Gather subjects.
subjects = source.get("subjects", []) + source.get("subject_synonyms", []) + source.get("tags", [])
unique_subjects = set(itertools.chain(*[v.split("|") for v in subjects]))
doc.update({"x.subjects": list(unique_subjects)})

# Try date_published, then date_created, then fail.
for key in ("date_published", "date_created"):
    if key not in source or not source[key]:
        continue
    doc.update({
        "x.date": source[key][:19] + "Z",
        "rft.date": source[key][:10],
    })
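This snippet stores a three-letter bibliographic language code rather than langdetect's two-letter result. The languages import is outside the excerpt, so the following isolated sketch is an inference: it assumes the iso-639 package (pip install iso-639) provides the languages lookup used above.

import langdetect
from iso639 import languages  # assumption: the iso-639 package

abstract = "Ein etwas laengerer deutschsprachiger Abstract zu einem Artikel."
if len(abstract) > 20:  # same length guard as the excerpt
    alpha2 = langdetect.detect(abstract)               # e.g. 'de'
    print(languages.get(alpha2=alpha2).bibliographic)  # e.g. 'ger'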
From github.com/kearch/kearch (packages/specialist_crawler_child/webpage.py):
for script in soup(["script", "style"]):
    script.extract()  # rip javascript out
try:
    self.set_links(soup)
except ValueError:
    raise WebpageError('Cannot set links')
try:
    self.title = str(soup.title.string)
    self.text = str(soup.body.text)
except AttributeError:
    raise WebpageError('Cannot get title or text')
try:
    self.language = langdetect.detect(self.text)
    if not self.language == language:
        raise WebpageError("Language doesn't match.")
except langdetect.lang_detect_exception.LangDetectException:
    raise WebpageError('Cannot detect language.')
self.title_words = self.text_to_words(self.title, language=self.language)
# convert all whitespace to single spaces
self.text = ' '.join(
    filter(lambda x: not x == '', re.split('\s', self.text)))
# This version does not handle multibyte characters
self.summary = self.text[:500]
self.words = self.text_to_words(self.text, language=self.language)
From github.com/tahaHichri/ML-fomo (main.py):
def sanitize_text(self, text):
    try:
        if detect(text) == 'en':
            allow_in_dict = True
        else:
            allow_in_dict = False
    except:
        allow_in_dict = False

    # remove non-words
    sanitized_text = ' '.join(re.sub(
        "(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", text).split())
    self.stop_words = set(stopwords.words('english'))
    self.stop_words.update(STOPWORDS)
    self.stop_words.update(self.ignored_words)
    word_tokens = word_tokenize(sanitized_text)
    # filtered_sentence = [w for w in word_tokens if not w in stop_words and len(w) > 1]
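The bare except: above most often swallows LangDetectException, which langdetect raises when the input has no usable features (empty strings, emoji-only tweets, bare URLs). A narrower sketch of the same guard, with a hypothetical helper name:

from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

def is_english(text):
    """Hypothetical helper: True only when *text* is detected as English."""
    try:
        return detect(text) == 'en'
    except LangDetectException:
        # no detectable features in the input
        return False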
From github.com/mulhod/reviewer_experience_prediction (util/read_data_files.py):
try:
    h = float(lines[i].split()[1].strip())
    r = lines[i + 1].split(' ', 1)[1].strip()
except (ValueError, IndexError) as e:
    i += 2
    continue
# Skip reviews that don't have any characters
if not len(r):
    i += 2
    continue
# Skip reviews if they cannot be recognized as English
try:
    if not detect(r) == 'en':
        i += 2
        continue
except LangDetectException:
    i += 2
    continue
# Now we append the 2-key dict to the end of reviews
reviews.append(dict(hours=h, review=r))
# Increment i by 2 since we need to go to the next 2-line couplet
i += 2

return reviews
From github.com/kaustubhhiware/facebook-archive (wordclouds.py):
import os

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import nltk
from nltk.stem import PorterStemmer
from PIL import Image
from nltk.tokenize import sent_tokenize, word_tokenize
from langdetect import detect
import langdetect as ld

nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

PS = PorterStemmer()
MASK_LOC = "images/wordclouds/mymask.png"
LD_EXC = ld.lang_detect_exception.LangDetectException


def wordcloud():
    """
    Analysing users' posts, comments and friends data.

    Generate wordclouds of commonly used words from users' posts and comments.
    Find out the most used language in posts and comments.
    Generate wordcloud of friends' names, most tagged in your posts.
    """
    loc = input('Enter facebook archive extracted location: ')
    if not os.path.isdir(loc):
        print("The provided location doesn't seem to be right")
        exit(1)
    fname = loc + '/comments/comments.json'
From github.com/City-of-Helsinki/linkedevents (events/importer/util.py):
paragraphs = re.split(r'(<p></p><p>|\n|</p>|<p>| – |<br><br><br>)+', text)
separated = {script: '' for script in scripts}
# the first language given is the default one
last_language = scripts[0]
last_paragraph = ''
for paragraph in paragraphs:
    if paragraph in (r'</p><p>', r'</p>', r'\n', r'<p>', r'<br><br><br>'):
        # skip paragraph breaks to prevent misdetection
        separated[last_language] += paragraph
        last_paragraph = paragraph
        continue
    # replace any misleading tags left
    paragraph_stripped = re.sub(
        r'(&lt;(/)?strong&gt;)|(<br>)+|&amp;|<a href=".*&quot;">|</a>', ' ', paragraph)
    try:
        language = detect(paragraph_stripped)
    except LangDetectException:
        # an exception means no language could be detected
        language = last_language
    # langdetect maps "Simplified Chinese" to "zh-cn",
    # however, we store it as "zh_hans"
    if language == "zh-cn":
        language = "zh_hans"
    if language not in scripts:
        # only detect allowed languages, no exceptions
        language = last_language
    if language != last_language:
        # fix html paragraph breaks after language change
        logger.debug('supported language detected: ' + language)
        if last_paragraph in (r'</p><p>', r'</p>', r'<p>'):
            separated[last_language] = re.sub(r'</p><p>$', '', separated[last_language])
            separated[language] += r'</p><p>'
        # remove useless dashes after language change
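'zh-cn' is one of only two region-tagged codes langdetect emits (the other is 'zh-tw'); every other result is a plain ISO 639-1 code, which is why the importer needs just this single remapping. A quick illustration, with typical rather than guaranteed outputs:

from langdetect import detect

print(detect("这是一段用来测试语言检测的简体中文文本。"))  # typically 'zh-cn'
print(detect("An English sentence for comparison."))  # typically 'en'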
From github.com/stanfordnlp/cocoa (craigslistbargain/scripts/generate_scenarios.py):
def is_valid_line(line):
    if 'contact' in line.lower():
        return False
    if not re.search(r'\.|\!|\,', line) and len(line.split()) > 15:
        return False
    if re.search(r'\$\s*\d+', line):
        return False
    try:
        if langdetect.detect(line) != 'en':
            return False
    except langdetect.lang_detect_exception.LangDetectException:
        return True
    return True
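Because langdetect seeds its detector randomly, a borderline line can pass is_valid_line on one run and fail on the next. The remedy documented by the library is to fix the factory seed once at startup:

from langdetect import DetectorFactory, detect

DetectorFactory.seed = 0  # makes detect() reproducible across runs
print(detect("Otro ejemplo corto."))  # now stable between executions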
From github.com/tiotdev/steem-curationbot (curationbot.py):
def is_eligible(text, n, lng):
    """Returns True if *text* contains at least *n* words in the specified *lng* language"""
    for language in detect_langs(text):
        if language.lang == lng:
            probability = language.prob
            word_count = len(text.split(" "))
            if probability * word_count > n:
                return True
        else:
            break
    return False
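detect_langs returns Language objects sorted by descending probability, each carrying a .lang code and a .prob estimate; that is what is_eligible iterates over, giving up as soon as the top-ranked candidate is not lng. Standalone, the call looks like this:

from langdetect import detect_langs

for candidate in detect_langs("Hello world, most of this text is plainly English."):
    print(candidate.lang, candidate.prob)  # e.g. en 0.9999963...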
From github.com/kearch/kearch (webpage.py):
script.extract()  # rip javascript out
try:
    self.set_links(soup)
except ValueError:
    raise WebpageError('Cannot set links')
try:
    self.title = str(soup.title.string)
    self.text = str(soup.body.text)
except AttributeError:
    raise WebpageError('Cannot get title or text')
try:
    self.language = langdetect.detect(self.text)
except langdetect.lang_detect_exception.LangDetectException:
    raise WebpageError('Cannot detect language.')
self.title_words = self.text_to_words(self.title)
# convert all whitespace to single spaces
self.text = ' '.join(
    filter(lambda x: not x == '', re.split('\s', self.text)))
# This version does not handle multibyte characters
self.text = self.remove_non_ascii_character(self.text)
self.summary = self.text[:500]
self.words = self.text_to_words(self.text)
From github.com/kearch/kearch (packages/kearch_classifier/kearch_classifier/webpage.py):
try:
    self.title = str(soup.title.string)
    self.text = str(soup.body.text)
except AttributeError:
    raise WebpageError('Cannot get title or text')
try:
    print('webpage.py start detecting language ' + url, file=sys.stderr)
    self.language = langdetect.detect(self.text)
    print('webpage.py finish detecting language ' + url, file=sys.stderr)
    if not self.language == language:
        raise WebpageError("Language doesn't match.")
except langdetect.lang_detect_exception.LangDetectException:
    raise WebpageError('Cannot detect language.')
print('webpage.py start text_to_words for title ' + url, file=sys.stderr)
self.title_words = self.text_to_words(
    self.title, language=self.language)
print('webpage.py finish text_to_words for title ' + url, file=sys.stderr)
# convert all whitespace to single spaces
# self.text = ' '.join(
#     filter(lambda x: not x == '', re.split('\s', self.text)))
# This version does not handle multibyte characters
self.summary = self.text[:500]
print('webpage.py start text_to_words for text ' + url, file=sys.stderr)