Here is a simple script to accomplish this task:
In your virtual environment, run:
pip install PyPDF2 pip install spacy python -m spacy download fr_core_news_sm import PyPDF2 import spacy from spacy.lang.fr.stop_words import STOP_WORDS from string import punctuation from collections import Counter from heapq import nlargest import re def extract_text_from_pdf(file_path): """Extracts text from a PDF file.""" pdf_file_obj = open(file_path, 'rb') pdf_reader = PyPDF2.PdfReader(pdf_file_obj) text = "" for page_num in range(len(pdf_reader.pages)): page_obj = pdf_reader.pages[page_num] text += page_obj.extract_text() pdf_file_obj.close() return text def summarize(text, ratio=0.0013): """Summarizes the given text using SpaCy.""" nlp = spacy.load('fr_core_news_sm') doc = nlp(text) tokens = [token.text for token in doc if not token.is_stop and not token.is_punct] word_frequencies = {} for word in tokens: if word not in word_frequencies.keys(): word_frequencies[word] = 1 else: word_frequencies[word] += 1 max_frequency = max(word_frequencies.values()) for word in word_frequencies.keys(): word_frequencies[word] = word_frequencies[word] / max_frequency sentence_scores = {} for sent in doc.sents: for word_value in sent: if word_value.text.lower() in word_frequencies.keys(): if sent not in sentence_scores.keys(): sentence_scores[sent] = word_frequencies[word_value.text.lower()] else: sentence_scores[sent] += word_frequencies[word_value.text.lower()] select_length = int(len(sentence_scores) * ratio) summary_sentences = nlargest(select_length, sentence_scores, key=sentence_scores.get) final_summary = [str(sentence) for sentence in summary_sentences] summary = ' '.join(final_summary) return summary # Path to your PDF file file_path = 'sy.pdf' # Extract text from PDF pdf_text = extract_text_from_pdf(file_path) summary = summarize(pdf_text ) print("Summary:") print(summary)
Top comments (0)