python例子：文章分析工具（可視化）

奧莉芙小異 2024-08-17 發布于江西

展開全文

作品介紹

作品名稱：文章分析工具（可視化）

開發環境：PyCharm 2023.3.4 + python3.7

用到的庫：sys、os、PyQt5（界面）、pdfplumber（獲取pdf的內容）、snownlp、docx（獲取word文件的內容）

作品簡介：運行例子后，先選擇文件（支持txt、pdf和docx三種類型），然后點擊按鈕“分析文章”，最后把文章分析出的關鍵詞和摘要回顯到界面。

實現過程

一、閱讀器UI設計

1、安裝模塊和配置工具，參考《python例子：翻譯器（簡單）》；

2、運行工具QtDesigner，利用QtDesigner工具箱設計出界面效果（所需要的控件可查看右邊區域），保存效果為文件aa.ui；

3、對文件aa.ui執行pyUIC（ui轉化為py代碼），執行完生成文件aa.py。

二、代碼設計

1、新建文件articleAnalysis.py，該文件為項目主文件，初始化頁面并顯示；

2、添加內置模塊（下面代碼使用）和主方法（用于運行后彈出界面）；

from PyQt5.QtWidgets import *# 引入自定義模塊import aa# 引入庫(kù)import sysimport osimport pdfplumberfrom snownlp import SnowNLPfrom docx import Document

class parentWindow(QWidget, aa.Ui_Form):
    """Main window of the article analysis tool.

    Combines ``QWidget`` with ``aa.Ui_Form`` (the class pyUIC generated from
    aa.ui) and connects the two buttons to their handlers. The handlers
    ``select_file`` and ``analysis_article`` are added to this class in the
    following steps of the tutorial.
    """

    def __init__(self):
        """Build the form and wire the button signals."""
        super(parentWindow, self).__init__()
        # Create the widgets declared by the generated form class.
        self.setupUi(self)
        # Result of the last file dialog; '' until a file has been chosen.
        self.download_path = ''
        # "Select file" button.
        self.selectButton.clicked.connect(self.select_file)
        # "Analyse article" button.
        self.analysisButton.clicked.connect(self.analysis_article)

if __name__ == '__main__':
    # Every PyQt5 application must create one application object.
    app = QApplication(sys.argv)
    # Build the main window and show it.
    window = parentWindow()
    window.show()
    # Run the Qt event loop and use its result as the process exit code.
    sys.exit(app.exec_())

3、增加函數select_file，用來實現選擇文件，并把文件路徑回顯到界面；

def select_file(self):
    """Ask the user for a file and show its path in the form.

    On success ``self.download_path`` holds the value returned by
    ``QFileDialog.getOpenFileName`` (indexed with ``[0]`` for the path) and
    the path is written to ``self.fileEdit``. If the dialog is dismissed
    without a choice, a notice is shown and both the stored selection and
    the displayed path are cleared so they cannot disagree.
    """
    # File-type filters offered by the dialog.
    filter_types = '文本文件 (*.txt);;Word 文件 (*.docx *.doc);;PDF 文件 (*.pdf)'
    # Open the dialog in the "files" directory under the working directory.
    start_dir = os.path.join(os.getcwd(), 'files')
    selection = QFileDialog.getOpenFileName(
        self, '選擇要分析的文章', start_dir, filter_types)

    # An empty path means no file was selected.
    if not selection[0].strip():
        QMessageBox.information(self, '提示信息', '沒有選擇文件')
        self.download_path = ''
        # Do not keep displaying a path that is no longer selected.
        self.fileEdit.setText('')
        return

    self.download_path = selection
    self.fileEdit.setText(selection[0])

4、增加公共函數conversion_data，用來把列表數據轉換為字符串數據；

def conversion_data(self, lists, symbol):
    """Join the strings in ``lists`` into one string.

    Args:
        lists: Iterable of strings to combine.
        symbol: Separator placed between consecutive items.

    Returns:
        The joined string; '' when ``lists`` is empty.
    """
    return symbol.join(lists)

5、增加函數get_txt，用來獲取txt文本的所有內容；

def get_txt(self, file_path):
    """Return the whole content of a UTF-8 text file.

    Args:
        file_path: Path of the .txt file to read.

    Returns:
        The file content as a single string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # 'utf-8-sig' reads plain UTF-8 and also drops a leading byte-order
    # mark, which would otherwise end up inside the analysed text.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        return file.read()

6、增加函數get_word，用來獲取word文件的所有內容；

def get_word(self, file_path):
    """Return the text of every paragraph of a Word document.

    Args:
        file_path: Path of the document, opened with ``docx.Document``.

    Returns:
        The text of each entry in ``doc.paragraphs``, each followed by a
        newline, concatenated in document order.
    """
    # NOTE(review): python-docx targets the .docx format; a legacy .doc
    # file is expected to make Document() raise - confirm against callers.
    doc = Document(file_path)
    # join() builds the result in one pass instead of repeated "+=".
    return ''.join(f'{para.text}\n' for para in doc.paragraphs)

7、增加函數get_pdf，用來獲取pdf文件的所有內容；

def get_pdf(self, file_path):
    """Return the text of every page of a PDF file.

    Args:
        file_path: Path of the PDF, opened with ``pdfplumber.open``.

    Returns:
        The extracted text of each page, each followed by a newline,
        concatenated in page order.
    """
    with pdfplumber.open(file_path) as pdf:
        # extract_text() can yield None for a page without extractable
        # text (depending on the pdfplumber version); "or ''" keeps the
        # literal word "None" out of the analysed text.
        return ''.join(
            f"{page.extract_text() or ''}\n" for page in pdf.pages)

8、增加函數analysis_article，用來實現分析文章的功能；

def analysis_article(self):
    """Analyse the selected file and show its keywords and summary.

    Reads the file chosen via ``select_file`` with the reader matching its
    extension, runs SnowNLP on the text, and writes five keywords to
    ``self.keywordEdit`` and a three-sentence summary to
    ``self.abstractEdit``. Any problem (no file, unsupported type, read
    failure, empty text) is reported in a message box instead of raising
    out of the Qt slot.
    """
    # Nothing has been selected yet.
    if not self.download_path:
        QMessageBox.information(self, '提示信息', '請先選擇文件')
        return

    file_path = self.download_path[0]
    # Extension without the dot; lower-cased so ".TXT" matches "txt".
    file_extension = os.path.splitext(file_path)[1][1:].lower()

    # One reader per supported extension.
    readers = {
        'txt': self.get_txt,
        'docx': self.get_word,
        'doc': self.get_word,
        'pdf': self.get_pdf,
    }
    reader = readers.get(file_extension)
    if reader is None:
        # Previously this path reached SnowNLP with "content" unbound.
        QMessageBox.information(self, '提示信息', '不支持的文件類型')
        return

    try:
        content = reader(file_path)
    except Exception as err:  # UI boundary: report, do not crash the app
        QMessageBox.information(self, '提示信息', f'讀取文件失敗：{err}')
        return

    # An empty document gives SnowNLP nothing to rank.
    if not content.strip():
        QMessageBox.information(self, '提示信息', '文件內容為空')
        return

    s = SnowNLP(content)
    # Five keywords; the second positional argument is SnowNLP's "merge"
    # option, not a TF-IDF switch.
    keywords = s.keywords(5, True)
    self.keywordEdit.setText(self.conversion_data(keywords, ','))
    # Three-sentence summary, one sentence per line.
    summary = s.summary(3)
    self.abstractEdit.setText(self.conversion_data(summary, '\n'))

9、全部代碼如下。

'''示例：文章分析工具（可視化）'''from PyQt5.QtWidgets import *# 引入自定義模塊import aa# 引入庫(kù)import sysimport osimport pdfplumberfrom snownlp import SnowNLPfrom docx import Document

class parentWindow(QWidget, aa.Ui_Form):
    """Main window of the article analysis tool.

    Combines ``QWidget`` with ``aa.Ui_Form`` (the class pyUIC generated from
    aa.ui). The user picks a txt, Word or PDF file; its text is analysed
    with SnowNLP and the keywords and summary are shown in the form.
    """

    def __init__(self):
        """Build the form and wire the button signals."""
        super(parentWindow, self).__init__()
        # Create the widgets declared by the generated form class.
        self.setupUi(self)
        # Result of the last file dialog; '' until a file has been chosen.
        self.download_path = ''
        # "Select file" button.
        self.selectButton.clicked.connect(self.select_file)
        # "Analyse article" button.
        self.analysisButton.clicked.connect(self.analysis_article)

    def select_file(self):
        """Ask the user for a file and show its path in the form.

        If the dialog is dismissed without a choice, a notice is shown and
        both the stored selection and the displayed path are cleared so
        they cannot disagree.
        """
        # File-type filters offered by the dialog.
        filter_types = '文本文件 (*.txt);;Word 文件 (*.docx *.doc);;PDF 文件 (*.pdf)'
        # Open the dialog in the "files" directory under the working dir.
        start_dir = os.path.join(os.getcwd(), 'files')
        selection = QFileDialog.getOpenFileName(
            self, '選擇要分析的文章', start_dir, filter_types)

        # An empty path means no file was selected.
        if not selection[0].strip():
            QMessageBox.information(self, '提示信息', '沒有選擇文件')
            self.download_path = ''
            # Do not keep displaying a path that is no longer selected.
            self.fileEdit.setText('')
            return

        self.download_path = selection
        self.fileEdit.setText(selection[0])

    def analysis_article(self):
        """Analyse the selected file and show its keywords and summary.

        Any problem (no file, unsupported type, read failure, empty text)
        is reported in a message box instead of raising out of the Qt slot.
        """
        # Nothing has been selected yet.
        if not self.download_path:
            QMessageBox.information(self, '提示信息', '請先選擇文件')
            return

        file_path = self.download_path[0]
        # Extension without the dot; lower-cased so ".TXT" matches "txt".
        file_extension = os.path.splitext(file_path)[1][1:].lower()

        # One reader per supported extension.
        readers = {
            'txt': self.get_txt,
            'docx': self.get_word,
            'doc': self.get_word,
            'pdf': self.get_pdf,
        }
        reader = readers.get(file_extension)
        if reader is None:
            # Previously this path reached SnowNLP with "content" unbound.
            QMessageBox.information(self, '提示信息', '不支持的文件類型')
            return

        try:
            content = reader(file_path)
        except Exception as err:  # UI boundary: report, do not crash
            QMessageBox.information(
                self, '提示信息', f'讀取文件失敗：{err}')
            return

        # An empty document gives SnowNLP nothing to rank.
        if not content.strip():
            QMessageBox.information(self, '提示信息', '文件內容為空')
            return

        s = SnowNLP(content)
        # Five keywords; the second positional argument is SnowNLP's
        # "merge" option, not a TF-IDF switch.
        keywords = s.keywords(5, True)
        self.keywordEdit.setText(self.conversion_data(keywords, ','))
        # Three-sentence summary, one sentence per line.
        summary = s.summary(3)
        self.abstractEdit.setText(self.conversion_data(summary, '\n'))

    def get_txt(self, file_path):
        """Return the whole content of the UTF-8 text file at ``file_path``."""
        # 'utf-8-sig' reads plain UTF-8 and also drops a leading byte-order
        # mark, which would otherwise end up inside the analysed text.
        with open(file_path, 'r', encoding='utf-8-sig') as file:
            return file.read()

    def get_word(self, file_path):
        """Return every paragraph of a Word document, one per line."""
        # NOTE(review): python-docx targets the .docx format; a legacy .doc
        # file is expected to make Document() raise - confirm.
        doc = Document(file_path)
        return ''.join(f'{para.text}\n' for para in doc.paragraphs)

    def get_pdf(self, file_path):
        """Return the extracted text of every PDF page, one page per line."""
        with pdfplumber.open(file_path) as pdf:
            # extract_text() can yield None for a page without extractable
            # text (depending on the pdfplumber version); "or ''" keeps the
            # literal word "None" out of the analysed text.
            return ''.join(
                f"{page.extract_text() or ''}\n" for page in pdf.pages)

    def conversion_data(self, lists, symbol):
        """Join the strings in ``lists`` using ``symbol`` as separator."""
        return symbol.join(lists)

if __name__ == '__main__':
    # Every PyQt5 application must create one application object.
    app = QApplication(sys.argv)
    # Build the main window and show it.
    window = parentWindow()
    window.show()
    # Run the Qt event loop and use its result as the process exit code.
    sys.exit(app.exec_())

本站是提供個人知識管理的網絡存儲空間，所有內容均由用戶發布，不代表本站觀點。請注意甄別內容中的聯系方式、誘導購買等信息，謹防詐騙。如發現有害或侵權內容，請點擊一鍵舉報。

轉(zhuǎn)藏 分享

QQ空間 QQ好友新浪微博微信

獻(xiàn)花（0） +1

來自：奧莉芙小異 > 《待分類》

舉報/認領