[Fix] Brought the names of variables and methods to snake_case.

[Fix] Changed the method for removing duplicates of table paragraphs so that the addition of a structural element takes place outside of it

[Fix] Brought the names of variables and methods to snake_case.
[Fix] Changed the method for removing duplicates of table paragraphs so that the addition of a structural element takes place outside of it
4a9fd476 · slavamarcin · 96a5dee4 · 4a9fd476 · 4a9fd476 · 4a9fd476
Commit 4a9fd476 authored 2 years ago by slavamarcin
Expand all Hide whitespace changes
Inline Side-by-side

Showing

with 349 additions and 342 deletions
+349 -342
--- a/src/Class/DocumentClass.py
+++ b/src/Class/DocumentClass.py
@@ -18,22 +18,22 @@ Parameters:
 Methods

 ----------
-    addContent(id, paragraph)
+    add_content(id, paragraph)
        Adds a paragraph to the content list
    
-    ptToSm(value)
+    pt_to_sm(value)
        Converts typographical points to centimeters
    
-    dmToSm(value)
+    dm_to_sm(value)
        Converts inches to centimeters
    
-    createJsonToClasifier(listOfAttr)
+    create_json_to_clasifier(list_of_attr)
        Creates and returns a json string, which will later be sent to the classifier
    
-    requestToClasify(jsonText, api =)
+    request_to_clasify(json_text, api =)
        Sends a request to the classification module
    
-    writeCSV(path):
+    write_CSV(path):
        Generates csv file based on content
    
 """
@@ -43,7 +43,7 @@ class Class:
        self.__time = time
        self.__content = {}

-    def addContent(self, paragraph_id, paragraph):
+    def add_content(self, paragraph_id, paragraph):
        """

        Adds a paragraph to the content list
@@ -55,7 +55,7 @@ class Class:
        self.content[paragraph_id] = paragraph
    ## Points to centimeters
    @classmethod
-    def ptToSm(cls, value):
+    def pt_to_sm(cls, value):
        """

        Converts typographical points to centimeters
@@ -68,7 +68,7 @@ class Class:
        return value/28.346
    ## Inches to centimeters
    @classmethod
-    def dmToSm(cls, value):
+    def dm_to_sm(cls, value):
        """

        Converts inches to centimeters
@@ -80,18 +80,18 @@ class Class:
        """
        return value * 2.54

-    def createJsonToClasifier(self, listOfAttr = ["countOfSpSbl","countSbl","lowercase","uppercase","lastSbl",
-                                                  "firstkey","prevEl","curEl","nexEl","bold","italics",
-                                                  "keepLinesTogether","keepWithNext", "outlineLevel",
-                                                  "pageBreakBefore"]
+    def create_json_to_clasifier(self, list_of_attr = ["countn_of_sp_sbl","count_sbl","lowercase","uppercase","last_sbl",
+                                                  "firstkey","prev_el","cur_el","next_el","bold","italics",
+                                                  "keep_lines_together","keep_with_next", "outline_level",
+                                                  "page_breake_before"]
                              ):
        """

        Creates and returns a json string, which will later be sent to the classifier

-        :param listOfAttr: List of attributes included in json string
+        :param list_of_attr: List of attributes included in json string

-        :return jsonText: Generated Json string
+        :return json_text: Generated Json string

        """

@@ -104,22 +104,22 @@ class Class:
            if p.__class__ != PDFTable :
                s = s + "\"" + str(i) + "\": {\""
                for attribute in dir(p):
-                    if not attribute.startswith('_') and attribute in listOfAttr:
+                    if not attribute.startswith('_') and attribute in list_of_attr:
                        s = s + attribute + "\": \"" + str(getattr(p,attribute)) + "\",\""
                l = len(s)
                s = s[:l - 2] + "}, "
        l = len(s)
        s = s[:l - 2] + "}}"
-        jsonText = json.loads(s)
-        return jsonText
+        json_text = json.loads(s)
+        return json_text

    @classmethod
-    def requestToClasify(cls, jsonText, api = "http://127.0.0.1:8001/clasify"):
+    def request_to_clasify(cls, json_text, api = "http://127.0.0.1:8001/clasify"):
        """

        Sends a request to the classification module

-        :param jsonText: The json string to send
+        :param json_text: The json string to send
        :param api: API where the request is sent

        :return response: Response received from the API
@@ -127,10 +127,10 @@ class Class:
        """

        import requests
-        response = requests.post(api, json= jsonText)
+        response = requests.post(api, json= json_text)
        return response

-    def writeCSV(self, path = 'pdftocsv.csv'):
+    def write_CSV(self, path = 'pdftocsv.csv'):

        """

@@ -143,16 +143,16 @@ class Class:
        import csv
        with open(path, 'w', newline='', encoding="utf-8") as csvfile:
            filewriter = csv.writer(csvfile, delimiter=',', quoting=csv.QUOTE_MINIMAL)
-            filewriter.writerow(["text","countOfSpSbl","countSbl","uppercase", "lowercase","fontName","lastSbl",
-                                 "firstkey","indent","lineSpacing","textSize"])
+            filewriter.writerow(["text","countn_of_sp_sbl","count_sbl","uppercase", "lowercase","font_name","last_sbl",
+                                 "firstkey","indent","line_spasing","text_size"])
            for key in self.content.keys():
                if type(self.content.get(key))!= PDFTable:
-                    filewriter.writerow([self.content.get(key).text, self.content.get(key).countOfSpSbl ,
-                                         self.content.get(key).countSbl,self.content.get(key).uppercase,
-                                         self.content.get(key).lowercase, self.content.get(key).fontName,
-                                         self.content.get(key).lastSbl,self.content.get(key).firstkey,
-                                         self.content.get(key).indent, self.content.get(key).lineSpacing,
-                                         self.content.get(key).textSize])
+                    filewriter.writerow([self.content.get(key).text, self.content.get(key).countn_of_sp_sbl,
+                                         self.content.get(key).count_sbl,self.content.get(key).uppercase,
+                                         self.content.get(key).lowercase, self.content.get(key).font_name,
+                                         self.content.get(key).last_sbl,self.content.get(key).firstkey,
+                                         self.content.get(key).indent, self.content.get(key).line_spasing,
+                                         self.content.get(key).text_size])
                else:
                    filewriter.writerow([self.content.get(key).text])


--- a/src/Class/Paragraph.py
+++ b/src/Class/Paragraph.py
--- a/src/PDF/PDFParser.py
+++ b/src/PDF/PDFParser.py
@@ -3,7 +3,7 @@ import pdfplumber
 from src.Class.DocumentClass import Class
 from src.Class.Paragraph import Paragraph
 from src.PDF.Line import Line
-from src.PDF.ParagraphLine import Pdfparagraph
+from src.PDF.ParagraphLine import PdfParagraph
 from src.PDF.Table import PDFTable


@@ -58,11 +58,11 @@ class PDFParser:
    @path.setter
    def path(self, path):
        self._path = path
+
    @path.setter
    def prevEl(self, path):
        self._path = path

-
    @property
    def pdf(self):
        return self._pdf
@@ -95,7 +95,7 @@ class PDFParser:
    def list_of_table(self, list_of_table):
        self._list_of_table = list_of_table

-    def getLinesAndTables(self):
+    def get_lines_and_tables(self):

        """

@@ -117,7 +117,7 @@ class PDFParser:
        no_change_size = True

        for number_of_page, page in enumerate(self.pdf.pages):
-            #Extracting tables and tabular text
+            # Extracting tables and tabular text
            tables = page.find_tables()
            tables_text = page.extract_tables()
            for number_of_table, table in enumerate(tables):
@@ -125,19 +125,19 @@ class PDFParser:
                current_table.addText(tables_text[number_of_table])
                self.list_of_table.append(current_table)

-            #Selecting text strings
+            # Selecting text strings
            text = ""
            for i, char in enumerate(page.chars):
                if y0 is not None:
                    y0 = round(y0)
-                #Condition for adding a character to a string
+                # Condition for adding a character to a string
                if (round(char.get('y0')) == y0) or (int(char.get('y0')) == y0) \
                        or text == '−' or text == '–' or text == "•":
                    chars.append(char)
                    text = text + char.get('text')
                    x1 = char.get('x1')
                    if i != 0:
-                        #Font and line size selection
+                        # Font and line size selection
                        if char.get('fontname') not in fontname:
                            no_change_font_name = False
                            fontname.append(char.get('fontname'))
@@ -151,7 +151,7 @@ class PDFParser:
                        y1 = char.get('y1')
                else:
                    if i != 0:
-                        #Deleting headers and footers
+                        # Deleting headers and footers
                        if re.search(r'^\d+ $', text) is None and y0 > 60 and text != '':
                            if len(chars) != 0:
                                x0 = chars[0].get('x0')
@@ -167,7 +167,7 @@ class PDFParser:
                    size = []
                    no_change_font_name = True
                    no_change_size = True
-                    #Deleting empty lines
+                    # Deleting empty lines
                    if text == "" and char.get('text') == ' ':
                        continue
                    chars.append(char)
@@ -182,13 +182,14 @@ class PDFParser:
        return self.lines, self.list_of_table

    @staticmethod
-    def getSpace(lines):
+    def get_space(lines):

        """

        Calculates the line spacing between two subsequent lines

-        :param lines: list of document lines
+        :param
+            lines: list of document lines

        :return
            spaces: List of calculated line spacing
@@ -198,7 +199,7 @@ class PDFParser:
        spaces = []
        i = 0
        while i < len(lines):
-            #If the line is the last one on the page, it is assigned a value equal to zero
+            # If the line is the last one on the page, it is assigned a value equal to zero
            if i != len(lines) - 1 and (lines[i].y0 - lines[i + 1].y1 > 0):
                spaces.append(lines[i].y0 - lines[i + 1].y1)
            else:
@@ -206,35 +207,35 @@ class PDFParser:
            i = i + 1
        return spaces

-    def addParagraphInDocumentWithAttribute(self, pdfparagraph, paragraph_id):
+    def add_paragraph_in_document_with_attribute(self, pdf_paragraph, paragraph_id):

        """
        Calculates the properties and attributes of a paragraph and adds it to the list of structural elements of the document

        :param
-            pdfparagraph: An object representing a paragraph highlighted by the algorithm
+            pdf_paragraph: An object representing a paragraph highlighted by the algorithm
            paragraph_id: Id of paragraph
        :return

        """
-        #Highlighting string attributes
-        no_change_font_name = pdfparagraph.lines[0].nochangeFontName
-        no_change_text_size = pdfparagraph.lines[0].nochangeSize
-        for line in pdfparagraph.lines:
+        # Highlighting string attributes
+        no_change_font_name = pdf_paragraph.lines[0].nochangeFontName
+        no_change_text_size = pdf_paragraph.lines[0].nochangeSize
+        for line in pdf_paragraph.lines:
            if len(line.fontname) > 1 or line.nochangeFontName is False:
                no_change_font_name = False
            if len(line.size) > 1 or line.nochangeSize is False:
                no_change_text_size = False
-        if len(pdfparagraph.lines[0].size) != 0:
-            pdfparagraph.text_size = pdfparagraph.lines[0].size[0]
-        if len(pdfparagraph.lines[0].fontname) != 0:
-            pdfparagraph.fontname = pdfparagraph.lines[0].fontname[0]
-        pdfparagraph.no_change_font_name = no_change_font_name
-        pdfparagraph.no_change_text_size = no_change_text_size
-        pdfparagraph.indent = pdfparagraph.lines[0].x0
-        self.document.content[paragraph_id] = self.getStandartParagraph(pdfparagraph)
-
-    def getParagraph(self, lines, spaces, list_of_table):
+        if len(pdf_paragraph.lines[0].size) != 0:
+            pdf_paragraph.text_size = pdf_paragraph.lines[0].size[0]
+        if len(pdf_paragraph.lines[0].fontname) != 0:
+            pdf_paragraph.fontname = pdf_paragraph.lines[0].fontname[0]
+        pdf_paragraph.no_change_font_name = no_change_font_name
+        pdf_paragraph.no_change_text_size = no_change_text_size
+        pdf_paragraph.indent = pdf_paragraph.lines[0].x0
+        self.document.content[paragraph_id] = self.get_standart_paragraph(pdf_paragraph)
+
+    def get_paragraph(self, lines, spaces, list_of_table):
        """

        Generates paragraphs from a list of lines
@@ -250,7 +251,7 @@ class PDFParser:
        """

        i = 1
-        paragraph = Pdfparagraph()
+        paragraph = PdfParagraph()
        paragraph.lines.append(lines[0])
        paragraph.spaces.append(spaces[0])
        paragraph_id = 1
@@ -261,19 +262,26 @@ class PDFParser:
            while j < len(paragraph.lines) - 1:
                mean = mean + paragraph.spaces[j]
                j = j + 1
-            #Calculating the average value of the line spacing
+            # Calculating the average value of the line spacing
            if len(paragraph.lines) - 1 > 1:
                mean = mean / (len(paragraph.lines) - 1)
            if mean == 0:
                mean = spaces[i - 1]
            if spaces[i - 1] == 0:
                spaces[i - 1] = mean
-            #Condition for paragraph selection
+            # Condition for paragraph selection
            if (lines[i - 1].x0 < lines[i].x0 or lines[i - 1].x1 <= 520 or abs(spaces[i - 1] - mean) > 2 or (
                    len(paragraph.lines) == 1 and paragraph.lines[0].x0 == lines[i].x0)):
                paragraph.line_spacing = mean
-                paragraph_id, removed_tables, list_of_table  = self.deleteDublicatesAndAddParagraph(paragraph, removed_tables, paragraph_id, list_of_table)
-                paragraph = Pdfparagraph()
+                element, removed_tables, list_of_table = self.delete_dublicates(paragraph, removed_tables,
+                                                                                              list_of_table)
+                if element is not None:
+                    if type(element) == PDFTable:
+                        self.document.add_content(paragraph_id, element)
+                    else:
+                        self.add_paragraph_in_document_with_attribute(element, paragraph_id)
+                    paragraph_id += 1
+                paragraph = PdfParagraph()
                paragraph.lines.append(lines[i])
                paragraph.spaces.append(spaces[i])
            else:
@@ -285,14 +293,14 @@ class PDFParser:
        return self.document

    @staticmethod
-    def getStandartParagraph(pdfparagraph):
+    def get_standart_paragraph(pdf_paragraph):

        """

        Brings the resulting paragraph to the standard form

        :param
-            pdfparagraph: The original, obtained after executing the formation algorithm, paragraph
+            pdf_paragraph: The original, obtained after executing the formation algorithm, paragraph

        :return
            paragraph: The resulting Standard paragraph
@@ -300,52 +308,43 @@ class PDFParser:
        """

        text = ""
-        for line in pdfparagraph.lines:
+        for line in pdf_paragraph.lines:
            text = text + line.text
-        paragraph = Paragraph(text=text, indent=round(Class.ptToSm(pdfparagraph.indent) - 3, 2),
-                              lineSpacing=round(Class.ptToSm(pdfparagraph.line_spacing), 2),
-                              fontName=pdfparagraph.fontname, textSize=round(pdfparagraph.text_size),
-                              nochangeTextSize=pdfparagraph.no_change_text_size,
-                              nochangeFontName=pdfparagraph.no_change_font_name)
+        paragraph = Paragraph(text=text, indent=round(Class.pt_to_sm(pdf_paragraph.indent) - 3, 2),
+                              line_spasing=round(Class.pt_to_sm(pdf_paragraph.line_spacing), 2),
+                              font_name=pdf_paragraph.fontname, text_size=round(pdf_paragraph.text_size),
+                              no_change_text_size=pdf_paragraph.no_change_text_size,
+                              no_change_fontname=pdf_paragraph.no_change_font_name)
        return paragraph

-    def deleteDublicatesAndAddParagraph(self, pdfparagraph, removed_tables, paragraph_id, list_of_table):
+    def delete_dublicates(self, pdf_paragraph, removed_tables, list_of_table):

        """

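        Checks whether the paragraph lies inside a table and returns the structural element to add: None if the table was already added, the table itself if it has not been added yet, otherwise the paragraph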

        :param
-            pdfparagraph: The original, obtained after executing the formation algorithm, paragraph
+            pdf_paragraph: The original, obtained after executing the formation algorithm, paragraph
            removed_tables: The list of tables already added to the list of structural elements
-            paragraph_id: The number of the paragraph to be added
            list_of_table: The list of tables that have not yet been added to the list of structural elements
        :return
-            paragraph_id: The number of the next paragraph
            removed_tables: The list of tables already added to the list of structural elements
            list_of_table: The list of tables that have not yet been added to the list of structural elements

        """
-        insertTable = False
-        #Checking that this paragraph is tabular and this table has already been added
+        insert_table = False
+        # Checking that this paragraph is tabular and this table has already been added
        for remove_table in removed_tables:
-            if (remove_table.table.page.bbox[3] - remove_table.table.bbox[1]) > pdfparagraph.lines[0].y0 > \
+            if (remove_table.table.page.bbox[3] - remove_table.table.bbox[1]) > pdf_paragraph.lines[0].y0 > \
                    (remove_table.table.page.bbox[3] - remove_table.table.bbox[3]) and \
-                    remove_table.table.page.page_number == pdfparagraph.lines[0].page:
-                insertTable = True
-        #Checking that this paragraph is tabular and adding a table if it has not been completed yet
+                    remove_table.table.page.page_number == pdf_paragraph.lines[0].page:
+                return None, removed_tables, list_of_table
+        # Checking that this paragraph is tabular and adding a table if it has not been completed yet
        for table in list_of_table:
-            if (table.table.page.bbox[3] - table.table.bbox[1]) > pdfparagraph.lines[0].y0 > (
+            if (table.table.page.bbox[3] - table.table.bbox[1]) > pdf_paragraph.lines[0].y0 > (
                    table.table.page.bbox[3] - table.table.bbox[3]) and table.table.page.page_number == \
-                    pdfparagraph.lines[0].page:
-                self.document.content[paragraph_id] = table
+                    pdf_paragraph.lines[0].page:
                removed_tables.append(table)
                list_of_table.remove(table)
-                insertTable = True
-        if insertTable:
-            return paragraph_id, removed_tables, list_of_table
-        self.addParagraphInDocumentWithAttribute(pdfparagraph, paragraph_id)
-        paragraph_id = paragraph_id + 1
-        return paragraph_id, removed_tables, list_of_table
-
-
+                return table, removed_tables, list_of_table
+        return pdf_paragraph, removed_tables, list_of_table
--- a/src/PDF/ParagraphLine.py
+++ b/src/PDF/ParagraphLine.py
-class Pdfparagraph:
+class PdfParagraph:

    """
    Description: A class representing a PDF paragraph and its attributes

--- a/src/PDF/TestPDFParser/TestPDFParser.py
+++ b/src/PDF/TestPDFParser/TestPDFParser.py
@@ -3,13 +3,12 @@ from src.PDF.PDFParser import PDFParser
 from os import walk

 f = []
-for dirpath, dirnames, filenames in walk('C:\\Users\\Slava\\Downloads\\Telegram Desktop\\MagiRemoved'):
-#"Отчёт по практике для парсинга.pdf"
-    for filename in filenames:
-        pdfParser = PDFParser(path=dirpath+ '\\' + filename)
-        lines, listOfTable = pdfParser.getLinesAndTables()
-        spaces = pdfParser.getSpace(lines)
-        document = pdfParser.getParagraph(lines,spaces,listOfTable)
-        document.writeCSV(dirpath+ '\\' + filename + '.csv')
-        # json = document.createJsonToClasifier()
-        # Class.requestToClasify(json)
+for dir_path, dir_names, file_names in walk('C:\\Users\\Slava\\Downloads\\Telegram Desktop\\MagiRemoved'):
+    for filename in file_names:
+        pdf_parser = PDFParser(path=dir_path+ '\\' + filename)
+        lines, list_of_table = pdf_parser.get_lines_and_tables()
+        spaces = pdf_parser.get_space(lines)
+        document = pdf_parser.get_paragraph(lines,spaces,list_of_table)
+        document.write_CSV(dir_path+ '\\' + filename + '.csv')
+        json = document.create_json_to_clasifier()  # snake_case name; the camelCase method was renamed in DocumentClass.py
+        document.request_to_clasify(json)  # classmethod called via the instance, so no separate import of Class is needed
--- a/src/PDF/Отчёт по практике для парсинга.pdf
+++ b/src/PDF/Отчёт по практике для парсинга.pdf