Commit 4a9fd476 authored by slavamarcin's avatar slavamarcin
Browse files

[Fix] Brought the names of variables and methods to snake_case.

[Fix] Changed the method for removing duplicates of table paragraphs so that the addition of a structural element takes place outside of it
parent 96a5dee4
Showing with 349 additions and 342 deletions
+349 -342
......@@ -18,22 +18,22 @@ Parameters:
Methods
----------
addContent(id, paragraph)
add_content(id, paragraph)
Adds a paragraph to the content list
ptToSm(value)
pt_to_sm(value)
Converts typographic points to centimeters
dmToSm(value)
dm_to_sm(value)
Converts inches to centimeters
createJsonToClasifier(listOfAttr)
create_json_to_clasifier(list_of_attr)
Creates and returns a json string, which will later be sent to the classifier
requestToClasify(jsonText, api =)
request_to_clasify(json_text, api =)
Sends a request to the classification module
writeCSV(path):
write_CSV(path):
Generates csv file based on content
"""
......@@ -43,7 +43,7 @@ class Class:
self.__time = time
self.__content = {}
def addContent(self, paragraph_id, paragraph):
def add_content(self, paragraph_id, paragraph):
"""
Adds a paragraph to the content list
......@@ -55,7 +55,7 @@ class Class:
self.content[paragraph_id] = paragraph
## Пункт в сантиметры
@classmethod
def ptToSm(cls, value):
def pt_to_sm(cls, value):
"""
Converts typographic points to centimeters
......@@ -68,7 +68,7 @@ class Class:
return value/28.346
## Дюйм в сантиметры
@classmethod
def dmToSm(cls, value):
def dm_to_sm(cls, value):
"""
Converts inches to centimeters
......@@ -80,18 +80,18 @@ class Class:
"""
return value * 2.54
def createJsonToClasifier(self, listOfAttr = ["countOfSpSbl","countSbl","lowercase","uppercase","lastSbl",
"firstkey","prevEl","curEl","nexEl","bold","italics",
"keepLinesTogether","keepWithNext", "outlineLevel",
"pageBreakBefore"]
def create_json_to_clasifier(self, list_of_attr = ["countn_of_sp_sbl","count_sbl","lowercase","uppercase","last_sbl",
"firstkey","prev_el","cur_el","next_el","bold","italics",
"keep_lines_together","keep_with_next", "outline_level",
"page_breake_before"]
):
"""
Creates and returns a json string, which will later be sent to the classifier
:param listOfAttr: List of attributes included in json string
:param list_of_attr: List of attributes included in json string
:return jsonText: Generated Json string
:return json_text: Generated Json string
"""
......@@ -104,22 +104,22 @@ class Class:
if p.__class__ != PDFTable :
s = s + "\"" + str(i) + "\": {\""
for attribute in dir(p):
if not attribute.startswith('_') and attribute in listOfAttr:
if not attribute.startswith('_') and attribute in list_of_attr:
s = s + attribute + "\": \"" + str(getattr(p,attribute)) + "\",\""
l = len(s)
s = s[:l - 2] + "}, "
l = len(s)
s = s[:l - 2] + "}}"
jsonText = json.loads(s)
return jsonText
json_text = json.loads(s)
return json_text
@classmethod
def requestToClasify(cls, jsonText, api = "http://127.0.0.1:8001/clasify"):
def request_to_clasify(cls, json_text, api = "http://127.0.0.1:8001/clasify"):
"""
Sends a request to the classification module
:param jsonText: The json string to send
:param json_text: The json string to send
:param api: API where the request is sent
:return response: Response received from the API
......@@ -127,10 +127,10 @@ class Class:
"""
import requests
response = requests.post(api, json= jsonText)
response = requests.post(api, json= json_text)
return response
def writeCSV(self, path = 'pdftocsv.csv'):
def write_CSV(self, path = 'pdftocsv.csv'):
"""
......@@ -143,16 +143,16 @@ class Class:
import csv
with open(path, 'w', newline='', encoding="utf-8") as csvfile:
filewriter = csv.writer(csvfile, delimiter=',', quoting=csv.QUOTE_MINIMAL)
filewriter.writerow(["text","countOfSpSbl","countSbl","uppercase", "lowercase","fontName","lastSbl",
"firstkey","indent","lineSpacing","textSize"])
filewriter.writerow(["text","countn_of_sp_sbl","count_sbl","uppercase", "lowercase","font_name","last_sbl",
"firstkey","indent","line_spasing","text_size"])
for key in self.content.keys():
if type(self.content.get(key))!= PDFTable:
filewriter.writerow([self.content.get(key).text, self.content.get(key).countOfSpSbl ,
self.content.get(key).countSbl,self.content.get(key).uppercase,
self.content.get(key).lowercase, self.content.get(key).fontName,
self.content.get(key).lastSbl,self.content.get(key).firstkey,
self.content.get(key).indent, self.content.get(key).lineSpacing,
self.content.get(key).textSize])
filewriter.writerow([self.content.get(key).text, self.content.get(key).countn_of_sp_sbl,
self.content.get(key).count_sbl,self.content.get(key).uppercase,
self.content.get(key).lowercase, self.content.get(key).font_name,
self.content.get(key).last_sbl,self.content.get(key).firstkey,
self.content.get(key).indent, self.content.get(key).line_spasing,
self.content.get(key).text_size])
else:
filewriter.writerow([self.content.get(key).text])
......
This diff is collapsed.
......@@ -3,7 +3,7 @@ import pdfplumber
from src.Class.DocumentClass import Class
from src.Class.Paragraph import Paragraph
from src.PDF.Line import Line
from src.PDF.ParagraphLine import Pdfparagraph
from src.PDF.ParagraphLine import PdfParagraph
from src.PDF.Table import PDFTable
......@@ -58,11 +58,11 @@ class PDFParser:
@path.setter
def path(self, path):
self._path = path
@path.setter
def prevEl(self, path):
self._path = path
@property
def pdf(self):
return self._pdf
......@@ -95,7 +95,7 @@ class PDFParser:
def list_of_table(self, list_of_table):
self._list_of_table = list_of_table
def getLinesAndTables(self):
def get_lines_and_tables(self):
"""
......@@ -117,7 +117,7 @@ class PDFParser:
no_change_size = True
for number_of_page, page in enumerate(self.pdf.pages):
#Extracting tables and tabular text
# Extracting tables and tabular text
tables = page.find_tables()
tables_text = page.extract_tables()
for number_of_table, table in enumerate(tables):
......@@ -125,19 +125,19 @@ class PDFParser:
current_table.addText(tables_text[number_of_table])
self.list_of_table.append(current_table)
#Selecting text strings
# Selecting text strings
text = ""
for i, char in enumerate(page.chars):
if y0 is not None:
y0 = round(y0)
#Condition for adding a character to a string
# Condition for adding a character to a string
if (round(char.get('y0')) == y0) or (int(char.get('y0')) == y0) \
or text == '−' or text == '–' or text == "•":
chars.append(char)
text = text + char.get('text')
x1 = char.get('x1')
if i != 0:
#Font and line size selection
# Font and line size selection
if char.get('fontname') not in fontname:
no_change_font_name = False
fontname.append(char.get('fontname'))
......@@ -151,7 +151,7 @@ class PDFParser:
y1 = char.get('y1')
else:
if i != 0:
#Deleting headers and footers
# Deleting headers and footers
if re.search(r'^\d+ $', text) is None and y0 > 60 and text != '':
if len(chars) != 0:
x0 = chars[0].get('x0')
......@@ -167,7 +167,7 @@ class PDFParser:
size = []
no_change_font_name = True
no_change_size = True
#Deleting empty lines
# Deleting empty lines
if text == "" and char.get('text') == ' ':
continue
chars.append(char)
......@@ -182,13 +182,14 @@ class PDFParser:
return self.lines, self.list_of_table
@staticmethod
def getSpace(lines):
def get_space(lines):
"""
Calculates the line spacing between two subsequent lines
:param lines: list of document lines
:param
lines: list of document lines
:return
spaces: List of calculated line spacing
......@@ -198,7 +199,7 @@ class PDFParser:
spaces = []
i = 0
while i < len(lines):
#If the line is the last one on the page, it is assigned a value equal to zero
# If the line is the last one on the page, it is assigned a value equal to zero
if i != len(lines) - 1 and (lines[i].y0 - lines[i + 1].y1 > 0):
spaces.append(lines[i].y0 - lines[i + 1].y1)
else:
......@@ -206,35 +207,35 @@ class PDFParser:
i = i + 1
return spaces
def addParagraphInDocumentWithAttribute(self, pdfparagraph, paragraph_id):
def add_paragraph_in_document_with_attribute(self, pdf_paragraph, paragraph_id):
"""
Calculates the properties and attributes of a paragraph and adds it to the list of structural elements of the document
:param
pdfparagraph: An object representing a paragraph highlighted by the algorithm
pdf_paragraph: An object representing a paragraph highlighted by the algorithm
paragraph_id: Id of paragraph
:return
"""
#Highlighting string attributes
no_change_font_name = pdfparagraph.lines[0].nochangeFontName
no_change_text_size = pdfparagraph.lines[0].nochangeSize
for line in pdfparagraph.lines:
# Highlighting string attributes
no_change_font_name = pdf_paragraph.lines[0].nochangeFontName
no_change_text_size = pdf_paragraph.lines[0].nochangeSize
for line in pdf_paragraph.lines:
if len(line.fontname) > 1 or line.nochangeFontName is False:
no_change_font_name = False
if len(line.size) > 1 or line.nochangeSize is False:
no_change_text_size = False
if len(pdfparagraph.lines[0].size) != 0:
pdfparagraph.text_size = pdfparagraph.lines[0].size[0]
if len(pdfparagraph.lines[0].fontname) != 0:
pdfparagraph.fontname = pdfparagraph.lines[0].fontname[0]
pdfparagraph.no_change_font_name = no_change_font_name
pdfparagraph.no_change_text_size = no_change_text_size
pdfparagraph.indent = pdfparagraph.lines[0].x0
self.document.content[paragraph_id] = self.getStandartParagraph(pdfparagraph)
def getParagraph(self, lines, spaces, list_of_table):
if len(pdf_paragraph.lines[0].size) != 0:
pdf_paragraph.text_size = pdf_paragraph.lines[0].size[0]
if len(pdf_paragraph.lines[0].fontname) != 0:
pdf_paragraph.fontname = pdf_paragraph.lines[0].fontname[0]
pdf_paragraph.no_change_font_name = no_change_font_name
pdf_paragraph.no_change_text_size = no_change_text_size
pdf_paragraph.indent = pdf_paragraph.lines[0].x0
self.document.content[paragraph_id] = self.get_standart_paragraph(pdf_paragraph)
def get_paragraph(self, lines, spaces, list_of_table):
"""
Generates paragraphs from a list of lines
......@@ -250,7 +251,7 @@ class PDFParser:
"""
i = 1
paragraph = Pdfparagraph()
paragraph = PdfParagraph()
paragraph.lines.append(lines[0])
paragraph.spaces.append(spaces[0])
paragraph_id = 1
......@@ -261,19 +262,26 @@ class PDFParser:
while j < len(paragraph.lines) - 1:
mean = mean + paragraph.spaces[j]
j = j + 1
#Calculating the average value of the line spacing
# Calculating the average value of the line spacing
if len(paragraph.lines) - 1 > 1:
mean = mean / (len(paragraph.lines) - 1)
if mean == 0:
mean = spaces[i - 1]
if spaces[i - 1] == 0:
spaces[i - 1] = mean
#Condition for paragraph selection
# Condition for paragraph selection
if (lines[i - 1].x0 < lines[i].x0 or lines[i - 1].x1 <= 520 or abs(spaces[i - 1] - mean) > 2 or (
len(paragraph.lines) == 1 and paragraph.lines[0].x0 == lines[i].x0)):
paragraph.line_spacing = mean
paragraph_id, removed_tables, list_of_table = self.deleteDublicatesAndAddParagraph(paragraph, removed_tables, paragraph_id, list_of_table)
paragraph = Pdfparagraph()
element, removed_tables, list_of_table = self.delete_dublicates(paragraph, removed_tables,
list_of_table)
if element is not None:
if type(element) == PDFTable:
self.document.add_content(paragraph_id, element)
else:
self.add_paragraph_in_document_with_attribute(element, paragraph_id)
paragraph_id += 1
paragraph = PdfParagraph()
paragraph.lines.append(lines[i])
paragraph.spaces.append(spaces[i])
else:
......@@ -285,14 +293,14 @@ class PDFParser:
return self.document
@staticmethod
def getStandartParagraph(pdfparagraph):
def get_standart_paragraph(pdf_paragraph):
"""
Brings the resulting paragraph to the standard form
:param
pdfparagraph: The original, obtained after executing the formation algorithm, paragraph
pdf_paragraph: The original, obtained after executing the formation algorithm, paragraph
:return
paragraph: The resulting Standard paragraph
......@@ -300,52 +308,43 @@ class PDFParser:
"""
text = ""
for line in pdfparagraph.lines:
for line in pdf_paragraph.lines:
text = text + line.text
paragraph = Paragraph(text=text, indent=round(Class.ptToSm(pdfparagraph.indent) - 3, 2),
lineSpacing=round(Class.ptToSm(pdfparagraph.line_spacing), 2),
fontName=pdfparagraph.fontname, textSize=round(pdfparagraph.text_size),
nochangeTextSize=pdfparagraph.no_change_text_size,
nochangeFontName=pdfparagraph.no_change_font_name)
paragraph = Paragraph(text=text, indent=round(Class.pt_to_sm(pdf_paragraph.indent) - 3, 2),
line_spasing=round(Class.pt_to_sm(pdf_paragraph.line_spacing), 2),
font_name=pdf_paragraph.fontname, text_size=round(pdf_paragraph.text_size),
no_change_text_size=pdf_paragraph.no_change_text_size,
no_change_fontname=pdf_paragraph.no_change_font_name)
return paragraph
def deleteDublicatesAndAddParagraph(self, pdfparagraph, removed_tables, paragraph_id, list_of_table):
def delete_dublicates(self, pdf_paragraph, removed_tables, list_of_table):
"""
Filters out paragraphs that duplicate the content of a table and determines which structural element (table or paragraph) should be added
:param
pdfparagraph: The original, obtained after executing the formation algorithm, paragraph
pdf_paragraph: The original, obtained after executing the formation algorithm, paragraph
removed_tables: The list of tables that have already been added to the list of structural elements
paragraph_id: The number of the paragraph to be added
list_of_table: The list of tables that have not yet been added to the list of structural elements
:return
paragraph_id: The number of the next paragraph
removed_tables: The list of tables that have already been added to the list of structural elements
list_of_table: The list of tables that have not yet been added to the list of structural elements
"""
insertTable = False
#Checking that this paragraph is tabular and this table has already been added
insert_table = False
# Checking that this paragraph is tabular and this table has already been added
for remove_table in removed_tables:
if (remove_table.table.page.bbox[3] - remove_table.table.bbox[1]) > pdfparagraph.lines[0].y0 > \
if (remove_table.table.page.bbox[3] - remove_table.table.bbox[1]) > pdf_paragraph.lines[0].y0 > \
(remove_table.table.page.bbox[3] - remove_table.table.bbox[3]) and \
remove_table.table.page.page_number == pdfparagraph.lines[0].page:
insertTable = True
#Checking that this paragraph is tabular and adding a table if it has not been completed yet
remove_table.table.page.page_number == pdf_paragraph.lines[0].page:
return None, removed_tables, list_of_table
# Checking that this paragraph is tabular and adding a table if it has not been completed yet
for table in list_of_table:
if (table.table.page.bbox[3] - table.table.bbox[1]) > pdfparagraph.lines[0].y0 > (
if (table.table.page.bbox[3] - table.table.bbox[1]) > pdf_paragraph.lines[0].y0 > (
table.table.page.bbox[3] - table.table.bbox[3]) and table.table.page.page_number == \
pdfparagraph.lines[0].page:
self.document.content[paragraph_id] = table
pdf_paragraph.lines[0].page:
removed_tables.append(table)
list_of_table.remove(table)
insertTable = True
if insertTable:
return paragraph_id, removed_tables, list_of_table
self.addParagraphInDocumentWithAttribute(pdfparagraph, paragraph_id)
paragraph_id = paragraph_id + 1
return paragraph_id, removed_tables, list_of_table
return table, removed_tables, list_of_table
return pdf_paragraph, removed_tables, list_of_table
class Pdfparagraph:
class PdfParagraph:
"""
Description: A class representing a PDF paragraph and its attributes
......
......@@ -3,13 +3,12 @@ from src.PDF.PDFParser import PDFParser
from os import walk
f = []
for dirpath, dirnames, filenames in walk('C:\\Users\\Slava\\Downloads\\Telegram Desktop\\MagiRemoved'):
#"Отчёт по практике для парсинга.pdf"
for filename in filenames:
pdfParser = PDFParser(path=dirpath+ '\\' + filename)
lines, listOfTable = pdfParser.getLinesAndTables()
spaces = pdfParser.getSpace(lines)
document = pdfParser.getParagraph(lines,spaces,listOfTable)
document.writeCSV(dirpath+ '\\' + filename + '.csv')
# json = document.createJsonToClasifier()
# Class.requestToClasify(json)
# Walk the source directory and, for every file found, parse it as a PDF,
# export the extracted paragraphs to "<file>.csv" next to the original, and
# send the paragraph attributes to the classification service.
for dir_path, dir_names, file_names in walk('C:\\Users\\Slava\\Downloads\\Telegram Desktop\\MagiRemoved'):
    for filename in file_names:
        pdf_parser = PDFParser(path=dir_path + '\\' + filename)
        # Extract text lines and tables, then the spacing between consecutive lines.
        lines, list_of_table = pdf_parser.get_lines_and_tables()
        spaces = pdf_parser.get_space(lines)
        # Group the lines into paragraphs/tables; the parser returns its document.
        document = pdf_parser.get_paragraph(lines, spaces, list_of_table)
        document.write_CSV(dir_path + '\\' + filename + '.csv')
        # The classifier helpers are named in snake_case; the former camelCase
        # names (createJsonToClasifier / requestToClasify) would raise
        # AttributeError. The payload is not called `json` so the standard
        # library module name is not shadowed.
        json_payload = document.create_json_to_clasifier()
        Class.request_to_clasify(json_payload)
File added
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment