-
Notifications
You must be signed in to change notification settings - Fork 0
/
doctest1.py
259 lines (213 loc) · 12.2 KB
/
doctest1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
from docx import Document
import os
import requests
class Execute:
    """Replace keywords inside one docx paragraph without losing run formatting.

    The paragraph text is never assigned directly, because that would change
    the original formatting; instead the characters of the affected runs are
    edited in place.

    :param paragraph: docx paragraph; only its ``runs`` sequence is used, and
        every run must expose a readable and writable ``text`` string.
    """

    def __init__(self, paragraph):
        self.paragraph = paragraph

    def p_replace(self, x: int, key: str, value: str):
        """Replace every non-overlapping occurrence of ``key`` with ``value``.

        Occurrences are located left to right, like ``str.replace``, and may
        span several runs. The replacement text is placed in the run that
        holds the first character of the match.

        :param x: paragraph id; kept for interface compatibility, it is not
            used by the replacement itself
        :param key: keyword that needs to be replaced; an empty key is a no-op
        :param value: text that replaces the keyword
        :return: None
        """
        if not key:
            return

        run_texts = [run.text for run in self.paragraph.runs]
        # Coordinates of every character of the paragraph: [{run, char}, ...]
        p_maps = [
            {"run": y, "char": z}
            for y, run_text in enumerate(run_texts)
            for z in range(len(run_text))
        ]

        # Search the concatenation of the run texts: p_maps is built from
        # exactly these characters, so every match index is a valid p_maps
        # index. NOTE(review): paragraph.text can presumably contain text that
        # is not part of paragraph.runs (e.g. hyperlinks) - confirm.
        text = "".join(run_texts)

        # All start positions are collected before anything is edited, so a
        # value that contains its own key (e.g. {"ab": "abc"}) cannot loop.
        k_idx = []
        start_idx = text.find(key)
        while start_idx != -1:
            k_idx.append(start_idx)
            # Continue after this match so that matches never overlap.
            start_idx = text.find(key, start_idx + len(key))

        # Edit from the last match to the first: characters in front of a
        # match keep their coordinates, so p_maps stays valid throughout.
        for start_idx in reversed(k_idx):
            end_idx = start_idx + len(key)
            self.r_replace(p_maps[start_idx:end_idx], value)

    def r_replace(self, k_maps: list, value: str):
        """Replace the characters listed in ``k_maps`` with ``value``.

        The characters are removed from back to front, so that removing one
        does not shift the index of a character that is still to be handled.
        The first listed character is not removed but overwritten with
        ``value``.

        :param k_maps: coordinates of the characters of one keyword, e.g.
            [{"run": 15, "char": 3}, {"run": 15, "char": 4}, {"run": 16, "char": 0}]
        :param value: text that replaces the keyword
        :return: None
        """
        runs = self.paragraph.runs
        # run index -> mutable list of that run's characters; each run is read
        # once and written back once instead of once per character.
        pending = {}
        total = len(k_maps)
        for i, position in enumerate(reversed(k_maps), start=1):
            y, z = position["run"], position["char"]
            chars = pending.get(y)
            if chars is None:
                # A list is edited by position instead of using str.replace(),
                # which would hit the wrong occurrence when a run contains the
                # same word twice.
                chars = pending[y] = list(runs[y].text)
            if i < total:
                chars.pop(z)  # drop one character of the keyword
            else:
                chars[z] = value  # last iteration: first character of the keyword
        for y, chars in pending.items():
            runs[y].text = "".join(chars)
class WordReplace:
    """Replace keywords in the body, headers and footers of a Word document.

    Every public method takes a ``replace_dict`` whose keys are the keywords
    to be replaced and whose values are the new content. Keys are processed
    one after the other, each over the whole targeted part of the document.

    :param file: Microsoft Office Word file; only the .docx type is supported
    """

    def __init__(self, file):
        self.docx = Document(file)

    @staticmethod
    def _replace_in_paragraphs(paragraphs, key, value):
        """Replace ``key`` with ``value`` in each of the given paragraphs."""
        for x, paragraph in enumerate(paragraphs):
            Execute(paragraph).p_replace(x, key, value)

    @classmethod
    def _replace_in_tables(cls, tables, key, value):
        """Replace ``key`` with ``value`` in every cell paragraph of the tables."""
        for table in tables:
            for row in table.rows:
                # NOTE(review): for merged cells python-docx may yield the same
                # cell more than once, so a value that contains its own key
                # could be expanded repeatedly - confirm.
                for cell in row.cells:
                    cls._replace_in_paragraphs(cell.paragraphs, key, value)

    def body_content(self, replace_dict: dict):
        """Replace the keywords in the paragraphs of the document body."""
        print("\t☺Processing keywords in the body...")
        for key, value in replace_dict.items():
            self._replace_in_paragraphs(self.docx.paragraphs, key, value)
        print("\t |Body keywords in the text are replaced!")

    def body_tables(self, replace_dict: dict):
        """Replace the keywords in the tables of the document body."""
        print("\t☺Processing keywords in the body'tables...")
        for key, value in replace_dict.items():
            self._replace_in_tables(self.docx.tables, key, value)
        print("\t |Body'tables keywords in the text are replaced!")

    def header_content(self, replace_dict: dict):
        """Replace the keywords in the header paragraphs of every section."""
        print("\t☺Processing keywords in the header'body ...")
        for key, value in replace_dict.items():
            for section in self.docx.sections:
                self._replace_in_paragraphs(section.header.paragraphs, key, value)
        print("\t |Header'body keywords in the text are replaced!")

    def header_tables(self, replace_dict: dict):
        """Replace the keywords in the header tables of every section."""
        print("\t☺Processing keywords in the header'tables ...")
        for key, value in replace_dict.items():
            for section in self.docx.sections:
                self._replace_in_tables(section.header.tables, key, value)
        print("\t |Header'tables keywords in the text are replaced!")

    def footer_content(self, replace_dict: dict):
        """Replace the keywords in the footer paragraphs of every section."""
        print("\t☺Processing keywords in the footer'body ...")
        for key, value in replace_dict.items():
            for section in self.docx.sections:
                self._replace_in_paragraphs(section.footer.paragraphs, key, value)
        print("\t |Footer'body keywords in the text are replaced!")

    def footer_tables(self, replace_dict: dict):
        """Replace the keywords in the footer tables of every section."""
        print("\t☺Processing keywords in the footer'tables ...")
        for key, value in replace_dict.items():
            for section in self.docx.sections:
                self._replace_in_tables(section.footer.tables, key, value)
        print("\t |Footer'tables keywords in the text are replaced!")
class WordReplacer:
    """Replace whole paragraphs of a Word document with new text.

    :param file: Microsoft Office Word file; only the .docx type is supported
    """

    def __init__(self, file):
        self.docx = Document(file)

    @staticmethod
    def _replace_matching(paragraphs, paragraph, replace_dict):
        """Rewrite every given paragraph whose text equals ``paragraph``."""
        for candidate in paragraphs:
            if candidate.text == paragraph:
                Execute(candidate).p_replace(0, candidate.text, replace_dict)

    @classmethod
    def _replace_matching_in_tables(cls, tables, paragraph, replace_dict):
        """Apply ``_replace_matching`` to every cell of the given tables."""
        for table in tables:
            for row in table.rows:
                for cell in row.cells:
                    cls._replace_matching(cell.paragraphs, paragraph, replace_dict)

    def replace_in_paragraph(self, paragraph, replace_dict):
        """Replace paragraphs whose complete text equals ``paragraph``.

        In the document body only the first matching paragraph is replaced;
        in body tables and in the headers and footers of every section
        (paragraphs and tables) every matching paragraph is replaced.

        :param paragraph: exact text of the paragraph to look for
        :param replace_dict: despite its name, the replacement text; it is
            passed on unchanged as the new value for the matched paragraph
        :return: None
        """
        if not paragraph:
            # Empty text has no characters to replace, so skip the traversal.
            return

        for idx, para in enumerate(self.docx.paragraphs):
            if para.text == paragraph:
                Execute(para).p_replace(idx, paragraph, replace_dict)
                break  # body paragraphs: first match only

        self._replace_matching_in_tables(self.docx.tables, paragraph, replace_dict)

        for section in self.docx.sections:
            self._replace_matching(section.header.paragraphs, paragraph, replace_dict)
            self._replace_matching(section.footer.paragraphs, paragraph, replace_dict)
            self._replace_matching_in_tables(section.header.tables, paragraph, replace_dict)
            self._replace_matching_in_tables(section.footer.tables, paragraph, replace_dict)

    def save(self, filepath: str):
        """Print the target path and save the document there.

        :param filepath: file saving path
        :return: None
        """
        print(filepath)
        self.docx.save(filepath)

    @staticmethod
    def docx_list(dirPath):
        """Collect the .docx files below a directory, subdirectories included.

        Names starting with "~" are skipped as temporary files. The extension
        is compared case-insensitively and must include the dot, so a name
        that merely ends in the letters "docx" is not picked up.

        :param dirPath: directory to search
        :return: list of paths of the docx files that were found
        """
        fileList = []
        for roots, dirs, files in os.walk(dirPath):
            for file in files:
                if file.lower().endswith(".docx") and not file.startswith("~"):
                    fileList.append(os.path.join(roots, file))
        print("This directory finds a total of {0} related files!".format(len(fileList)))
        return fileList
def main():
    """Grammar-correct one Word document through the text-generation API.

    Collects the text of every body paragraph and of every body table cell,
    sends one prompt per text to the API, replaces each original text with the
    returned correction and saves the result as ``document_updated.docx``.

    To use: adjust ``filedir2`` (the input .docx file) and ``api_url`` below.
    To process a whole directory instead, loop over
    ``WordReplacer.docx_list(<directory>)``.
    """
    # Input document to correct.
    filedir2 = r"C:\Users\quank\Documents\rmit\engineering science\architndesign\word2\test.docx"
    # API endpoint for code generation.
    api_url = "https://3c92-103-253-89-37.ngrok-free.app/generate_code?max_length=512"
    # (connect, read) timeouts in seconds; without a timeout the request
    # could block forever if the endpoint stops answering.
    request_timeout = (10, 300)
    output_filepath = "document_updated.docx"

    # Load the Word document.
    word_replacer = WordReplacer(filedir2)

    # Extract all paragraphs from the document.
    paragraphs = [paragraph.text for paragraph in word_replacer.docx.paragraphs]
    if len(paragraphs) > 1:
        # Debug preview; guarded because short documents have no second paragraph.
        print(paragraphs[1])

    # Text of every table cell, row by row.
    table_texts = []
    for table in word_replacer.docx.tables:
        for row in table.rows:
            table_texts.extend(cell.text for cell in row.cells)

    # Create a list of prompts.
    prompts_list = [f"Correct English grammar in the following text keep curly brackets keep it in one paragraph: {paragraph}\nHere is the corrected version: " for paragraph in paragraphs]
    # Table prompts are still being tested.
    prompts_list_table = [f"Correct only grammar in the following text if needed do not define or add information keep it in one paragraph: {table_text}.\nHere is the corrected version: " for table_text in table_texts]
    all_prompts_list = prompts_list + prompts_list_table

    # Send a GET request to the API.
    api_params = {'prompts': all_prompts_list}
    try:
        response = requests.get(api_url, params=api_params, timeout=request_timeout)
    except requests.RequestException as err:
        print("Failed to retrieve corrections. Request error:", err)
        return

    if response.status_code != 200:
        print("Failed to retrieve corrections. Status code:", response.status_code)
        return

    # Presumably a list with one corrected text per prompt, in prompt order -
    # verify against the API.
    corrected_paragraphs = response.json()
    all_text = paragraphs + table_texts
    if len(corrected_paragraphs) != len(all_text):
        # zip() below stops at the shorter sequence; say so instead of
        # silently leaving part of the document uncorrected.
        print(f"Warning: expected {len(all_text)} corrections, received {len(corrected_paragraphs)}.")

    # Replace original paragraphs with corrected paragraphs.
    for i, (original, corrected) in enumerate(zip(all_text, corrected_paragraphs), start=1):
        word_replacer.replace_in_paragraph(original, corrected)
        print(f"Paragraph {i}: Replaced successfully!")

    # Save the document with replaced paragraphs.
    word_replacer.save(output_filepath)
    print(f"Saved updated document to: {output_filepath}\n")
# Script entry point: run the correction only when executed directly.
if __name__ == "__main__":
    main()
    print("All complete!")