J'ai rencontré un gros problème dans mon code.
TL;DR : après quelques commentaires, j'ai décidé de publier le code complet ici :
https://repl.it/repls/AustereShinyBetatest
Voici mon code:
def highlight_nonmodified(content: str) -> str:
    """Return *content* with ``#`` inserted in front of every ``{...}`` match."""
    # Group 1 captures the whole brace-delimited span; the replacement
    # template writes it back out behind a '#'.
    brace_span = re.compile(r'(?s)(\{.*?[^\}]+\})', re.I | re.S)
    return brace_span.sub(r'#\1', content)
def get_line(string_t: str, original: str) -> int:
    """Return the 1-based number of the first line of *original* containing *string_t*.

    The first character of *string_t* is dropped before searching.
    Returns -1 when no line contains the remaining text.
    """
    needle = string_t[1:]
    hits = (
        number
        for number, text in enumerate(original.splitlines(True), 1)
        if needle in text
    )
    return next(hits, -1)
def highligh_merge(original: str, modified: str) -> str:
    """Prepend a "tag not supported" notice to each line of *modified* starting with ``#``.

    The line number in the notice comes from ``get_line``.

    NOTE(review): as shown, the function has no ``return`` statement, so
    the updated text is not handed back to the caller.
    """
    for current in modified.splitlines(True):
        if not current.startswith('#'):
            continue
        line_number = get_line(current, original)
        # Raw string: the trailing "\n" is a literal backslash followed by "n".
        annotated = r"#Tag not supported at line{0}\n".format(line_number) + current
        modified = modified.replace(current, annotated)
Mon problème est que voici ce qui se passe:
Textfile.txt (original):
1. Here goes some text. {tag} A wonderful day. It's soon cristmas.
2. Happy 2019, soon. {Some useful tag!} Something else goes here.
3. Happy ending. Yeppe! See you.
4.
5 Happy KKK!
6. Happy B-Day!
7
8. Universe is cool!
9.
10. {Tagish}.
11.
12. {Slugish}. Here goes another line. {Slugish} since this is a new sentence.
13.
14. endline.
Modified.txt:
Here goes some text. A wonderful day. It's soon cristmas.
Happy 2019, soon. #{Some useful tag!} Something else goes here.
Happy ending. Yeppe! See you.
Happy KKK!
Happy B-Day!
Universe is cool!
.
#Error: Tag not supported at line-1\n#{Slugish}. Here goes another line. #{Slugish} since this is a new sentence.
endline.
Je n'arrive pas à obtenir une numérotation précise des lignes ni à comparer correctement les lignes. Qu'est-ce que je fais mal ici ? Je stocke deux copies, l'originale et la modifiée, puis je cherche le numéro de ligne dans le texte original en le parcourant ligne par ligne. Mais toujours sans succès ; est-ce seulement possible ? Merci beaucoup d'avance !
Je ne pense pas que cela puisse être fait si des morceaux de texte multilignes ont été supprimés. Cependant, si vous contrôlez le processus de marquage, vous pouvez inclure le numéro de ligne d'origine dans le tag:
{ foo:12 }
et puis le récupérer est trivial
# Take the first run of digits found in the tag text and convert it to an int.
original = int(re.search(r'\d+', tag).group(0))
Cette version modifiée de votre code:
import re
def annotate_tags(content: str) -> str:
    """Return *content* with every ``{tag}`` rewritten as ``{tag:N}``.

    ``N`` is the 1-based number of the line on which the tag appears.
    """
    tag_pattern = re.compile(r'(\{(?P<tag_value>[^}]+)\})')
    return ''.join(
        tag_pattern.sub(r'{\g<tag_value>:%s}' % line_no, text)
        for line_no, text in enumerate(content.splitlines(True), 1)
    )
def modify(content: str) -> str:
    """Return *content* with the supported, line-annotated tags removed."""
    # Each entry pairs a pattern with the text that replaces its matches.
    supported_tags = (
        (re.compile(r'(\{tag:\d+\})'), r''),
        (re.compile(r'(\{Tagish:\d+\})'), r''),
    )
    for pattern, replacement in supported_tags:
        # sub() returns the text unchanged when the pattern does not occur.
        content = pattern.sub(replacement, content)
    return content
def highlight_nonmodified(content: str) -> str:
    """Return *content* with ``#`` inserted in front of every ``{...}`` match."""
    # Group 1 captures the whole brace-delimited span; the replacement
    # template writes it back out behind a '#'.
    brace_span = re.compile(r'(?s)(\{.*?[^\}]+\})', re.I | re.S)
    return brace_span.sub(r'#\1', content)
def get_line(string_t: str, original: str) -> int:
    """Return the line number embedded in the first annotated tag of *string_t*.

    Annotated tags have the form ``{name:12}``.

    Args:
        string_t: Text to search for an annotated tag.
        original: Not consulted; kept so existing calls keep working.

    Returns:
        The number after the colon as an int, or -1 when *string_t*
        contains no annotated tag.
    """
    tag_pattern = re.compile(r'(\{[^}]+:(?P<line_no>\d+)\})')
    match = tag_pattern.search(string_t)
    if match is None:
        return -1
    # group() yields text; convert it so both return paths produce an int.
    return int(match.group('line_no'))
def highlight_merge(original: str, modified: str) -> str:
    """Insert an error line above every line that contains a ``#{...}`` tag.

    Args:
        original: Forwarded to ``get_line``.
        modified: Text to scan line by line.

    Returns:
        *modified* with ``#Tag not supported at line<N>`` inserted before
        each line in which the tag pattern is found, where ``<N>`` is the
        value returned by ``get_line`` for that line.
    """
    # DOTALL is requested through the flags argument; an inline ``(?s)``
    # placed after the leading '#' is rejected by Python 3.11+.
    tag_regex = re.compile(r'#(\{.*?[^\}]+\})', re.I | re.S)
    merged = []
    for line in modified.splitlines(True):
        if tag_regex.search(line):
            numer = get_line(line, original)
            merged.append("#Tag not supported at line{0}\n".format(numer))
        merged.append(line)
    # Rebuilding the text from its pieces (instead of str.replace on the
    # whole text) annotates each line exactly once, even when lines repeat.
    return ''.join(merged)
if __name__ == '__main__':
    file = 'textfile.txt'
    # Prefix every input line with its 1-based number, e.g. "3. text".
    with open(file, 'rt', encoding='utf-8') as f:
        raw = ''.join("{}. {}".format(i, s) for i, s in enumerate(f, 1))
    original = modified = raw
    # Pipeline: annotate tags with line numbers, drop the supported ones,
    # mark what is left, then insert the error lines.
    modified = annotate_tags(modified)
    modified = modify(modified)
    modified = highlight_nonmodified(modified)
    modified = highlight_merge(original, modified)
    with open("modified.txt", 'w', encoding='utf-8') as f:
        f.write(modified)
Génère cette sortie:
1. Here goes some text. A wonderful day. It's soon cristmas.
#Tag not supported at line2
2. Happy 2019, soon. #{Some useful tag!:2} Something else goes here.
3. Happy ending. Yeppe! See you.
4.
#Tag not supported at line5
5. #{begin:5}
6. Happy KKK!
7. Happy B-Day!
#Tag not supported at line8
8. #{end:8}
9.
10. Universe is cool!
11.
12. .
13.
#Tag not supported at line14
14. #{Slugish:14}. Here goes another line. #{Slugish:14} since this is a new sentence.
15.
16. endline.
Vous trouverez ci-dessous un court script permettant d'importer les fichiers, de nettoyer les données, de créer des dictionnaires énumérés et de générer des résultats (facultatif, basé sur la variable print_results).
(Si je n'interprète pas correctement votre question, merci de me le faire savoir!)
import re
from os import path
"""
Create an error class for trying to close a file that isn't open.
"""
class FileException(Exception):
    """Base class for the file-related errors defined in this script."""


class FileNotOpenError(FileException):
    """Error for trying to close a file that isn't open."""
"""
Input variables. base_path is just the directory where your files are located.
If they are in different directories, then use a second variable.
"""
# Placeholder directory; replace it with the folder that holds the text files.
base_path = r'C:\..\[folder containing text files]'
# Names of the two files to compare; import_data joins them onto base_path by default.
original_filename = 'test_text.txt'
modified_filename = 'modified_text.txt'
def import_data(file_name, root=base_path):
    """Read a text file into a list of lines.

    Args:
        file_name: Name of the file, joined onto *root*.
        root: Directory that contains the file.

    Returns:
        The lines of the file as returned by ``readlines()``, or None
        when the file is empty.
    """
    full_path = path.join(root, file_name)
    # The with-statement closes the file on exit, so no explicit close()
    # (and no handler around it) is needed.
    with open(full_path, 'r') as f:
        data = f.readlines()
    return data if data else None
def remove_numbering(input):
    """Strip a leading line number from *input*.

    Only digits at the very start of the text are removed, together with
    an optional period and the single whitespace character that follows;
    number-period combinations later in the line are left alone.
    """
    return re.sub(r'^([0-9]+[.]?\s)', '', input)
def text_dict(text_list):
    """Map 1-based positions to the entries of *text_list*.

    Each entry is first passed through ``remove_numbering`` and then has
    its leading whitespace stripped.
    """
    cleaned = (remove_numbering(entry).lstrip() for entry in text_list)
    return dict(enumerate(cleaned, 1))
def compare_files(original, modified, missing_list=None):
    """Return the line numbers whose text differs between two files.

    Args:
        original: Dict mapping line number to text for the original file.
        modified: Dict mapping line number to text for the modified file.
        missing_list: Unused; retained so existing calls keep working.

    Returns:
        A list of the keys of *original* whose value differs from, or is
        absent in, *modified*. None when either argument is not a dict.
    """
    # Check the parameter itself, not a module-level variable.
    if not (isinstance(original, dict) and isinstance(modified, dict)):
        return None
    # A line that is absent from *modified* counts as changed instead of
    # raising KeyError.
    return [
        idx for idx in original
        if idx not in modified or original[idx] != modified[idx]
    ]
def _print_table(title, lines_by_number, line_numbers):
    """Print one 'Line / Value' table for the given line numbers."""
    # Newlines and trailing whitespace are dropped so each value fits on one row;
    # a line number missing from the dict is shown with an empty value.
    values = {
        number: lines_by_number.get(number, '').replace('\n', '').rstrip()
        for number in line_numbers
    }
    # default=0 keeps max() from raising when there is nothing to show.
    max_len = max((len(value) for value in values.values()), default=0)
    print('\n\n\t{0}:\n'.format(title))
    print('\t\t{0:^7}{1:^{x}}'.format('Line', 'Value', x=max_len))
    for number in line_numbers:
        print('\t\t{0:>5}{1:2}{2:<{x}}'.format(str(number), '', values[number], x=max_len))


def comparison_findings(missing_list, original_dict, modified_dict):
    """Print the modified line numbers and both versions of each line.

    Args:
        missing_list: Line numbers to report.
        original_dict: Dict mapping line number to text for the original file.
        modified_dict: Dict mapping line number to text for the modified file.
    """
    print('Modifications found on lines:\n- ' + '\n- '.join(str(i) for i in missing_list))
    _print_table('Original', original_dict, missing_list)
    _print_table('Modified', modified_dict, missing_list)
if __name__ == '__main__':
    print_results = True
    # Import text files.
    orig_data = import_data(original_filename)
    mod_data = import_data(modified_filename)
    # Create enumerated dictionaries from text files.
    _original = text_dict(orig_data)
    _modified = text_dict(mod_data)
    # Get a list of modified lines.
    mod_list = compare_files(_original, _modified)
    # Output results of file comparison.
    if print_results:
        comparison_findings(mod_list, _original, _modified)
Lorsque vous appelez la fonction get_line dans highligh_merge, vous lui passez la variable line déjà modifiée, de sorte que line ne se trouve jamais réellement dans le fichier texte d'origine. Regardez la valeur de line :
#{Slugish}. Here goes another line. #{Slugish} since this is a new sentence.
Vous pouvez voir que ce n'est clairement pas dans le fichier original textfile.txt. Par conséquent, cela renvoie un numéro de ligne égal à -1.
Une solution serait de remplacer, dans votre fonction highligh_merge, la boucle for suivante :
for line in modified.splitlines(True):
par :
for numer, line in enumerate(modified.splitlines(True)):
Désormais, à chaque itération, numer vaut le numéro de la ligne moins 1. Utilisez simplement numer + 1 pour obtenir le numéro exact de la ligne que vous traitez.
J'espère que ça aide. :)