Полный код можно просмотреть здесь: https://pastebin.com/1bfSidiL
Установите Python.
Код ниже заменит диакритические знаки (знаки ударения).
Точнее, такие символы, как (Ă, ă, Î, î, Ș, ș, Ț, ț, Â, â), будут заменены на (A, a, I, i, S, s, T, t, A, a) в HTML-тегах <title></title> и <meta name="description" content="...">.
"""Clean up the <title> and description meta tag of HTML files in a folder.

Accented letters are replaced with their plain ASCII counterparts (case is
preserved) and a fixed set of punctuation characters is removed or replaced.
Every matching file is rewritten in place.
"""
import os
import re

# Folder whose HTML files are rewritten in place (literal kept verbatim).
cale_folder_html = r"d:\\Folder1"
# Suffixes of the files to process; a tuple so str.endswith() can test both.
extension_file = (".html", ".htm")

# Files in the folder that must never be modified.
_SKIPPED_FILES = frozenset({'y_key_e479323ce281e459.html', 'directory.html'})

# Accented letters grouped by the ASCII letter that replaces them.
_LETTER_GROUPS = (
    ('ăâãàáåä', 'a'),
    ('ĂÂ', 'A'),
    ('îí', 'i'),
    ('ÎĨ', 'I'),
    ('éêèë', 'e'),
    ('Ë', 'E'),
    ('șşš', 's'),
    ('ȘŞ', 'S'),
    ('țţ', 't'),
    ('ȚŢ', 'T'),
)

# Punctuation that is replaced by something other than the empty string.
_PUNCTUATION = {
    '–': '- ',
    '!': ' ',
    '(': '-',
    ')': ' ',
}

# Punctuation that is deleted outright.
_REMOVED_CHARS = '…"[]/{}’”„“<«»:&'

# Single-character replacements, applied in one pass by str.translate().
# NOTE: entries of the original table whose keys had been destroyed by
# encoding damage (shown as '?', a plain space or an identity mapping) are
# intentionally not reproduced: they turned real question marks into letters.
_CHAR_TABLE = {
    **{ord(ch): plain for chars, plain in _LETTER_GROUPS for ch in chars},
    **{ord(ch): repl for ch, repl in _PUNCTUATION.items()},
    **{ord(ch): '' for ch in _REMOVED_CHARS},
}

# Multi-character replacements, applied after the single-character pass.
# 'a©' is presumably what is left of a mis-decoded 'é' once the accent on the
# first character has been stripped -- kept from the original table.
_SEQUENCE_REPLACEMENTS = (
    (',,', ' '),
    ('a©', 'e'),
)

# Both patterns capture (prefix, text, suffix) so that only the text between
# the delimiters is rewritten and the rest of the markup is left untouched.
_META_DESCRIPTION_RE = re.compile(
    r'(<meta\s+name="description"\s+content=")([^"]*)(")',
    re.IGNORECASE,
)
_TITLE_RE = re.compile(r'(<title>)(.*?)(</title>)', re.IGNORECASE | re.DOTALL)


def read_text_from_file(file_path):
    """Return the content of a file decoded as UTF-8.

    file_path: path of the file to read.
    """
    with open(file_path, encoding='utf8') as f:
        return f.read()


def write_to_file(text, file_path):
    """Write text to a file as UTF-8, replacing any previous content.

    text: the text to write; characters that cannot be encoded are dropped.
    file_path: path of the file to write.
    """
    with open(file_path, 'wb') as f:
        f.write(text.encode('utf8', 'ignore'))


def _normalize_text(text):
    """Return text with accented letters and listed punctuation replaced."""
    text = text.translate(_CHAR_TABLE)
    for old, new in _SEQUENCE_REPLACEMENTS:
        text = text.replace(old, new)
    return text


def _normalize_match(match):
    """Rebuild a (prefix, text, suffix) match with its text normalized."""
    return match.group(1) + _normalize_text(match.group(2)) + match.group(3)


def main(folder=cale_folder_html):
    """Normalize the title and description of every HTML file in folder.

    Files without a description meta tag are reported and left unchanged;
    a missing <title> is tolerated (only the description is rewritten).

    folder: directory to scan (not recursive).
    Returns the number of files that were rewritten.
    """
    print('Going through folder')
    amount = 0
    for filename in os.listdir(folder):
        if filename in _SKIPPED_FILES or not filename.endswith(extension_file):
            continue

        path = os.path.join(folder, filename)
        html_text = read_text_from_file(path)

        if _META_DESCRIPTION_RE.search(html_text) is None:
            print("Text has no description")
            continue

        # A callable replacement is used so that backslashes in the page text
        # are never interpreted as regex replacement escapes.
        html_text = _META_DESCRIPTION_RE.sub(_normalize_match, html_text)

        title_match = _TITLE_RE.search(html_text)
        if title_match is not None:
            print(_normalize_text(title_match.group(2)))
            html_text = _TITLE_RE.sub(_normalize_match, html_text)

        amount += 1
        print(f'{filename} parsed ({amount})')
        write_to_file(html_text, path)
    return amount


if __name__ == "__main__":
    main()
That's all folks.
Также ознакомьтесь с другими вариантами: ВЕРСИЯ 2, ВЕРСИЯ 3, ВЕРСИЯ 4, ВЕРСИЯ 5, ВЕРСИЯ 6 или ВЕРСИЯ 7.