Puteti vizualiza intregul cod aici: https://pastebin.com/hnAyWW2Q
Avem folderul FLAGS in care exista urmatoarele subfoldere: HR, ES, RU, AR, BS...
In fiecare din aceste subfoldere (HR, ES, RU, AR, BS... ) exista circa 1600 fisiere html.
In fiecare fisier html se afla urmatoarele linii, identice:
<meta http-equiv="Content-Language" content="en"/>
<meta property="og:locale" content="en"/>
https://neculaifantanaru.com/en/
OUTPUT: Codul va prelua denumirea subfolderului in care se afla fisierul, de exemplu HR, si o va introduce (scrisa cu litere mici) in cele 3 linii, in loc de en:
<meta http-equiv="Content-Language" content="hr"/>
<meta property="og:locale" content="hr"/>
https://neculaifantanaru.com/hr/
La fel se va intampla si cu fisierele din celelalte subfoldere ( ES, RU, AR, BS... )
<!-- HTML generated using hilite.me -->
Optional: la sfarsitul codului veti gasi cateva formule Regex, necesare pentru efectuarea operatiunilor FIND/REPLACE in fiecare fisier in parte:
<!-- HTML generated using hilite.me -->
"""Localize the language markers of the HTML pages stored under FLAGS/<LANG>/.

Every direct subfolder of ``folder_principal`` (HR, ES, RU, ...) is named after
a language code.  For each HTML file below such a subfolder the script rewrites
the ``Content-Language`` and ``og:locale`` meta tags and the
``https://neculaifantanaru.com/en/`` links so they carry that code (lowercased),
then applies a few clean-up substitutions.  Only the standard library is used.
"""

import os
import re

# Main folder; each direct subfolder is named after a language code.
folder_principal = "e:\\FLAGS"
# Only files with this extension are rewritten.
extension_file = ".html"

_SITE_URL = "https://neculaifantanaru.com/"
_EN_LINK = _SITE_URL + "en/"

# <meta http-equiv="Content-Language" content="en"/>
_META_LANG_RE = re.compile(
    r'<meta http-equiv="Content-Language" content=".*?".*?/>'
)
# <meta property="og:locale" content="en"/>
_META_LOCALE_RE = re.compile(r'<meta property="og:locale" content=".*?".*?/>')
# A single content="..." attribute; [^"]* cannot run past the closing quote.
_CONTENT_ATTR_RE = re.compile(r'content="[^"]*"')
# The block holding the language flags.
_FLAGS_BLOCK_RE = re.compile(r"<!-- FLAGS_1 -->[\s\S]*?<!-- FLAGS -->")
# The link that follows the "fr" flag, i.e. the link of the English flag.
_FLAGS_LINK_RE = re.compile(
    r'(alt="fr" width="28" height="19" /></a> '
    r'<a href="https://neculaifantanaru\.com/).*?/'
)
# REGEX_1: " | anything</title>" -> "</title>"
_TITLE_SUFFIX_RE = re.compile(r"\x20\|.*(</title>)")
# REGEX_2: the rating widget table cell.
_RATING_CELL_RE = re.compile(
    r'<td width="149"><div class="rw-ui-container rw-class[\s\S]*?</td>'
)
# REGEX_3: everything from the start of the file through the first PHP block.
_PHP_PREAMBLE_RE = re.compile(r"\A[\s\S]*?<\?php[\s\S]*?\?>\s+")


def read_text_from_file(file_path):
    """Return the content of a file.

    file_path: path of the file to read (decoded as UTF-8)
    """
    with open(file_path, encoding='utf8') as f:
        return f.read()


def write_to_file(text, file_path):
    """Write a text to a file.

    text: the text to write (encoded as UTF-8, unencodable characters dropped)
    file_path: path of the file to write
    """
    with open(file_path, 'wb') as f:
        f.write(text.encode('utf8', 'ignore'))


def _set_meta_content(html, tag_re, lang_code):
    """Set the content attribute of the first tag matching *tag_re*.

    Returns a ``(new_html, found)`` tuple; *found* is False when no tag matched.
    """
    match = tag_re.search(html)
    if match is None:
        return html, False
    old_tag = match[0]
    # count=1 and a callable replacement: touch only the language attribute
    # and never interpret the language code as a regex replacement template.
    new_tag = _CONTENT_ATTR_RE.sub(
        lambda _m: f'content="{lang_code}"', old_tag, count=1
    )
    return html.replace(old_tag, new_tag), True


def localize_html(file_content, lang_code, file_path=""):
    """Return *file_content* rewritten for the language *lang_code*.

    file_content: the HTML text of one page
    lang_code: language code (folder name); it is lowercased before use
    file_path: only used in the warning printed when a pattern is not found

    The steps run in a fixed order: the FLAGS step restores the English flag
    link that the generic link replacement has just localized.
    """
    lang_code = lang_code.lower()

    file_content, found = _set_meta_content(
        file_content, _META_LANG_RE, lang_code
    )
    if not found:
        print("Nu am gasit --- en --- in fisierul --- {} ---.".format(file_path))

    file_content, found = _set_meta_content(
        file_content, _META_LOCALE_RE, lang_code
    )
    if not found:
        print("Nu am gasit --- en --- in fisierul --- {} ---.".format(file_path))

    # https://neculaifantanaru.com/en/ -> https://neculaifantanaru.com/<lang>/
    if _EN_LINK in file_content:
        file_content = file_content.replace(
            _EN_LINK, f"{_SITE_URL}{lang_code}/"
        )
    else:
        print("Nu am gasit --- neculaifantanaru.com/en/ --- in fisierul --- {} ---.".format(file_path))

    # Inside the FLAGS block the link after the "fr" flag must point to /en/.
    flags_match = _FLAGS_BLOCK_RE.search(file_content)
    if flags_match is None:
        print("Nu am gasit --- FLAGS --- in fisierul --- {} ---.".format(file_path))
    else:
        flags_content = flags_match[0]
        new_flags_content, replaced = _FLAGS_LINK_RE.subn(
            r"\g<1>en/", flags_content
        )
        if replaced == 0:
            print("Nu am gasit --- links FLAGS --- in fisierul --- {} ---.".format(file_path))
        else:
            file_content = file_content.replace(
                flags_content, new_flags_content
            )

    # REGEX_1: drop the " | ..." suffix of the title.
    file_content, replaced = _TITLE_SUFFIX_RE.subn(r"\1", file_content)
    if replaced == 0:
        print("Nu am gasit --- title --- in fisierul --- {} ---.".format(file_path))

    # REGEX_2: drop the rating widget cell.
    file_content = _RATING_CELL_RE.sub("", file_content)

    # REGEX_3: drop the leading PHP block.  The pattern is anchored at the
    # start and applied once; an unanchored global substitution would also
    # delete all the markup located between two PHP blocks.
    file_content = _PHP_PREAMBLE_RE.sub("", file_content, count=1)

    return file_content


def process_flags_folder(main_folder=folder_principal, extension=extension_file):
    """Rewrite in place every *extension* file below the language subfolders.

    main_folder: folder whose direct subfolders are named after language codes
    extension: file extension (case-insensitive) of the files to rewrite

    The language code always comes from the direct subfolder of *main_folder*,
    also for files in deeper directories, and each file is processed once.
    A missing *main_folder* is silently ignored.
    """
    if not os.path.isdir(main_folder):
        return
    suffix = extension.lower()
    for entry in sorted(os.listdir(main_folder)):
        subfolder_path = os.path.join(main_folder, entry)
        if not os.path.isdir(subfolder_path):
            continue
        for root, _dirs, files in os.walk(subfolder_path):
            for file_name in files:
                # Skip images and other non-HTML files: they are not UTF-8
                # text and must not be rewritten.
                if not file_name.lower().endswith(suffix):
                    continue
                file_path = os.path.join(root, file_name)
                file_content = read_text_from_file(file_path)
                write_to_file(
                    localize_html(file_content, entry, file_path), file_path
                )


def main():
    """Process the configured main folder."""
    process_flags_folder(folder_principal, extension_file)


if __name__ == "__main__":
    main()
That's all folks.
If you like my code, then do me a favor: translate your website into Romanian, "ro".
Also, there are other versions of this code: VERSION X, VERSION 2, VERSION 3, VERSION 4 and VERSION 5.