Полный код можно просмотреть здесь: https://pastebin.com/VCDVmkWW
Установите Python.
THIS IS MY OLD WEBSITE DESIGN. Basically, you need to select the information from the old tags (CSS classes) and copy it into the new tags (CSS classes).
Я ПЕРЕНЕСУ ДАННЫЕ НА ЭТОТ НОВЫЙ ВЕБ-САЙТ:
------- КОД PYTHON: ---------
#-------------------------------------------------------------------------------
# Author:      Neculai Fantanaru
#
# Created:     24/03/2022
# Copyright:   (c) Neculai Fantanaru
# Licence:     <your licence>
#-------------------------------------------------------------------------------
"""Copy page content from the old site layout into the new HTML templates.

Every ``.html`` file of a source folder is read, the interesting fragments
(article body, title, meta description, canonical link, sidebar categories,
flag links, star-rating id, recent articles) are located with regular
expressions, and the matching placeholders of a model page are replaced with
them.  Pages containing an ``ARTICOL CATEGORIE`` section are built from
``index2.html`` + ``template_categorie.txt``; all other pages are built from
``index.html``.  Both kinds use ``category-name.txt`` for the sidebar.

NOTE: replacements are done with ``str.replace`` on the captured placeholder
text, so every occurrence of that text in the model page is replaced, not only
the one inside the matched tag.
"""
import os
import re

# Folder holding index.html, index2.html, template_categorie.txt and
# category-name.txt when the caller does not pass another one.
DEFAULT_MODEL_FOLDER = "C:\\Folder1"


def read_text_from_file(file_path: str) -> str:
    """Return the content of a UTF-8 text file.

    file_path: path of the file to read
    """
    with open(file_path, encoding='utf8') as f:
        return f.read()


def write_to_file(text: str, file_path: str) -> None:
    """Write a text to a file as UTF-8, overwriting any existing file.

    text: the text to write
    file_path: path of the file to write to

    Characters that cannot be encoded as UTF-8 are dropped silently.
    """
    with open(file_path, 'wb') as f:
        f.write(text.encode('utf8', 'ignore'))


# --- patterns shared by both page kinds --------------------------------------
_CATEGORY_ARTICLES_RE = re.compile(
    r'<!-- ARTICOL CATEGORIE START -->([\s\S]*?)<!-- ARTICOL CATEGORIE FINAL -->')
_ARTICLE_RE = re.compile(r'<!-- ARTICOL START -->([\s\S]*?)<!-- ARTICOL FINAL -->')
_TITLE_RE = re.compile(r'<title>(.*?)</title>')
_DESCRIPTION_RE = re.compile(r'<meta name="description" content="(.*?)">')
_CANONICAL_RE = re.compile(r'<link rel="canonical" href="(.*?)" />')
# (pattern, label used in the "missing tag" message), in replacement order.
_HEAD_TAGS = (
    (_TITLE_RE, "title"),
    (_DESCRIPTION_RE, "description"),
    (_CANONICAL_RE, "canonical"),
)
_SIDEBAR_LIST_RE = re.compile(r'<ul id="sidebarNavigation">([\s\S]*?)</ul>')
_SIDEBAR_ITEM_RE = re.compile(
    r'<li><a href="(.*?)" title="(.*?)">(.*?) \((.*?)\)</a></li>')
_CATEGORIES_RE = re.compile(r'<!-- Categories -->([\s\S]*?)<!-- BOOKS START -->')
_CATEGORY_SLOT_RE = re.compile(r'<div class="categories-name">([\s\S]*?)</div>')
_CATEGORY_TPL_LINK_RE = re.compile(r'<a href="(.*?)" title="(.*?)">')
_CATEGORY_TPL_LABEL_RE = re.compile(
    r'<p class="font-16 color-grey text-capitalize">'
    r'<i class="fa fa-angle-right font-14 color-blue mr-1"></i> (.*?) '
    r'<span>(.*?)</span> </p>')
_FLAGS_RE = re.compile(r'<!-- FLAGS_1 -->([\s\S]*?)<!-- FLAGS -->')
_FLAG_LINK_RE = re.compile(r'<a href="(.*?)">')
# The attribute name below is spelled exactly as the model page spells it.
_FLAG_LINK_MODEL_RE = re.compile(r'<li><a cunt_code="\+\d+" href="(.*?)">')
_ITEM_ID_RE = re.compile(r'\$item_id = (.*?);')
_RECENT_OLD_RE = re.compile(
    r'<!-- Ultimele articole -->([\s\S]*?)<!-- Ultimele articole final -->')
_RECENT_MODEL_RE = re.compile(r'<!-- Recent Post -->([\s\S]*?)<!-- Categories -->')
_RECENT_OLD_ITEM_RE = re.compile(r'<li><a href="(.*?)">(.*?)</a></li>')
_RECENT_MODEL_ITEM_RE = re.compile(r'<a href="(.*?)" class="color-grey">(.*?)</a>')
_DATES_SECTION_RE = re.compile(r'<!--STARTDATES-->([\s\S]*?)<!--FINNISHDATES-->')
_DATE_MODEL_RE = re.compile(
    r'<a href="javascript:void\(0\)" class="color-black">(.*?)</a>')
_SECTION_MODEL_RE = re.compile(
    r'<a href="(.*?)" title="(.*?)" '
    r'class="color-green font-weight-600 mx-1" id="hidden">(.*?)</a>')

# --- patterns used only for category (article list) pages --------------------
_OLD_SPAN_RE = re.compile(
    r'<td><span class="den_articol"><a href="(.*?)" class="linkMare">(.*?)'
    r'</a></span></td>')
_NEW_SPAN_TEMPLATE = (
    r'<td><span class="linkMare"><a href="\1" class="linkMare">'
    r'<span class="den_articol">\2</span></a></span></td>')
_CATEG_LINK_TITLE_RE = re.compile(
    r'<td><span class="linkMare"><a href="(.*?)" class="linkMare">'
    r'<span class="den_articol">(.*?)</span></a></span></td>')
_CATEG_META_RE = re.compile(
    r'<td class="text_dreapta">(.*?)<a href="(.*?)" title="(.*?)" '
    r'class="external" rel="category tag">(.*?)</a>, by Neculai Fantanaru</td>')
_CATEG_PARAGRAPH_RE = re.compile(r'<p class="text_obisnuit2"><em>(.*?)</em></p>')
_BUTTON_RO_RE = re.compile(
    r'<div align="right" id="external2"><a href="(.*?)">citeşte mai departe </a>')
_BUTTON_EN_RE = re.compile(
    r'<div align="right" id="external2"><a href="(.*?)">read more </a>')
_TPL_H3_RE = re.compile(
    r'<h3 class="font-weight-normal" itemprop="name">'
    r'<a href="(.*?)" class="color-black">(.*?)</a></h3>')
_TPL_PARAGRAPH_RE = re.compile(
    r'<p class="mb-35px color-grey line-height-25px">(.*?)</p>')
_TPL_READ_MORE_RE = re.compile(
    r'<a href="(.*?)" '
    r'class="btn-setting color-black btn-hvr-up btn-blue btn-hvr-pink">read more</a>')

# --- patterns used only for single-article pages -----------------------------
_TABLE_RE = re.compile(r'<table[\s\S]*?</table>')
_ARTICLE_TITLE_RE = re.compile(r'<h1 class="den_articol" itemprop="name">(.*?)</h1>')
_H3_TITLE_MODEL_RE = re.compile(
    r'<h3 class="font-weight-normal" itemprop="name">'
    r'<a href="javascript:void\(0\)" class="color-black">(.*?)</a></h3>')
_ARTICLE_DATE_RE = re.compile(r'<td class="text_dreapta">(.*?), in <a')
_SECTION_OLD_RE = re.compile(
    r'<a href="(.*?)" title="(.*?)" class="external" rel="category tag">(.*?)</a>')


def _first(pattern, text):
    """Return the first ``findall`` result of *pattern* in *text*, or None."""
    found = pattern.findall(text)
    return found[0] if found else None


def _at(items, index):
    """Return ``items[index]``, or None when *index* is past the end."""
    return items[index] if index < len(items) else None


def _swap(text, old, new):
    """Replace every occurrence of *old* in *text* with *new*.

    Nothing is replaced when *old* is empty/None (``str.replace('', x)`` would
    insert *x* between every character) or when *new* is None.
    """
    if not old or new is None:
        return text
    return text.replace(old, new)


def _copy_head_tag(pattern, label, text_html, document, cale_fisier_html):
    """Copy the first match of *pattern* from the old page into the model."""
    source_value = _first(pattern, text_html)
    model_value = _first(pattern, document)
    if source_value is None or model_value is None:
        print("Fisier html fara tag {}: {}".format(label, cale_fisier_html))
        return document
    return _swap(document, model_value, source_value)


def _replace_sidebar_categories(text_html, document, cale_folder_model):
    """Fill the model's category slots from the old sidebar navigation list."""
    lists = _SIDEBAR_LIST_RE.findall(text_html)
    if not lists:
        print("Tag <ul> gol.")
        return document
    items = _SIDEBAR_ITEM_RE.findall(lists[0])
    if not items:
        print("Niciun element <li>.")
        return document
    categories = _first(_CATEGORIES_RE, document)
    if categories is None:
        print("No categories + books start")
        return document

    slots = _CATEGORY_SLOT_RE.findall(categories)
    template = read_text_from_file(
        os.path.join(cale_folder_model, 'category-name.txt'))
    link = _first(_CATEGORY_TPL_LINK_RE, template)
    label = _first(_CATEGORY_TPL_LABEL_RE, template)
    if link is None or label is None:
        print("category-name.txt has no link/label placeholders; "
              "categories left unchanged")
        return document

    # zip() stops at the shorter list, so a model with more slots than the old
    # page has categories keeps its remaining slots untouched.
    for slot, (href, title, name, count) in zip(slots, items):
        entry = _swap(template, link[0], href)
        entry = _swap(entry, link[1], title)
        entry = _swap(entry, label[0], name)
        entry = _swap(entry, label[1], count)
        document = _swap(document, slot, entry)
    return document


def _replace_flag_links(text_html, document):
    """Point the model's language-flag links at the old page's flag links."""
    flags = _first(_FLAGS_RE, text_html)
    if flags is None:
        print("Fara flags in articol original.")
        return document
    links = _FLAG_LINK_RE.findall(flags)
    if not links:
        print("Fara linkuri in flags.")
        return document
    flags_model = _first(_FLAGS_RE, document)
    if flags_model is None:
        print("Fara links in flags model")
        return document
    links_model = _FLAG_LINK_MODEL_RE.findall(flags_model)
    if not links_model:
        print("Fara links in flags model")
        return document
    for old_link, new_link in zip(links_model, links):
        document = _swap(document, old_link, new_link)
    return document


def _replace_item_id(text_html, document):
    """Copy the ``$item_id`` value of the star-rating PHP snippet."""
    source_id = _first(_ITEM_ID_RE, text_html)
    if source_id is None:
        print("No stars fisier original")
        return document
    model_id = _first(_ITEM_ID_RE, document)
    if model_id is None:
        print("No stars fisier model")
        return document
    return _swap(document, model_id, source_id)


def _replace_recent_articles(text_html, document):
    """Replace the model's recent-post links with the old page's latest articles."""
    recent_old = _first(_RECENT_OLD_RE, text_html)
    if recent_old is None:
        print("No lista articole fisier original")
        return document
    recent_model = _first(_RECENT_MODEL_RE, document)
    if recent_model is None:
        print("No lista articole fisier model")
        return document
    articles = _RECENT_OLD_ITEM_RE.findall(recent_old)
    if not articles:
        print("No articole fisier original")
        return document
    articles_model = _RECENT_MODEL_ITEM_RE.findall(recent_model)
    if not articles_model:
        print("No articole fisier model")
        return document
    for (old_href, old_text), (href, text) in zip(articles_model, articles):
        document = _swap(document, old_href, href)
        document = _swap(document, old_text, text)
    return document


def _build_category_page(text_html, articol_categorie, cale_fisier_html,
                         cale_folder_model):
    """Build a category (article list) page from index2.html.

    Returns the new page text, or '' when index2.html has no
    ARTICOL START/FINAL section.
    """
    text_html_model = read_text_from_file(
        os.path.join(cale_folder_model, "index2.html"))

    # Move the "den_articol" span inside the link so the old rows match the
    # pattern used to extract (href, title) below.
    articol_categorie = _OLD_SPAN_RE.sub(_NEW_SPAN_TEMPLATE, articol_categorie)

    titles = _CATEG_LINK_TITLE_RE.findall(articol_categorie)
    print("Total {} ARTICOLE".format(len(titles)))
    metas = _CATEG_META_RE.findall(articol_categorie)
    paragraphs = _CATEG_PARAGRAPH_RE.findall(articol_categorie)
    # Romanian pages use "citeşte mai departe", English ones "read more".
    buttons = (_BUTTON_RO_RE.findall(articol_categorie)
               or _BUTTON_EN_RE.findall(articol_categorie))

    model_article = _first(_ARTICLE_RE, text_html_model)
    if model_article is None:
        print("Nu exista articol categorie in index2.html")
        return ''

    template = read_text_from_file(
        os.path.join(cale_folder_model, "template_categorie.txt"))
    h3 = _first(_TPL_H3_RE, template) or (None, None)
    dates_section = _first(_DATES_SECTION_RE, template) or ''
    date_placeholder = _first(_DATE_MODEL_RE, dates_section)
    section_placeholder = (_first(_SECTION_MODEL_RE, dates_section)
                           or (None, None, None))
    paragraph_placeholder = _first(_TPL_PARAGRAPH_RE, template)
    read_more_placeholder = _first(_TPL_READ_MORE_RE, template)
    if None in (h3[0], date_placeholder, section_placeholder[0],
                paragraph_placeholder, read_more_placeholder):
        print("template_categorie.txt is missing some placeholders; "
              "they are left unchanged")
    if min(len(metas), len(paragraphs), len(buttons)) < len(titles):
        print("Fewer dates/paragraphs/buttons than articles in {}; "
              "missing fields are left unchanged".format(cale_fisier_html))

    entries = []
    for index, (href, title) in enumerate(titles):
        entry = template
        meta = _at(metas, index)
        if meta is not None:
            date, category_href, category_title, category_text = meta
            entry = _swap(entry, date_placeholder,
                          date.replace(', in', '').strip())
            entry = _swap(entry, section_placeholder[0], category_href)
            entry = _swap(entry, section_placeholder[1], category_title)
            entry = _swap(entry, section_placeholder[2], category_text.lstrip())
        entry = _swap(entry, paragraph_placeholder, _at(paragraphs, index))
        entry = _swap(entry, read_more_placeholder, _at(buttons, index))
        entry = _swap(entry, h3[0], href)
        entry = _swap(entry, h3[1], title)
        entries.append(entry + '\n')

    document = _swap(text_html_model, model_article, "".join(entries))
    document = _replace_sidebar_categories(text_html, document, cale_folder_model)
    document = _replace_flag_links(text_html, document)
    document = _replace_item_id(text_html, document)
    for pattern, label in _HEAD_TAGS:
        document = _copy_head_tag(pattern, label, text_html, document,
                                  cale_fisier_html)
    return _replace_recent_articles(text_html, document)


def _build_article_page(text_html, cale_fisier_html, cale_folder_model):
    """Build a single-article page from index.html and return its text.

    Every step works on the current state of the page, so a step that finds
    nothing to replace prints its message and the next steps still run.
    """
    document = read_text_from_file(os.path.join(cale_folder_model, "index.html"))

    source_article = _first(_ARTICLE_RE, text_html)
    model_article = _first(_ARTICLE_RE, document)
    if source_article is None or model_article is None:
        print("Fisier html fara ARTICOL START/FINAL: {}".format(cale_fisier_html))
    else:
        document = _swap(document, model_article, source_article)

    for pattern, label in _HEAD_TAGS:
        document = _copy_head_tag(pattern, label, text_html, document,
                                  cale_fisier_html)

    # Strip the old layout wrappers (justify <div>, first <table>) from the
    # article; the untouched copy is kept to extract title/date/section.
    article_old = _first(_ARTICLE_RE, document) or ''
    article_new = article_old.replace("<div align=\"justify\">", '')
    article_new = article_new.replace("</div>", '')
    table = _first(_TABLE_RE, article_new)
    if table is None:
        print("No text table")
    else:
        article_new = article_new.replace(table, '')
    document = _swap(document, article_old, article_new)

    # Article title -> model <h3>.
    article_title = _first(_ARTICLE_TITLE_RE, article_old)
    if article_title is None:
        print("No article title.")
    else:
        h3_title = _first(_H3_TITLE_MODEL_RE, document)
        if h3_title is None:
            print("No h3 title.")
        else:
            document = _swap(document, h3_title, article_title)

    # Publication date.
    date = _first(_ARTICLE_DATE_RE, article_old)
    if date is None:
        print("No date.")
    else:
        date_section = _first(_DATES_SECTION_RE, document)
        if date_section is None:
            print("No date section: <!--STARTDATES--><!--FINNISHDATES-->")
        else:
            date_model = _first(_DATE_MODEL_RE, date_section)
            if date_model is None:
                print('No date in model.')
            else:
                document = _swap(document, date_model, date)

    # Section (category) link: href, title, text.
    section_model = _first(_SECTION_MODEL_RE, document)
    if section_model is None:
        print("No section model.")
    else:
        section = _first(_SECTION_OLD_RE, article_old)
        if section is None:
            print("No section.")
        else:
            for old_value, new_value in zip(section_model, section):
                document = _swap(document, old_value, new_value)

    document = _replace_sidebar_categories(text_html, document, cale_folder_model)
    document = _replace_flag_links(text_html, document)
    document = _replace_item_id(text_html, document)
    return _replace_recent_articles(text_html, document)


def copiaza_continut_html(cale_fisier_html: str, cale_fisiere_gata: str,
                          cale_folder_model: str = DEFAULT_MODEL_FOLDER) -> None:
    """Convert one old-layout HTML file and write the result.

    cale_fisier_html: path of the old HTML file to convert
    cale_fisiere_gata: folder receiving the converted file (same base name);
        it is created when missing
    cale_folder_model: folder containing index.html, index2.html,
        template_categorie.txt and category-name.txt

    A file containing an ARTICOL CATEGORIE START/FINAL section is converted as
    a category page, anything else as a single-article page.  Missing pieces
    are reported with print() and skipped.
    """
    text_html = read_text_from_file(cale_fisier_html)
    articol_categorie = _first(_CATEGORY_ARTICLES_RE, text_html)
    if articol_categorie is not None:
        final_text = _build_category_page(
            text_html, articol_categorie, cale_fisier_html, cale_folder_model)
    else:
        final_text = _build_article_page(
            text_html, cale_fisier_html, cale_folder_model)

    if cale_fisiere_gata:
        os.makedirs(cale_fisiere_gata, exist_ok=True)
    file_path = os.path.join(cale_fisiere_gata, os.path.basename(cale_fisier_html))
    write_to_file(final_text, file_path)
    print("Scriere efectuata cu succes.")


def creare_fisiere_html(cale_folder_html: str, cale_fisiere_gata: str,
                        cale_folder_model: str = DEFAULT_MODEL_FOLDER) -> int:
    """Convert every ``.html`` file found directly in *cale_folder_html*.

    cale_folder_html: folder with the old HTML files
    cale_fisiere_gata: folder receiving the converted files
    cale_folder_model: folder containing the model pages and templates

    Returns the number of files processed.
    """
    count = 0
    for f in os.listdir(cale_folder_html):
        if not f.endswith('.html'):
            continue
        cale_fisier_html = os.path.join(cale_folder_html, f)
        print("FISIER CURENT: ", cale_fisier_html)
        copiaza_continut_html(cale_fisier_html, cale_fisiere_gata,
                              cale_folder_model)
        count += 1
    print("Numarul de fisiere modificate: ", count)
    return count


def main():
    """Convert the default source folder into the default output folder."""
    creare_fisiere_html("C:\\Folder1\\fisiere_html", "C:\\Folder1\\fisiere_gata")


if __name__ == '__main__':
    main()
That's all, folks.
Также ознакомьтесь с этой ВЕРСИЕЙ 2 или ВЕРСИЯ 3 или ВЕРСИЯ 4 или ВЕРСИЯ 5 или ВЕРСИЯ 6 или ВЕРСИЯ 7