Puteţi vizualiza întregul cod aici: https://pastebin.com/VCDVmkWW
Instalaţi Python.
ACESTA ESTE DESIGNUL MEU VECHI DE SITE WEB: Practic, trebuie sa selectezi informatia din tag-urile (clasele css) vechi si sa o copiezi in tag-urile (clasele css) noi.
VOI TRANSFERA DATELE PE ACEST NOU SITE WEB:
------- PYTHON CODE: ---------
#-------------------------------------------------------------------------------
# Author:      Neculai Fantanaru
#
# Created:     24/03/2022
# Copyright:   (c) Neculai Fantanaru
# Licence:     <your licence>
#-------------------------------------------------------------------------------
import os
import re

# Folder that holds the model pages (index.html, index2.html) and the snippet
# templates (template_categorie.txt, category-name.txt).
FOLDER_MODEL = "C:\\Folder1"

# --- patterns for the OLD site layout (the file being converted) -------------
_ARTICOL_CATEGORIE = re.compile(
    r'<!-- ARTICOL CATEGORIE START -->([\s\S]*?)<!-- ARTICOL CATEGORIE FINAL -->')
_SPAN_VECHI = re.compile(
    r'<td><span class="den_articol"><a href="(.*?)" class="linkMare">(.*?)</a></span></td>')
_SPAN_NOU = ('<td><span class="linkMare"><a href="{}" class="linkMare">'
             '<span class="den_articol">{}</span></a></span></td>')
_CATEG_LINK_TITLE = re.compile(
    r'<td><span class="linkMare"><a href="(.*?)" class="linkMare">'
    r'<span class="den_articol">(.*?)</span></a></span></td>')
_CATEG_DATE_LINK_TITLE_DESC = re.compile(
    r'<td class="text_dreapta">(.*?)<a href="(.*?)" title="(.*?)" '
    r'class="external" rel="category tag">(.*?)</a>, by Neculai Fantanaru</td>')
_PARAGRAF = re.compile(r'<p class="text_obisnuit2"><em>(.*?)</em></p>')
_CITESTE_BUTON = re.compile(
    r'<div align="right" id="external2"><a href="(.*?)">citeşte mai departe </a>')
_READ_MORE_BUTON = re.compile(
    r'<div align="right" id="external2"><a href="(.*?)">read more </a>')
_LISTA = re.compile(r'<ul id="sidebarNavigation">([\s\S]*?)</ul>')
_ELEMENTE_LISTA = re.compile(
    r'<li><a href="(.*?)" title="(.*?)">(.*?) \((.*?)\)</a></li>')
_LINKS = re.compile(r'<a href="(.*?)">')
_ULT_ART = re.compile(
    r'<!-- Ultimele articole -->([\s\S]*?)<!-- Ultimele articole final -->')
_ARTICOLE = re.compile(r'<li><a href="(.*?)">(.*?)</a></li>')
_TABLE = re.compile(r'<table[\s\S]*?</table>')
_ARTICLE_TITLE = re.compile(r'<h1 class="den_articol" itemprop="name">(.*?)</h1>')
_DATE = re.compile(r'<td class="text_dreapta">(.*?), in <a')
_SECTION = re.compile(
    r'<a href="(.*?)" title="(.*?)" class="external" rel="category tag">(.*?)</a>')

# --- patterns for the NEW site layout (model pages and templates) ------------
_H3 = re.compile(
    r'<h3 class="font-weight-normal" itemprop="name">'
    r'<a href="(.*?)" class="color-black">(.*?)</a></h3>')
_H3_TITLE = re.compile(
    r'<h3 class="font-weight-normal" itemprop="name">'
    r'<a href="javascript:void\(0\)" class="color-black">(.*?)</a></h3>')
_DATES_SECTION = re.compile(r'<!--STARTDATES-->([\s\S]*?)<!--FINNISHDATES-->')
_DATE_MODEL = re.compile(r'<a href="javascript:void\(0\)" class="color-black">(.*?)</a>')
_SECTION_MODEL = re.compile(
    r'<a href="(.*?)" title="(.*?)" '
    r'class="color-green font-weight-600 mx-1" id="hidden">(.*?)</a>')
_PARAGRAF_MODEL = re.compile(r'<p class="mb-35px color-grey line-height-25px">(.*?)</p>')
_READ_MORE_MODEL = re.compile(
    r'<a href="(.*?)" class="btn-setting color-black btn-hvr-up btn-blue btn-hvr-pink">'
    r'read more</a>')
_CATEGORIES = re.compile(r'<!-- Categories -->([\s\S]*?)<!-- BOOKS START -->')
_CATEGORIES_NAME = re.compile(r'<div class="categories-name">([\s\S]*?)</div>')
_A = re.compile(r'<a href="(.*?)" title="(.*?)">')
_P = re.compile(
    r'<p class="font-16 color-grey text-capitalize">'
    r'<i class="fa fa-angle-right font-14 color-blue mr-1"></i> (.*?) '
    r'<span>(.*?)</span> </p>')
_LINKS_MODEL = re.compile(r'<li><a cunt_code="\+\d+" href="(.*?)">')
_ULT_ART_MODEL = re.compile(r'<!-- Recent Post -->([\s\S]*?)<!-- Categories -->')
_ARTICOLE_MODEL = re.compile(r'<a href="(.*?)" class="color-grey">(.*?)</a>')

# --- patterns used on both the old page and the model ------------------------
_ARTICOL = re.compile(r'<!-- ARTICOL START -->([\s\S]*?)<!-- ARTICOL FINAL -->')
_FLAGS = re.compile(r'<!-- FLAGS_1 -->([\s\S]*?)<!-- FLAGS -->')
_STARS = re.compile(r'\$item_id = (.*?);')
_TITLE = re.compile(r'<title>(.*?)</title>')
_DESCRIPTION = re.compile(r'<meta name="description" content="(.*?)">')
_CANONICAL = re.compile(r'<link rel="canonical" href="(.*?)" />')


def read_text_from_file(file_path):
    """Return the whole content of a UTF-8 text file.

    file_path: path of the file to read
    """
    with open(file_path, encoding='utf8') as f:
        return f.read()


def write_to_file(text, file_path):
    """Write ``text`` to ``file_path`` as UTF-8, replacing any existing file.

    The file is opened in binary mode so that line endings are written
    exactly as they appear in ``text``; characters that cannot be encoded
    are dropped.

    text: the text to write
    file_path: path of the file to write
    """
    with open(file_path, 'wb') as f:
        f.write(text.encode('utf8', 'ignore'))


def _prima_potrivire(pattern, text):
    """Return the first ``findall`` result of ``pattern`` in ``text`` or None.

    An empty-string match is a valid result, so callers must test the
    return value with ``is None`` rather than by truthiness.
    """
    rezultate = pattern.findall(text)
    return rezultate[0] if rezultate else None


def _din_template(pattern, text, descriere):
    """Return the first match of ``pattern`` in a template, or fail loudly.

    IndexError is raised (the same exception type a bare ``[0]`` would
    produce) but with a message that names the missing template element.
    """
    potrivire = _prima_potrivire(pattern, text)
    if potrivire is None:
        raise IndexError("Template invalid, lipseste: {}".format(descriere))
    return potrivire


def _span_nou(potrivire):
    """Build the new-style heading cell from a match of ``_SPAN_VECHI``."""
    return _SPAN_NOU.format(potrivire.group(1), potrivire.group(2))


def _copiaza_camp(pattern, text_html, text_model, mesaj_lipsa):
    """Replace the model's value of one field with the value from the page.

    The field is the first capture of ``pattern``. When either side lacks
    the field, ``mesaj_lipsa`` is printed and the model is returned as is.
    """
    valoare = _prima_potrivire(pattern, text_html)
    valoare_model = _prima_potrivire(pattern, text_model)
    if valoare is None or valoare_model is None:
        print(mesaj_lipsa)
        return text_model
    return text_model.replace(valoare_model, valoare)


def _copiaza_meta(text_html, text_model, cale_fisier_html):
    """Copy <title>, meta description and canonical link into the model."""
    campuri = (
        (_TITLE, "Fisier html fara tag title: {}"),
        (_DESCRIPTION, "Fisier html fara tag description: {}"),
        (_CANONICAL, "Fisier html fara tag canonical: {}"),
    )
    for pattern, mesaj in campuri:
        text_model = _copiaza_camp(
            pattern, text_html, text_model, mesaj.format(cale_fisier_html))
    return text_model


def _copiaza_categorii(text_html, text_model, cale_folder_model):
    """Fill the model's category slots from the old sidebar navigation list."""
    lista = _prima_potrivire(_LISTA, text_html)
    if lista is None:
        print("Tag <ul> gol.")
        return text_model
    elemente_lista = _ELEMENTE_LISTA.findall(lista)
    if not elemente_lista:
        print("Niciun element <li>.")
        return text_model
    categories = _prima_potrivire(_CATEGORIES, text_model)
    if categories is None:
        print("No categories + books start")
        return text_model

    elemente_lista_model = _CATEGORIES_NAME.findall(categories)
    template_category = read_text_from_file(
        os.path.join(cale_folder_model, 'category-name.txt'))
    if not elemente_lista_model:
        return text_model

    # Placeholders of the snippet template are the same for every category.
    a = _din_template(_A, template_category, "<a href title> in category-name.txt")
    p = _din_template(_P, template_category, "<p> categorie in category-name.txt")
    # zip() stops at the shorter side, so a model with more slots than the
    # page has categories (or the reverse) no longer raises IndexError.
    for slot_model, element in zip(elemente_lista_model, elemente_lista):
        categorie_noua = template_category
        categorie_noua = categorie_noua.replace(a[0], element[0])
        categorie_noua = categorie_noua.replace(a[1], element[1])
        categorie_noua = categorie_noua.replace(p[0], element[2])
        categorie_noua = categorie_noua.replace(p[1], element[3])
        text_model = text_model.replace(slot_model, categorie_noua)
    return text_model


def _copiaza_flags(text_html, text_model):
    """Point the model's language-flag links at the links of the old page."""
    flags = _prima_potrivire(_FLAGS, text_html)
    if flags is None:
        print("Fara flags in articol original.")
        return text_model
    links = _LINKS.findall(flags)
    if not links:
        print("Fara linkuri in flags.")
        return text_model
    flags_model = _prima_potrivire(_FLAGS, text_model)
    if flags_model is None:
        print("Fara links in flags model")
        return text_model
    links_model = _LINKS_MODEL.findall(flags_model)
    if not links_model:
        print("Fara links in flags model")
        return text_model
    for link_model, link in zip(links_model, links):
        text_model = text_model.replace(link_model, link)
    return text_model


def _copiaza_stars(text_html, text_model):
    """Copy the PHP ``$item_id`` value of the rating widget into the model."""
    stars_php = _prima_potrivire(_STARS, text_html)
    if stars_php is None:
        print("No stars fisier original")
        return text_model
    stars_php_model = _prima_potrivire(_STARS, text_model)
    if stars_php_model is None:
        print("No stars fisier model")
        return text_model
    return text_model.replace(stars_php_model, stars_php)


def _copiaza_ultimele_articole(text_html, text_model):
    """Copy the "latest articles" links and captions into the model."""
    ult_art = _prima_potrivire(_ULT_ART, text_html)
    if ult_art is None:
        print("No lista articole fisier original")
        return text_model
    ult_art_model = _prima_potrivire(_ULT_ART_MODEL, text_model)
    if ult_art_model is None:
        print("No lista articole fisier model")
        return text_model
    articole = _ARTICOLE.findall(ult_art)
    if not articole:
        print("No articole fisier original")
        return text_model
    articole_model = _ARTICOLE_MODEL.findall(ult_art_model)
    if not articole_model:
        print("No articole fisier model")
        return text_model
    for (href_model, text_link_model), (href, text_link) in zip(articole_model, articole):
        text_model = text_model.replace(href_model, href)
        text_model = text_model.replace(text_link_model, text_link)
    return text_model


def _proceseaza_categorie(text_html, articol_categorie, cale_fisier_html,
                          cale_folder_model):
    """Convert a category page (a list of article teasers) using index2.html.

    Returns the finished page text, or None when index2.html has no
    ARTICOL START/FINAL markers and therefore nothing can be produced.
    """
    text_model = read_text_from_file(os.path.join(cale_folder_model, "index2.html"))

    # Rewrite the old heading cells into the nested-span form matched below.
    articol_categorie = _SPAN_VECHI.sub(_span_nou, articol_categorie)

    categ_link_title = _CATEG_LINK_TITLE.findall(articol_categorie)
    print("Total {} ARTICOLE".format(len(categ_link_title)))
    categ_date_link_title_desc = _CATEG_DATE_LINK_TITLE_DESC.findall(articol_categorie)
    paragraf = _PARAGRAF.findall(articol_categorie)
    # A page uses either the Romanian or the English "read more" button.
    butoane = (_CITESTE_BUTON.findall(articol_categorie)
               or _READ_MORE_BUTON.findall(articol_categorie))

    articol_model = _prima_potrivire(_ARTICOL, text_model)
    if articol_model is None:
        print("Nu exista articol categorie in index2.html")
        return None

    template_categorie = read_text_from_file(
        os.path.join(cale_folder_model, "template_categorie.txt"))
    h3 = _din_template(_H3, template_categorie, "<h3> titlu")
    dates_section = _din_template(
        _DATES_SECTION, template_categorie, "<!--STARTDATES-->/<!--FINNISHDATES-->")
    date_model = _din_template(_DATE_MODEL, dates_section, "data")
    section_model = _din_template(_SECTION_MODEL, dates_section, "link sectiune")
    paragraf_model = _din_template(_PARAGRAF_MODEL, template_categorie, "paragraf")
    read_more_model = _din_template(_READ_MORE_MODEL, template_categorie, "buton read more")

    complete = min(len(categ_link_title), len(categ_date_link_title_desc),
                   len(paragraf), len(butoane))
    if complete != len(categ_link_title):
        # Previously this situation ended in an IndexError half-way through.
        print("Atentie: doar {} din {} articole au toate campurile.".format(
            complete, len(categ_link_title)))

    bucati = []
    for (link, titlu), (data, sec_link, sec_title, sec_desc), text_paragraf, buton in zip(
            categ_link_title, categ_date_link_title_desc, paragraf, butoane):
        articol_nou = template_categorie
        articol_nou = articol_nou.replace(date_model, data.replace(', in', '').strip())
        articol_nou = articol_nou.replace(section_model[0], sec_link)
        articol_nou = articol_nou.replace(section_model[1], sec_title)
        articol_nou = articol_nou.replace(section_model[2], sec_desc.lstrip())
        articol_nou = articol_nou.replace(paragraf_model, text_paragraf)
        articol_nou = articol_nou.replace(read_more_model, buton)
        articol_nou = articol_nou.replace(h3[0], link)
        articol_nou = articol_nou.replace(h3[1], titlu)
        bucati.append(articol_nou + '\n')
    text_model = text_model.replace(articol_model, "".join(bucati))

    text_model = _copiaza_categorii(text_html, text_model, cale_folder_model)
    text_model = _copiaza_flags(text_html, text_model)
    text_model = _copiaza_stars(text_html, text_model)
    text_model = _copiaza_meta(text_html, text_model, cale_fisier_html)
    text_model = _copiaza_ultimele_articole(text_html, text_model)
    return text_model


def _proceseaza_articol(text_html, cale_fisier_html, cale_folder_model):
    """Convert a single-article page using index.html as the model.

    Every stage works on one running text, so a stage that finds nothing
    prints its message and leaves the text untouched instead of breaking
    the stages after it. Returns None when the ARTICOL START/FINAL markers
    are missing from the page or from the model.
    """
    text_model = read_text_from_file(os.path.join(cale_folder_model, "index.html"))

    text_articol = _prima_potrivire(_ARTICOL, text_html)
    text_articol_model = _prima_potrivire(_ARTICOL, text_model)
    if text_articol is None or text_articol_model is None:
        print("Fisier html fara ARTICOL START/FINAL: {}".format(cale_fisier_html))
        return None
    text_model = text_model.replace(text_articol_model, text_articol)

    text_model = _copiaza_meta(text_html, text_model, cale_fisier_html)

    # Remove the justify <div> wrapper and the first <table> from the article.
    # The untouched article is kept because title, date and section are read
    # from it further down.
    articol_vechi = _prima_potrivire(_ARTICOL, text_model)
    if articol_vechi is None:
        articol_vechi = text_articol
    articol_curat = articol_vechi.replace("<div align=\"justify\">", '')
    articol_curat = articol_curat.replace("</div>", '')
    text_table = _prima_potrivire(_TABLE, articol_curat)
    if text_table is None:
        print("No text table")
    else:
        articol_curat = articol_curat.replace(text_table, '')
    text_model = text_model.replace(articol_vechi, articol_curat)

    # Article title: old <h1> -> model <h3>.
    article_title = _prima_potrivire(_ARTICLE_TITLE, articol_vechi)
    if article_title is None:
        print("No article title.")
    else:
        h3_title = _prima_potrivire(_H3_TITLE, text_model)
        if h3_title is None:
            print("No h3 title.")
        else:
            text_model = text_model.replace(h3_title, article_title)

    # Publication date.
    date = _prima_potrivire(_DATE, articol_vechi)
    if date is None:
        print("No date.")
    else:
        date_section = _prima_potrivire(_DATES_SECTION, text_model)
        if date_section is None:
            print("No date section: <!--STARTDATES--><!--FINNISHDATES-->")
        else:
            date_model = _prima_potrivire(_DATE_MODEL, date_section)
            if date_model is None:
                print('No date in model.')
            else:
                text_model = text_model.replace(date_model, date)

    # Section (category) link: href, title and caption.
    section_model = _prima_potrivire(_SECTION_MODEL, text_model)
    if section_model is None:
        print("No section model.")
    else:
        section = _prima_potrivire(_SECTION, articol_vechi)
        if section is None:
            print("No section.")
        else:
            for valoare_model, valoare in zip(section_model, section):
                text_model = text_model.replace(valoare_model, valoare)

    text_model = _copiaza_categorii(text_html, text_model, cale_folder_model)
    text_model = _copiaza_flags(text_html, text_model)
    text_model = _copiaza_stars(text_html, text_model)
    text_model = _copiaza_ultimele_articole(text_html, text_model)
    return text_model


def copiaza_continut_html(cale_fisier_html, cale_fisiere_gata,
                          cale_folder_model=FOLDER_MODEL):
    """Move the content of one old-design page into the new-design model.

    Pages containing the ARTICOL CATEGORIE markers are treated as category
    pages (model ``index2.html`` plus ``template_categorie.txt``); all other
    pages are treated as single articles (model ``index.html``). The result
    is written to ``cale_fisiere_gata`` under the same file name. Nothing is
    written when the page or the model lacks the article markers.

    cale_fisier_html: path of the old page to convert
    cale_fisiere_gata: existing folder that receives the converted page
    cale_folder_model: folder containing the model pages and templates

    Raises IndexError when a snippet template lacks an expected placeholder
    and OSError when a page, model or template file cannot be read/written.
    """
    text_html = read_text_from_file(cale_fisier_html)

    articol_categorie = _prima_potrivire(_ARTICOL_CATEGORIE, text_html)
    if articol_categorie is not None:
        final_text = _proceseaza_categorie(
            text_html, articol_categorie, cale_fisier_html, cale_folder_model)
    else:
        final_text = _proceseaza_articol(text_html, cale_fisier_html, cale_folder_model)

    if final_text is None:
        # Do not leave an empty or half-built page in the output folder.
        print("Fisier sarit, nimic de scris: {}".format(cale_fisier_html))
        return

    file_path = os.path.join(cale_fisiere_gata, os.path.basename(cale_fisier_html))
    write_to_file(final_text, file_path)
    print("Scriere efectuata cu succes.")


def creare_fisiere_html(cale_folder_html, cale_fisiere_gata,
                        cale_folder_model=FOLDER_MODEL):
    """Convert every ``.html`` file found directly in ``cale_folder_html``.

    cale_folder_html: folder with the old-design pages
    cale_fisiere_gata: existing folder that receives the converted pages
    cale_folder_model: folder containing the model pages and templates
    """
    count = 0
    for f in os.listdir(cale_folder_html):
        if not f.endswith('.html'):
            continue
        cale_fisier_html = os.path.join(cale_folder_html, f)
        print("FISIER CURENT: ", cale_fisier_html)
        copiaza_continut_html(cale_fisier_html, cale_fisiere_gata, cale_folder_model)
        count += 1
    print("Numarul de fisiere modificate: ", count)


def main():
    creare_fisiere_html("C:\\Folder1\\fisiere_html", "C:\\Folder1\\fisiere_gata")


if __name__ == '__main__':
    main()
That's all folks.
If you like my code, then do me a favor: translate your website into Romanian, "ro".
Also, you can see other versions of this Python code: Version 2, Version 3, Version 4, or Version 5.