You can view the full code here: https://pastebin.com/V1MDx0yd
Install Python.
There are several HTML links, all contained in the section between <!-- FLAGS_1 --> and <!-- FLAGS -->.
All html files have this structure below, only the links are different. So none of the links below should be repeated in other html pages (in the FLAGS section).
And all the links start with https://neculaifantanaru.com/
<!-- FLAGS_1 --> <div class="cautareField"> <div align="right"> <a href="https://neculaifantanaru.com/stralucirea-nestematei.html"> <a href="https://neculaifantanaru.com/fr/l-eclat-de-la-gemme.html"> <a href="https://neculaifantanaru.com/en/brilliance-of-the-gem.html"> <a href="https://neculaifantanaru.com/es/gema-stargaionss.html"> <a href="https://neculaifantanaru.com/pt/brilho-da-gema.html"> <a href="https://neculaifantanaru.com/ar/my-name-is-prince.html"> <a href="https://neculaifantanaru.com/zh/books-and-magic.html"> <a href="https://neculaifantanaru.com/hi/many-things.html"> <a href="https://neculaifantanaru.com/de/horror-scenario.html"> <a href="https://neculaifantanaru.com/ru/everything-is-here.html"> </div> </div> <!-- FLAGS -->
THE CODE: Find all HTML files (in the same folder) that share identical links in the <!-- FLAGS_1 --> section. The code also reports the recurring links and the HTML pages in which they appear.
import itertools
import os
import re
import sys

# Text between the FLAGS_1 and FLAGS comment markers. The group is lazy so the
# section ends at the first closing marker that follows the opening one.
_FLAGS_PATTERN = re.compile(r'<!-- FLAGS_1 -->([\s\S]*?)<!-- FLAGS -->')
# Value of every double-quoted href attribute.
_LINK_PATTERN = re.compile(r'href="(.*?)"')


def read_text_from_file(file_path):
    """Return the whole content of a UTF-8 encoded text file.

    file_path: path of the file to read
    """
    with open(file_path, encoding='utf8') as f:
        return f.read()


def write_to_file(text, file_path):
    """Write a text to a file, replacing any previous content.

    text: the text to write
    file_path: path of the file to write to
    """
    # Binary mode keeps '\n' untouched on every platform; characters that
    # cannot be encoded as UTF-8 are dropped instead of raising.
    with open(file_path, 'wb') as f:
        f.write(text.encode('utf8', 'ignore'))


def extragere_linkuri(cale_fisier_html):
    """Return the distinct links found in the FLAGS section of an HTML file.

    Only the first ``<!-- FLAGS_1 --> ... <!-- FLAGS -->`` section is
    inspected; href values outside of it are ignored.

    cale_fisier_html: path of the HTML file to inspect
    Returns a sorted list of unique href values, or an empty list when the
    file has no FLAGS section.
    """
    text_html = read_text_from_file(cale_fisier_html)
    sectiune = _FLAGS_PATTERN.search(text_html)
    if sectiune is None:
        # Without this guard the link regex would be applied to an empty
        # list and raise TypeError.
        return []
    return sorted(set(_LINK_PATTERN.findall(sectiune.group(1))))


def _raport_perechi(cai_fisiere, seturi_linkuri, filtru=None):
    """Print and return the report of every pair of files sharing links.

    cai_fisiere: paths of the inspected files
    seturi_linkuri: one set of links per file, in the same order
    filtru: optional predicate; when given, only shared links for which it
        returns a true value are reported
    """
    bucati = []
    perechi = itertools.combinations(zip(cai_fisiere, seturi_linkuri), 2)
    for (cale_a, linkuri_a), (cale_b, linkuri_b) in perechi:
        # Sorted so the report is identical from one run to the next.
        comune = sorted(linkuri_a & linkuri_b)
        if filtru is not None:
            comune = [link for link in comune if filtru(link)]
        if not comune:
            continue

        concluzie = 'Fisier {} ARE LINKURI IN COMUN CU: {}'.format(cale_a, cale_b)

        print("Fisiere comune: ")
        for link in comune:
            print(link, '\n')
        print(concluzie)
        print('\n\n')

        bucati.append("Fisiere comune: \n")
        bucati.extend(link + '\n' for link in comune)
        bucati.append(concluzie + '\n\n')
    return ''.join(bucati)


def verificare_fisiere(cale_folder_fisiere, cale_fisier_rezultat, limba="en"):
    """Report which HTML files of a folder share links in their FLAGS section.

    The report has two parts, separated by a header showing ``limba`` in
    upper case: first every pair of files with at least one common link,
    then the same pairs restricted to links having ``limba`` as one of
    their '/'-separated segments. The report is printed and also written to
    ``cale_fisier_rezultat``.

    cale_folder_fisiere: folder whose ``.html`` files are compared
        (sub-folders are not visited)
    cale_fisier_rezultat: path of the text file receiving the report
    limba: language segment used for the second part, e.g. "en" or "fr".
        NOTE: an empty string matches every absolute URL, because the "//"
        after the scheme produces an empty segment.
    """
    cai_fisiere = [
        os.path.join(cale_folder_fisiere, nume)
        for nume in sorted(os.listdir(cale_folder_fisiere))
        if nume.endswith('.html')
    ]
    seturi_linkuri = [set(extragere_linkuri(cale)) for cale in cai_fisiere]

    rezultate = _raport_perechi(cai_fisiere, seturi_linkuri)

    antet = "==========={}============\n\n".format(limba.upper())
    print(antet)
    rezultate += antet

    rezultate += _raport_perechi(
        cai_fisiere,
        seturi_linkuri,
        filtru=lambda link: limba in link.split('/'),
    )

    write_to_file(rezultate, cale_fisier_rezultat)


if __name__ == "__main__":
    # Pass limba="<code>" as a third argument to filter on another language.
    verificare_fisiere("c:\\Folder1", "c:\\Folder1\\rezultate.txt")
    # verificare_fisiere("e:\\Carte\\BB\\17 - Site Leadership\\Principal\\en", "c:\\Folder1\\rezultate.txt")
That's all folks.
If you like my code, then do me a favor: translate your website into Romanian, "ro".
Also, see this VERSION 2 or VERSION 3 or VERSION 4 or VERSION 5 or VERSION 6 or VERSION 7