Puteti vizualiza intregul cod aici: https://pastebin.com/xT2C4PHP
Avem multe fisiere html in care exista urmatoarele taguri:
TITLUL ARTICOLULUI (va prelua titlul articolului din tagul <h1 class="den_articol" itemprop="name"></h1>)
<td><h1 class="den_articol" itemprop="name">Abracadabra, cine esti?</h1></td>
DATA ARTICOLULUI (va prelua data din tagul : <td class="text_dreapta"></td>)
<td class="text_dreapta">On Iulie 19, 2010, in <a href="https://neculaifantanaru.com/leadership-impact.html" title="Vezi toate articolele din Leadership Impact" class="external" rel="category tag">Leadership Impact</a>, by Neculai Fantanaru</td>
CORPUL ARTICOLULUI (va prelua toate propozitiile din tagurile <p class="text_obisnuit"> si <p class="text_obisnuit2">
Cu mentiunea ca aceste taguri sunt incadrate in sectiunea <!-- ARTICOL START --> si <!-- ARTICOL FINAL -->)
<!-- ARTICOL START -->
<p class="text_obisnuit">My name is Costel</p>
<p class="text_obisnuit2"><em>Games are my passion</em></p>
<!-- ARTICOL FINAL -->
LINK ARTICOL (la sfarsitul fiecarui articol va fi afisat link-ul fisierului html, precedat de https://neculaifantanaru.com/ )
abracadabra-cine-esti.html
Codul Python va prelua informatiile din aceste taguri, din toate paginile html, si le va converti intr-un singur fisier PDF. De asemenea, am facut un dictionar care sa transforme Unicode Characters in litere normale, in format UTF-8.
In acelasi folder in care aveti fisierul .py, va trebui sa aveti un alt folder numit "fonts" in care va trebui sa copiati fonturile windows cu care lucrati.
CODE:
"""Convert a folder of article HTML pages into PDFs and merge them.

For every ``.html`` file under a directory the script extracts the article
title, date and body paragraphs with regular expressions, renders them into
a PDF saved next to the HTML file, and finally merges the PDFs found in the
top-level directory into ``articles.pdf``.

The Kanit ``.ttf`` files are loaded from a ``fonts`` folder given as a
relative path, so it is resolved against the current working directory.
"""
from html import unescape
import os
import re

from fpdf import fpdf, html
from PyPDF2 import PdfFileMerger

SITE_URL = "https://neculaifantanaru.com/"
MERGED_PDF_NAME = "articles.pdf"

# Compiled once instead of on every file / paragraph.
TITLE_RE = re.compile(
    r'<td><h1 class="den_articol" itemprop="name">(.*?)</h1></td>')
DATE_RE = re.compile(r'<td class="text_dreapta">(.*?), in <a')
ARTICLE_RE = re.compile(
    r'<!-- ARTICOL START -->([\s\S]*?)<!-- ARTICOL FINAL -->')
# Group 1 is the CSS class, group 2 the inner HTML of the paragraph.
PARAGRAPH_RE = re.compile(r'<p class="(text_obisnuit2?)">(.*?)</p>')
# Non-greedy so that two spans on one line are converted independently.
SPAN_BOLD_RE = re.compile(r'<span class="text_obisnuit2">(.*?)</span>')


def read_text_from_file(file_path):
    """Return the content of a file, decoded as UTF-8.

    Bytes that are not valid UTF-8 are dropped (``errors='ignore'``).

    file_path: path of the file to read
    """
    with open(file_path, encoding='utf8', errors='ignore') as f:
        return f.read()


def write_to_file(text, file_path):
    """Write a text to a file as UTF-8, replacing any existing content.

    Characters that cannot be encoded are dropped.

    text: the text to write
    file_path: path of the file to write to
    """
    with open(file_path, 'wb') as f:
        f.write(text.encode('utf8', 'ignore'))


# Character normalisation applied to extracted text.  HTML character
# references are decoded separately with html.unescape(), so only the
# mappings that are not plain entity decoding are kept here.
dict_simboluri = {
    '\u015f': '\u0219',  # s with cedilla -> s with comma below
    '\u0163': '\u021b',  # t with cedilla -> t with comma below
    # NOTE(review): presumably the original key was "&nbsp;"; confirm.
    '\u00a0': ' ',       # no-break space -> regular space
}


def _replace_symbols(text):
    """Apply the ``dict_simboluri`` replacements to ``text``."""
    for simbol, inlocuitor in dict_simboluri.items():
        text = text.replace(simbol, inlocuitor)
    return text


def _clean_text(text):
    """Decode HTML character references, then normalise symbols.

    Meant for plain-text fields (title, date) that are written with
    ``cell``/``multi_cell`` and therefore are not parsed as HTML.
    """
    return _replace_symbols(unescape(text))


class PDF(fpdf.FPDF, html.HTMLMixin):
    """FPDF document with the ``write_html`` mixin."""


def _new_pdf():
    """Return a one-page PDF with the four Kanit font styles registered."""
    pdf = PDF()
    pdf.add_page()
    pdf.add_font("Kanit", fname="fonts/Kanit-Regular.ttf")
    pdf.add_font("Kanit", style="B", fname="fonts/Kanit-Bold.ttf")
    pdf.add_font("Kanit", style="I", fname="fonts/Kanit-Italic.ttf")
    pdf.add_font("Kanit", style="BI", fname="fonts/Kanit-BoldItalic.ttf")
    pdf.set_font("Kanit", size=24)
    return pdf


def _write_title(pdf, file_content, file_path):
    """Write the article title in red bold, or report that it is missing."""
    match = TITLE_RE.search(file_content)
    if match is None:
        print("Nu am gasit --- denumire articol --- in fisierul --- {} ---.".format(file_path))
        return
    pdf.set_text_color(204, 0, 0)  # red
    pdf.set_font('Kanit', size=14, style="B")
    pdf.multi_cell(w=190, txt=_clean_text(match.group(1)))
    pdf.ln()


def _write_date(pdf, file_content, file_path):
    """Write the article date in small blue bold, or report it missing."""
    match = DATE_RE.search(file_content)
    if match is None:
        print("Nu am gasit --- date --- in fisierul --- {} ---.".format(file_path))
        return
    pdf.set_text_color(0, 102, 204)  # blue
    pdf.set_font('Kanit', size=8, style="B")
    pdf.cell(txt=_clean_text(match.group(1)))
    for _ in range(4):
        pdf.ln()


def _write_body(pdf, file_content, file_path):
    """Render the paragraphs found between the ARTICOL START/FINAL markers.

    ``text_obisnuit`` paragraphs are rendered in the regular font and
    ``text_obisnuit2`` paragraphs in bold; each is followed by a line break.
    """
    match = ARTICLE_RE.search(file_content)
    if match is None:
        print("Nu am gasit --- ARTICOL START/FINAL --- in fisierul --- {} ---.".format(file_path))
        return

    articol = match.group(1)
    articol = articol.replace("&quot;", "\"")
    # NOTE(review): the entity form is presumed; the literal right single
    # quotation mark is what the pasted source showed.
    articol = articol.replace("&rsquo;", "'").replace("\u2019", "'")

    paragraphs = PARAGRAPH_RE.findall(articol)
    if not paragraphs:
        print("Nu am gasit -- paragrafe text_obisnuit -- in fisierul --- {} ---.".format(file_path))
        return

    for css_class, content in paragraphs:
        # Character references are left in place here: the markup is handed
        # to write_html(), and decoding "&lt;"/"&gt;" beforehand would turn
        # escaped text into tags.
        content = _replace_symbols(content)
        if css_class == "text_obisnuit2":
            pdf.set_font('Kanit', size=12, style="B")
            pdf.write_html(text=f'<p class="text_obisnuit2"><b>{content}</b></p>')
            pdf.ln()
            pdf.set_font('Kanit', size=12)  # back to the regular style
        else:
            pdf.write_html(text=f'<p class="text_obisnuit">{content}</p>')
            pdf.ln()


def _write_source_link(pdf, file_name):
    """Append a clickable "Source:" line pointing at the article's URL."""
    url = "{}{}".format(SITE_URL, file_name)
    pdf.ln()
    pdf.ln()
    pdf.set_font('Kanit', size=12, style="B")
    pdf.cell(txt="Source:")
    pdf.set_font('Kanit', size=12)
    pdf.set_text_color(0, 102, 204)  # blue
    pdf.cell(w=40, txt=url, link=url)


def save_to_pdf(directory_path):
    """Create one PDF next to every ``.html`` file under ``directory_path``.

    The directory is walked recursively.  A missing title, date, article
    section or paragraph list is reported with ``print`` and skipped; the
    PDF is still written with whatever parts were found.

    directory_path: folder that contains the HTML files
    """
    for root, dirs, files in os.walk(directory_path):
        for file_name in files:
            if not file_name.endswith(".html"):
                continue
            file_path = os.path.join(root, file_name)
            file_content = read_text_from_file(file_path)
            # Turn the site's "bold" spans into <b> tags for write_html().
            file_content = SPAN_BOLD_RE.sub(r'<b>\g<1></b>', file_content)

            pdf = _new_pdf()
            _write_title(pdf, file_content, file_path)
            _write_date(pdf, file_content, file_path)

            # Reset to black 12pt regardless of which header parts existed.
            pdf.set_text_color(0, 0, 0)
            pdf.set_font('Kanit', size=12)
            _write_body(pdf, file_content, file_path)

            _write_source_link(pdf, file_name)
            # splitext only strips the final extension, so dots elsewhere
            # in the path do not truncate the output name.
            pdf.output(os.path.splitext(file_path)[0] + '.pdf')


def merge_pdf_files(directory_path):
    """Merge the PDFs directly inside ``directory_path`` into articles.pdf.

    Only the top-level directory is processed (sub-folders are ignored).
    Files are appended in sorted name order, and an ``articles.pdf`` left
    over from a previous run is not merged into the new one.

    directory_path: folder that contains the PDF files
    """
    # NOTE(review): PdfFileMerger was superseded by PdfMerger in newer
    # PyPDF2 releases; adjust the import if the installed version rejects it.
    for root, dirs, files in os.walk(directory_path):
        pdf_names = sorted(
            name for name in files
            if name.endswith(".pdf") and name != MERGED_PDF_NAME
        )
        if not pdf_names:
            print("Nu am gasit fisiere PDF in --- {} ---.".format(root))
            break

        merger = PdfFileMerger()
        try:
            for file_name in pdf_names:
                print("PDF: ", file_name)
                merger.append(os.path.join(root, file_name))
            merger.write(os.path.join(root, MERGED_PDF_NAME))
        finally:
            merger.close()
        break  # top-level directory only


if __name__ == "__main__":
    save_to_pdf("c:\\Folder5\\")
    merge_pdf_files("c:\\Folder5\\")
That's all folks.
If you like my code, then do me a favor: translate your website into Romanian, "ro".
Also, there are other versions of this code: VERSION 2, VERSION 3 and VERSION 4.