You can view the entire code here: https://pastebin.com/hnAyWW2Q
We have many html files in which there are the following tags:
TITLE OF THE ARTICLE (will take the title of the article from the tag <h1 class="den_articol" itemprop="name"></h1>)
<td><h1 class="den_articol" itemprop="name">Abracadabra, cine esti?</h1></td>
DATE OF THE ARTICLE (will take the date from the tag : <td class="text_dreapta"></td>)
<td class="text_dreapta">On Iulie 19, 2010, in <a href="https://neculaifantanaru.com/leadership-impact.html" title="Vezi toate articolele din Leadership Impact" class="external" rel="category tag">Leadership Impact</a>, by Neculai Fantanaru</td>
BODY OF THE ARTICLE (will take all the sentences from the tags <p class="text_obisnuit"> and <p class="text_obisnuit2">.
Note that these tags are located in the section between <!-- ARTICOL START --> and <!-- ARTICOL FINAL -->.)
<!-- ARTICOL START -->
<p class="text_obisnuit">My name is Costel</p>
<p class="text_obisnuit2"><em>Games are my passion</em></p>
<!-- ARTICOL FINAL -->
ARTICLE LINK (at the end of each article, the link to the HTML file is displayed, prefixed with https://neculaifantanaru.com/ )
abracadabra-cine-esti.html
The Python code extracts the information from these tags in all the HTML pages and converts it into a single PDF file. I also made a dictionary that replaces special Unicode characters with normal letters, in UTF-8 format.
In the same folder as the .py file, you need another folder called "fonts", into which you copy the font files you work with (the code below loads the Kanit .ttf files from it).
CODE:
"""Convert article HTML pages into PDF files and merge them into one PDF.

For every ``.html`` file found under a directory, the script extracts the
article title, the date line, the paragraphs located between the
``<!-- ARTICOL START -->`` and ``<!-- ARTICOL FINAL -->`` markers and writes
them, followed by a link to the article, into a PDF saved next to the HTML
file. The PDFs of the top-level directory are then merged into
``articles.pdf``.

The Kanit fonts are loaded from the relative path ``fonts/``.
"""
import os
import re

from fpdf import fpdf, html
from PyPDF2 import PdfFileMerger

# Name of the merged output; it is skipped when collecting the PDFs to merge.
MERGED_PDF_NAME = "articles.pdf"

# Patterns used on every file, compiled once.
_SPAN_BOLD_REGEX = re.compile(r'<span class="text_obisnuit2">(.*?)</span>')
_TITLE_REGEX = re.compile(
    '<td><h1 class="den_articol" itemprop="name">(.*?)</h1></td>')
_DATE_REGEX = re.compile('<td class="text_dreapta">(.*?), in <a')
_ARTICLE_REGEX = re.compile(
    r'<!-- ARTICOL START -->([\s\S]*?)<!-- ARTICOL FINAL -->')
_PARAGRAPH_REGEX = re.compile('<p class="text_obisnuit.*?">.*?</p>')
_PLAIN_PARAGRAPH_REGEX = re.compile('<p class="text_obisnuit">(.*?)</p>')
_BOLD_PARAGRAPH_REGEX = re.compile('<p class="text_obisnuit2">(.*?)</p>')


def read_text_from_file(file_path):
    """Return the content of a file as text.

    file_path: path of the file to read. The file is decoded as UTF-8 and
        bytes that cannot be decoded are dropped (``errors='ignore'``).
    """
    with open(file_path, encoding='utf8', errors='ignore') as f:
        return f.read()


def write_to_file(text, file_path):
    """Write a text to a file, encoded as UTF-8.

    text: the text to write; characters that cannot be encoded are dropped.
    file_path: path of the file to write (overwritten if it exists).
    """
    with open(file_path, 'wb') as f:
        f.write(text.encode('utf8', 'ignore'))


# Replacement table applied to the title and to every paragraph.
# The assignments are kept in their original order: a later assignment to the
# same key overrides the earlier value, and replacements are applied in
# insertion order.
# NOTE(review): most entries map a character to itself, which suggests the
# keys were HTML entities that got decoded when the code was published --
# confirm against the original script before relying on this table.
dict_simboluri = dict()
dict_simboluri['ă'] = 'ă'
dict_simboluri['â'] = 'â'
dict_simboluri['ã'] = 'ã'
dict_simboluri['â'] = 'â'
dict_simboluri['ă'] = 'ă'
dict_simboluri['â'] = 'a'
dict_simboluri[' '] = ' '
dict_simboluri['î'] = 'î'
dict_simboluri['Î'] = 'Î'
dict_simboluri['î'] = 'î'
dict_simboluri['î'] = 'î'
dict_simboluri['Î'] = 'Î'
dict_simboluri['Î'] = 'Î'
dict_simboluri['î'] = 'î'
dict_simboluri['Î'] = 'i'
dict_simboluri['Î'] = 'Î'
dict_simboluri[' '] = ' '
dict_simboluri['ș'] = 'ș'
dict_simboluri['Ș'] = 'Ș'
dict_simboluri['Ş'] = 'Ş'
dict_simboluri['ș'] = 'ș'
dict_simboluri['ş'] = 'ș'
dict_simboluri['&'] = ''
dict_simboluri['ț'] = 'ț'
dict_simboluri['ţ'] = 'ț'
dict_simboluri['Ţ'] = 'Ţ'
dict_simboluri['ț'] = 'ț'


class PDF(fpdf.FPDF, html.HTMLMixin):
    """PDF document type used by save_to_pdf (FPDF combined with HTMLMixin)."""


def _replace_symbols(text):
    """Apply every replacement of dict_simboluri to text, in insertion order."""
    for simbol, inlocuitor in dict_simboluri.items():
        text = text.replace(simbol, inlocuitor)
    return text


def _spans_to_bold(file_content):
    """Turn <span class="text_obisnuit2"> elements into <b> elements.

    The match is non-greedy so that several spans on the same line are
    converted one by one instead of being merged into a single <b>.
    """
    return _SPAN_BOLD_REGEX.sub(r'<b>\g<1></b>', file_content)


def _write_title(pdf, file_content, file_path):
    """Write the article title in bold red, or report that it is missing."""
    den_articol = _TITLE_REGEX.search(file_content)
    if den_articol is None:
        print("Nu am gasit --- denumire articol --- in fisierul --- {} ---.".format(file_path))
        return
    pdf.set_text_color(204, 0, 0)  # red
    pdf.set_font('Kanit', size=14, style="B")
    pdf.multi_cell(w=190, txt=_replace_symbols(den_articol.group(1)))
    pdf.ln()


def _write_date(pdf, file_content, file_path):
    """Write the date line in small bold blue, then leave blank space."""
    date = _DATE_REGEX.search(file_content)
    if date is None:
        print("Nu am gasit --- date --- in fisierul --- {} ---.".format(file_path))
    else:
        pdf.set_text_color(0, 102, 204)  # blue
        pdf.set_font('Kanit', size=8, style="B")
        pdf.cell(txt=date.group(1))
    # Blank lines separating the header from the article body.
    for _ in range(4):
        pdf.ln()


def _write_body(pdf, file_content, file_path):
    """Write the paragraphs found between the ARTICOL START/FINAL markers.

    Paragraphs of class text_obisnuit are written as regular text, those of
    class text_obisnuit2 in bold; any other matching class is skipped.
    """
    pdf.set_text_color(0, 0, 0)  # black (default)
    pdf.set_font('Kanit', size=12)

    articol = _ARTICLE_REGEX.search(file_content)
    if articol is None:
        print("Nu am gasit --- ARTICOL START/FINAL --- in fisierul --- {} ---.".format(file_path))
        return
    articol = articol.group(1)
    # Decode the quote entity so it is printed as a plain double quote.
    articol = articol.replace("&quot;", "\"")
    articol = articol.replace("’", "'")

    pars = _PARAGRAPH_REGEX.findall(articol)
    if not pars:
        print("Nu am gasit -- paragrafe text_obisnuit -- in fisierul --- {} ---.".format(file_path))
        return

    for par in pars:
        if '<p class="text_obisnuit">' in par:
            content_regex, bold = _PLAIN_PARAGRAPH_REGEX, False
        elif '<p class="text_obisnuit2">' in par:
            content_regex, bold = _BOLD_PARAGRAPH_REGEX, True
        else:
            continue

        content = content_regex.findall(par)
        if not content:
            print("Nu am gasit text in paragraful {}, fisierul {}.".format(par, file_path))
            continue

        text = _replace_symbols(content[0])
        if bold:
            pdf.set_font('Kanit', size=12, style="B")
            pdf.write_html(text=f'<p class="text_obisnuit2"><b>{text}</b></p>')
        else:
            pdf.write_html(text=f'<p class="text_obisnuit">{text}</p>')
        # Empty line between paragraphs.
        pdf.ln()
        if bold:
            # Back to the regular body font.
            pdf.set_font('Kanit', size=12)


def _write_source_link(pdf, file_name):
    """Write the "Source:" label followed by a clickable link to the article."""
    url = "https://neculaifantanaru.com/{}".format(file_name)
    pdf.ln()
    pdf.ln()
    pdf.set_font('Kanit', size=12, style="B")
    pdf.cell(txt="Source:")
    pdf.set_font('Kanit', size=12)
    pdf.set_text_color(0, 102, 204)  # blue
    pdf.cell(w=40, txt=url, link=url)


def save_to_pdf(directory_path):
    """Create one PDF next to every .html file found under directory_path.

    directory_path: directory that is walked recursively. For a file
        ``name.html`` the output is ``name.pdf`` in the same folder.

    Missing parts (title, date, article section, paragraphs) are reported
    with a printed message and the rest of the PDF is still produced.
    """
    for root, _dirs, files in os.walk(directory_path):
        for file_name in files:
            if not file_name.endswith(".html"):
                continue
            file_path = os.path.join(root, file_name)
            file_content = _spans_to_bold(read_text_from_file(file_path))

            # Create the PDF document and register the Kanit font family.
            pdf = PDF()
            pdf.add_page()
            pdf.add_font("Kanit", fname="fonts/Kanit-Regular.ttf")
            pdf.add_font("Kanit", style="B", fname="fonts/Kanit-Bold.ttf")
            pdf.add_font("Kanit", style="I", fname="fonts/Kanit-Italic.ttf")
            pdf.add_font("Kanit", style="BI", fname="fonts/Kanit-BoldItalic.ttf")
            # Initial font; every section below selects its own size/style.
            pdf.set_font("Kanit", size=24)

            _write_title(pdf, file_content, file_path)
            _write_date(pdf, file_content, file_path)
            _write_body(pdf, file_content, file_path)
            _write_source_link(pdf, file_name)

            # splitext only strips the final extension, so dots elsewhere in
            # the path do not truncate the output name.
            pdf.output(os.path.splitext(file_path)[0] + '.pdf')


def merge_pdf_files(directory_path):
    """Merge the PDF files of directory_path into ``articles.pdf``.

    directory_path: directory whose top-level .pdf files are merged (sub-
        directories are not visited). Files are appended in sorted name
        order and the result is written into the same directory. An existing
        ``articles.pdf`` is skipped so that re-running the script does not
        merge the previous output into the new one.
    """
    merger = PdfFileMerger()
    try:
        for root, _dirs, files in os.walk(directory_path):
            for file_name in sorted(files):
                if not file_name.endswith(".pdf") or file_name == MERGED_PDF_NAME:
                    continue
                print("PDF: ", file_name)
                merger.append(os.path.join(root, file_name))
            merger.write(os.path.join(root, MERGED_PDF_NAME))
            # Only the top-level directory is merged.
            break
    finally:
        merger.close()


if __name__ == "__main__":
    save_to_pdf("c:\\Folder5\\")
    merge_pdf_files("c:\\Folder5\\")
That's all folks.
If you like my code, then do me a favor: translate your website into Romanian, "ro".
Also, there is a VERSION 2 of this code or VERSION 3 or VERSION 4 or VERSION 5 or VERSION 6