<title>Blah Blah Blah</title><meta name="description" content="Blah Blah Blah."><h3 class="font-weight-normal">TITLE OF THE ARTICLE</h3><p>Lorem ipsum dolor sit amet, consectetur adipiscing elit,
sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris
nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in
reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla
pariatur. Excepteur sint occaecat cupidatat non proident, sunt in
culpa qui officia deserunt mollit anim id est laborum.</p>
4. 您需要将所有文本文件和 OANA.html 文件放入主文件夹中。
WHAT DOES THE CODE DO?:
1.从每个文本文件中检索前10个单词,将该文件另存为10个单词的HTML链接。
2.从每个文本文件中检索前10个单词,然后将它们复制到 <title> 标记和 <h3> 标记中。
3.从每个文本文件中检索前20个单词,然后将它们复制到 <meta name="description"> 标记中。
4.将文本文件的整个内容复制到 HTML 文件中 ARTICOL START 与 ARTICOL FINAL 标记之间的文章部分。
#-------------------------------------------------------------------------------
# Name:        Create html files from text files
# Purpose:
#
# Author:      Neculai Fantanaru
#
# Created:     22/01/2022
# Copyright:   (c) Neculai Fantanaru 2022
#-------------------------------------------------------------------------------
import os
import random
import re

import nltk
import unidecode
from nltk import tokenize

# nltk.download('punkt')
# Base URL that is prepended to every file name read from the links file.
SITE ='https://neculaifantanaru.com/'
# Connector words (Romanian and English), digits, roman numerals and stray
# separators that are skipped when a link's file name is split into words.
# NOTE(review): a few entries are repeated ('made', 'cat', 'ne', 'haven',
# 'alte'); this is harmless for the membership test the list is used for.
LISTA_CUVINTE_LEGATURA = [
'in', 'la', 'unei', 'si', 'sa', 'se', 'de', 'prin', 'unde', 'care', 'a',
'al', 'prea', 'lui', 'din', 'ai', 'unui', 'acei', 'un', 'doar', 'tine',
'ale', 'sau', 'dintre', 'intre', 'cu', 'ce', 'va', 'fi', 'este', 'cand', 'o',
'cine', 'aceasta', 'ca', 'dar', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII',
'to', 'was', 'your', 'you', 'is', 'are', 'iar', 'fara', 'asta', 'pe', 'tu',
'nu', 'mai', 'ne', 'le', 'intr', 'cum', 'e', 'for', 'she', 'it', 'esti',
'this', 'that', 'how', 'can', 't', 'must', 'be', 'the', 'and', 'do', 'so', 'or', 'ori',
'who', 'what', 'if', 'of', 'on', 'i', 'we', 'they', 'them', 'but', 'where', 'by', 'an',
'mi', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', 'made', 'my', 'me', '-',
'vom', 'voi', 'ei', 'cat', 'ar', 'putea', 'poti', 'sunteti', 'inca', 'still', 'noi', 'l',
'ma', 's', 'dupa', 'after', 'under', 'sub', 'niste', 'some', 'those', 'he', 'no', 'too',
'fac', 'made', 'make', 'cei', 'most', 'face', 'pentru', 'cat', 'cate', 'much', 'more', 'many',
'sale', 'tale', 'tau', 'has', 'sunt', 'his', 'yours', 'only', 'as', 'toate', 'all', 'tot', 'incat',
'which', 'ti', 'asa', 'like', 'these', 'because', 'unor', 'caci', 'ele', 'have', 'haven', 'te',
'cea', 'else', 'imi', 'iti', 'should', 'could', 'not', 'even', 'chiar', 'when', 'ci', 'ne', 'ni',
'her', 'our', 'alta', 'another', 'other', 'decat', 'acelasi', 'same', 'au', 'had', 'haven', 'hasn',
'alte', 'alt', 'others', 'ceea', 'cel', 'cele', 'alte', 'despre', 'about', 'acele', 'acel', 'acea',
'decit', 'with', '_', 'fata', 'towards', 'against', 'cind', 'dinspre', 'fost', 'been', 'era'
]
# HTML template for an inserted link: the first placeholder receives the URL,
# the second one the anchor text.
# NOTE(review): the markup around the placeholders was missing from the copy of
# this file under review; the <a href=...>...</a> wrapper is reconstructed from
# the surviving `target="_new">` fragment - confirm against the site's markup.
PATTERN_LINK = "<a href=\"{}\" target=\"_new\">{}</a>"

# Shape of the word dictionary built by preia_cuvinte_lista_linkuri:
# {
#     "cuvantul1": [lista_linkuri1],
#     "cuvantul2": [lista_linkuri2],
# }

# File that lists one page file name per line.
CALE_FISIER_LINKURI = "C:\\Folder1\\LINKS\\links.txt"


def preia_cuvinte_link(link, cuvinte_legatura=None):
    """Return the meaningful words contained in a link's file name.

    The part of *link* before the first dot is split on '-' and every piece
    that is empty or is a connector word is dropped.

    link: a file name such as "cum-sa-fii-lider.html"; surrounding whitespace
        (for example the newline kept when reading a file line by line) is
        ignored.
    cuvinte_legatura: optional collection of words to skip. When omitted the
        module-level LISTA_CUVINTE_LEGATURA is used.

    Returns the remaining words as a list, in their original order.
    """
    if cuvinte_legatura is None:
        cuvinte_legatura = LISTA_CUVINTE_LEGATURA
    # [0] keeps only what precedes the first dot, i.e. the name without extension.
    nume = link.strip().split('.')[0]
    # `if cuv` drops the empty pieces produced by blank lines or by "a--b".
    return [cuv for cuv in nume.split('-') if cuv and cuv not in cuvinte_legatura]


def preia_cuvinte_lista_linkuri(cale_fisier_linkuri, site=None, cuvinte_legatura=None):
    """Index the links listed in a file by the words of their file names.

    cale_fisier_linkuri: path of a UTF-8 text file with one file name per line;
        blank lines are ignored.
    site: optional base URL prepended to every file name. When omitted the
        module-level SITE is used.
    cuvinte_legatura: optional collection of words to skip, forwarded to
        preia_cuvinte_link.

    Returns a tuple (lista_cuvinte, dictionar_cuvinte):
        lista_cuvinte: every distinct word, in order of first appearance;
        dictionar_cuvinte: maps each word to the list of full URLs whose file
            name contains it (no URL is repeated inside a list).
    """
    if site is None:
        site = SITE
    dictionar_cuvinte_linkuri = {}
    with open(cale_fisier_linkuri, encoding='utf8') as fp:
        for line in fp:
            link = line.strip()
            if not link:
                continue
            url = site + link
            for cuv in preia_cuvinte_link(link, cuvinte_legatura):
                linkuri = dictionar_cuvinte_linkuri.setdefault(cuv, [])
                if url not in linkuri:
                    linkuri.append(url)
    # The dictionary keys are already the distinct words, in insertion order.
    return list(dictionar_cuvinte_linkuri), dictionar_cuvinte_linkuri
def citeste_fisier_linie_cu_linie(cale_fisier):
    """Print every line of a UTF-8 text file, prefixed with its 0-based index.

    cale_fisier: path of the file to print.
    """
    with open(cale_fisier, encoding='utf8') as fp:
        for count, line in enumerate(fp):
            print(count, line.strip())


def read_text_from_file(file_path):
    """Return the whole content of a UTF-8 text file as a single string.

    file_path: path of the file to read.
    """
    with open(file_path, encoding='utf8') as f:
        return f.read()
def write_to_file(text, file_path):
    """Write *text* to *file_path* as UTF-8, replacing any existing file.

    text: the string to write; characters that cannot be encoded are dropped.
    file_path: destination path. Missing parent folders are created first, so
        writing into a not-yet-existing output folder does not fail.
    """
    folder = os.path.dirname(file_path)
    if folder:
        os.makedirs(folder, exist_ok=True)
    with open(file_path, 'wb') as f:
        f.write(text.encode('utf8', 'ignore'))
def _insereaza_link(text_tag, dictionar_linkuri):
    """Turn one randomly chosen known word of *text_tag* into a link.

    text_tag: the plain text of one paragraph.
    dictionar_linkuri: maps a word to the list of URLs it may link to.

    Returns (text, inserted): the possibly modified text and True when a link
    was inserted, False when the paragraph contains no word from the dictionary.
    """
    # Each match is " word " or " word,": one leading space, the word, one delimiter.
    candidati = sorted({
        cuv for cuv in re.findall(r' (?:\w|-*\!)+[ ,]', text_tag)
        if cuv[1:-1] in dictionar_linkuri
    })
    if not candidati:
        return text_tag, False
    cuvant_random = random.choice(candidati)
    cuvant_fara_semne = cuvant_random[1:-1]
    link_random = random.choice(dictionar_linkuri[cuvant_fara_semne])
    # Replace the whole delimited token so a word is never matched inside a
    # longer word; the surrounding space/comma stays outside the link.
    inlocuitor = (cuvant_random[0]
                  + PATTERN_LINK.format(link_random, cuvant_fara_semne)
                  + cuvant_random[-1])
    return text_tag.replace(cuvant_random, inlocuitor, 1), True


def split_propozitii(text):
    """Split *text* into sentences and wrap them in HTML paragraphs.

    Sentences are extracted with nltk, tidied, and grouped seven at a time;
    every group becomes one paragraph, and the sentences left over after the
    last full group form a final paragraph. In a single paragraph that is
    neither the first nor the last full group, one word that also occurs in a
    file name listed in CALE_FISIER_LINKURI is turned into a link.

    text: the plain text of the article.

    Returns the paragraphs as one string, each preceded by a newline; the
    string is empty when the text contains no sentence.
    """
    propozitii = []
    for prop in tokenize.sent_tokenize(text):
        prop = prop.strip()
        if not prop:
            continue
        # Upper-case the first letter only; str.capitalize() would also
        # lower-case the rest of the sentence, including proper nouns.
        prop = prop[0].upper() + prop[1:]
        # Drop spaces before the closing punctuation: "ana are mere ?" => "Ana are mere?"
        propozitii.append(prop[:-1].strip() + prop[-1])

    # NOTE(review): the paragraph markup was missing from the copy of this file
    # under review; <p class="mb-40px"> is reconstructed from the surviving
    # `"mb-40px">{}` fragment - confirm against the site's stylesheet.
    tag = "<p class=\"mb-40px\">{}</p>"

    numar_propozitii_grup = 7
    numar_grupuri = len(propozitii) // numar_propozitii_grup
    paragrafe = []
    link_introdus = False
    dictionar_linkuri = None  # read from disk only if a link may be inserted

    for numar_grup in range(numar_grupuri):
        start = numar_grup * numar_propozitii_grup
        text_tag = " ".join(propozitii[start:start + numar_propozitii_grup])
        este_grup_interior = 0 < numar_grup < numar_grupuri - 1
        if este_grup_interior and not link_introdus:
            if dictionar_linkuri is None:
                _, dictionar_linkuri = preia_cuvinte_lista_linkuri(CALE_FISIER_LINKURI)
            text_tag, link_introdus = _insereaza_link(text_tag, dictionar_linkuri)
        paragrafe.append(tag.format(text_tag))

    # Sentences that did not fill a whole group; skipped when there are none so
    # that no empty paragraph is produced.
    rest = propozitii[numar_grupuri * numar_propozitii_grup:]
    if rest:
        paragrafe.append(tag.format(" ".join(rest)))

    return "".join('\n' + paragraf for paragraf in paragrafe)
def _inlocuieste_primul_grup(pattern, text, inlocuitor):
    """Replace capture group 1 of the first match of *pattern* in *text*.

    Returns (new_text, found). The text is spliced at the match position, so
    only the matched tag changes even when the captured text also occurs
    elsewhere in the page or is empty.
    """
    potrivire = re.search(pattern, text)
    if potrivire is None:
        return text, False
    inceput, sfarsit = potrivire.span(1)
    return text[:inceput] + inlocuitor + text[sfarsit:], True


def copiaza_continut_txt_html(cale_fisier_txt, cale_fisier_html):
    """Create one HTML page from a text file and an HTML template, then save it.

    The first 10 words of the first non-empty line of the text file become the
    page title, the <h3> heading and (joined with '-', lower-cased and passed
    through unidecode) the output file name; the first 20 words become the meta
    description; the whole text, turned into paragraphs by split_propozitii,
    replaces the article section of the template. The page is written to the
    "fisiere_html" folder located next to the text file.

    cale_fisier_txt: path of the source text file.
    cale_fisier_html: path of the HTML template.

    Returns None. A message is printed for every template tag that is missing;
    nothing is written when the first non-empty line contains no word.

    NOTE(review): the tag literals inside the regular expressions below were
    missing from the copy of this file under review. They are reconstructed
    from the template sample and from the printed messages - confirm each of
    them against the real template before relying on this function.
    """
    text_txt = read_text_from_file(cale_fisier_txt)
    ok_lines = [line for line in text_txt.splitlines() if line not in ('', '\ufeff')]

    cuvinte_prima_linie = re.findall(r'(?:\w|-*\!)+', ok_lines[0]) if ok_lines else []
    if not cuvinte_prima_linie:
        print("Fisier txt fara cuvinte pe prima linie: ", cale_fisier_txt)
        return

    # 02.02.2022: the title is made of the first 10 words of the text.
    # Punctuation inside a word is replaced by '-' so it is safe in a file name.
    title_words = [re.sub(r'[.,?!:;"]', '-', word) for word in cuvinte_prima_linie[:10]]
    description_words = ' '.join(cuvinte_prima_linie[:20])
    text_titlu = ' '.join(title_words)
    new_file_name_fara_spatiu = unidecode.unidecode('-'.join(title_words).lower()) + '.html'

    text_html = read_text_from_file(cale_fisier_html)

    articol_pattern = re.compile(r'<!-- ARTICOL START -->([\s\S]*?)<!-- ARTICOL FINAL -->')
    if articol_pattern.search(text_html) is None:
        print("Fisier html fara ARTICOL START/FINAL.")
    else:
        continut = '\n\n' + split_propozitii(text_txt) + '\n\n'
        text_html, _ = _inlocuieste_primul_grup(articol_pattern, text_html, continut)

    text_html, gasit = _inlocuieste_primul_grup(
        r'<title>(.*?)</title>', text_html, text_titlu)
    if not gasit:
        print("Fisier html fara titlu.")

    # 01.02.2022: the h3 heading receives the same text as the title.
    text_html, gasit = _inlocuieste_primul_grup(
        r'<h3 class="font-weight-normal"[^>]*>(.*?)</h3>', text_html, text_titlu)
    if not gasit:
        print("Fisierul nu are tag-ul h3.")

    # 07.02.2022: the canonical tag points to the newly created page.
    # NOTE(review): this base URL differs from the module-level SITE constant;
    # confirm which domain the generated pages are published on.
    text_html, gasit = _inlocuieste_primul_grup(
        r'<link rel="canonical" href="(.*?)"', text_html,
        "https://trinketbox.ro/" + new_file_name_fara_spatiu)
    if not gasit:
        print("Fisier fara tag canonical")

    text_html, gasit = _inlocuieste_primul_grup(
        r'<meta name="description" content="(.*?)"', text_html, description_words)
    if not gasit:
        print("Fisier html fara description.")

    folder_html = os.path.join(os.path.dirname(cale_fisier_txt), "fisiere_html")
    os.makedirs(folder_html, exist_ok=True)
    write_to_file(text_html, os.path.join(folder_html, new_file_name_fara_spatiu))
    print("Scriere efectuata cu succes.")
def creare_fisiere_html(cale_folder_txt, cale_fisier_html):
    """Create an HTML page for every .txt file found directly in a folder.

    cale_folder_txt: folder that contains the text files; entries whose name
        does not end in '.txt' are ignored and sub-folders are not visited.
    cale_fisier_html: path of the HTML template handed to
        copiaza_continut_txt_html for each text file.

    Prints the number of text files that were processed.
    """
    count = 0
    # sorted() makes the processing order the same on every run.
    for f in sorted(os.listdir(cale_folder_txt)):
        if not f.endswith('.txt'):
            continue
        copiaza_continut_txt_html(os.path.join(cale_folder_txt, f), cale_fisier_html)
        count += 1
    print("Numarul de fisiere modificate: ", count)
def main():
    """Generate the HTML pages for the text files of the hard-coded work folder."""
    creare_fisiere_html("C:\\Folder1", "C:\\Folder1\\index.html")
    # Debugging aid: inspect the word index built from the links file.
    # lista_cuvinte, dictionar_cuvinte = preia_cuvinte_lista_linkuri(CALE_FISIER_LINKURI)
    # print(len(lista_cuvinte))
    # print(dictionar_cuvinte)


if __name__ == '__main__':
    main()