您可以在此处查看完整代码:HTTPS://帕萨特斌.com/我num5q公5
以下是将由Python代码修改的HTML页面的代码示例。请将下面的文本复制到一个.html文件中,并将其保存到 C:\Folder1 文件夹。
xmlns= "http://www.w3.org/1999/xhtml" dir= "ltr" lang= "ro" >
如何使用Python和Regex创建批处理处理器来替换HTML标记(解析)| Neculai Fantanaru.
rel= "canonical" href= "https://MY-WEBSITE.COM" />
name= "description" content= "I LOVE HTML and CSS" />
name= "keywords" content= "abordarea frontala a lucrurilor neelucidate" />
name= "abstract" content= "My laptop works just fine" />
name= "Subject" content= "I think I need a new car." />
property= "og:url" content= "https://otherwebsite.com" />
property= "og:title" content= "Nobody is here?" />
property= "og:description" content= "Dance is my passion." />
下面的Python代码将通过解析数据,把HTML标记的内容复制到其他标记中。您只需要填写源标记(title、canonical 链接和 meta description)。
import requests
import re
# NOTE(review): this listing carries no indentation, so the nesting of the
# for/if/with/try blocks below cannot be read from the text itself; it must be
# restored before the script can run.
# Path to english folder 1
english_folder1 = r"c:\Folder1"
# Path to english folder 2
# (currently the same path as folder 1, so each file is opened twice below:
# once as `html` and once as `en_html`)
english_folder2 = r"c:\Folder1"
# Only file names ending with this suffix are processed.
extension_file = ".html"
use_parse_folder = True # True: write results into a new 'parsed' subfolder; False: write 'parsed_'-prefixed files into the same folder
import os
# os.listdir() is given a bytes path, so the names it yields are decoded back
# to str with os.fsdecode() inside the loop.
en1_directory = os. fsencode(english_folder1)
# NOTE(review): en2_directory is not referenced again in the visible script.
en2_directory = os. fsencode(english_folder2)
print ('Going through english folder' )
for file in os. listdir(en1_directory):
filename = os. fsdecode(file)
print (filename)
# These two file names are skipped.
if filename == 'y_key_e479323ce281e459.html' or filename == 'TS_4fg4_tr78.html' :
continue
if filename. endswith(extension_file):
# Read the file from folder 1; `html` is rebound from the file object to its text.
with open (os. path. join(english_folder1, filename), encoding= 'utf-8' ) as html:
html = html. read()
try :
# Read the same-named file from folder 2; a missing file is caught by the
# `except FileNotFoundError` further down, which moves on to the next file.
with open (os. path. join(english_folder2, filename), encoding= 'utf-8' ) as en_html:
en_html = en_html. read()
# NOTE(review): the three patterns below are just '.+' (with DOTALL that matches
# the whole text); the start/end markers they were meant to contain seem to be
# missing, as do the markers in the trailing comment on the next line.
if False : # disabled; presumably set to True to also copy the comment-delimited body sections below - TODO confirm
try :
comment_body = re. search('.+' , html, flags= re. DOTALL)[0 ]
en_html = re. sub('.+' , comment_body, en_html, flags= re. DOTALL)
except :
pass
try :
comment_body2 = re. search('.+' , html, flags= re. DOTALL)[0 ]
en_html = re. sub('.+' , comment_body2, en_html, flags= re. DOTALL)
except :
pass
try :
comment_body3 = re. search('.+' , html, flags= re. DOTALL)[0 ]
en_html = re. sub('.+' , comment_body3, en_html, flags= re. DOTALL)
except :
pass
# title to meta
# NOTE(review): the pattern below is the page title as plain text; presumably the
# surrounding tag markup was lost from the literal - confirm against the original.
# If this lookup fails, the bare `except` hides it and `title_content` keeps
# whatever value it had before (or stays undefined); the later blocks that use it
# then fail silently inside their own `except: pass`.
try :
title = re. search('如何使用Python和Regex创建批处理处理器来替换HTML标记(解析)| Neculai Fantanaru. ' , html)[0 ]
title_content = re. search('>(.+)<' , title)[1 ]
except :
pass
# Each of the next four blocks finds one tag in en_html, rewrites its
# content="..." attribute to the title text, and swaps the tag back in.
# NOTE(review): every search pattern here is a single space, so the tag-matching
# regex appears to have been lost.
try :
meta_og_title = re. search(' ' , en_html)[0 ]
new_meta_og_title = re. sub(r'content=".+"' , f'content="{title_content}"' , meta_og_title)
en_html = en_html. replace(meta_og_title, new_meta_og_title)
except :
pass
try :
meta_keywords = re. search(' ' , en_html)[0 ]
new_meta_keywords = re. sub(r'content=".+"' , f'content="{title_content}"' , meta_keywords)
en_html = en_html. replace(meta_keywords, new_meta_keywords)
except :
pass
try :
meta_abstract = re. search(' ' , en_html)[0 ]
new_meta_abstract = re. sub(r'content=".+"' , f'content="{title_content}"' , meta_abstract)
en_html = en_html. replace(meta_abstract, new_meta_abstract)
except :
pass
try :
meta_Subject = re. search(' ' , en_html)[0 ]
new_meta_Subject = re. sub(r'content=".+"' , f'content="{title_content}"' , meta_Subject)
en_html = en_html. replace(meta_Subject, new_meta_Subject)
except :
pass
# Rewrite the `"headline":` entry: the match runs from the key to the end of its
# line, and everything from the first colon on is replaced with the title text.
try :
headline = re. search('"headline":.+' , en_html)[0 ]
new_headline = re. sub(r':.+' , f': "{title_content}",' , headline)
en_html = en_html. replace(headline, new_headline)
except :
pass
# NOTE(review): the search call below is cut off (unterminated string literal,
# no closing parenthesis); the original pattern must be restored.
try :
keywords = re. search('"keywords": "如何使用Python和Regex创建批处理处理器来替换HTML标记(解析)| Neculai Fantanaru.",
new_keywords = re. sub(r':.+' , f': "{title_content}",' , keywords)
en_html = en_html. replace(keywords, new_keywords)
except :
pass
# canonical to meta og:url and @id
# NOTE(review): group 1 is requested but the pattern is a single space with no
# capture group, so the original regex appears to have been lost.
try :
canonical_content = re. search(' ' , html)[1 ]
except :
pass
try :
og_url = re. search(' ' , en_html)[0 ]
new_og_url = re. sub(r'content=".+"' , f'content="{canonical_content}"' , og_url)
en_html = en_html. replace(og_url, new_og_url)
except :
pass
# Rewrite the `"@id":` entry with the canonical value.
# Note: the local name `id` shadows the builtin of the same name.
try :
id = re. search('"@id":.+' , en_html)[0 ]
new_id = re. sub(r':.+' , f': "{canonical_content}"' , id )
en_html = en_html. replace(id , new_id)
except :
pass
# meta description to og:description and description
# NOTE(review): both lines below are cut off (unterminated string literals);
# `meta` and `meta_description` are used further down, so these must be restored.
try :
meta = re. search(' ]
meta_description = re. search(' ]
except :
pass
try :
og_description = re. search(' ' , en_html)[0 ]
new_og_description = re. sub(r'content=".+"' , f'content="{meta_description}"' , og_description)
en_html = en_html. replace(og_description, new_og_description)
except :
pass
# NOTE(review): the search call below is cut off (unterminated string literal).
try :
description = re. search('"description": "如何使用Python和Regex创建批处理处理器来替换HTML标记(解析)| Neculai Fantanaru.",
new_description = re. sub(r':.+' , f': "{meta_description}",' , description)
en_html = en_html. replace(description, new_description)
except :
pass
# NOTE(review): the pattern argument below is cut off (unterminated string literal).
try :
en_html = re. sub(' , meta, en_html)
except :
pass
# Substitute the `title` text found in `html` into en_html.
try :
en_html = re. sub('如何使用Python和Regex创建批处理处理器来替换HTML标记(解析)| Neculai Fantanaru. ' , title, en_html)
except :
pass
except FileNotFoundError:
continue
print (f'{filename} parsed' )
if use_parse_folder:
# Write to <folder 2>\parsed\parsed_<filename>. Any failure of the first
# attempt is handled by creating the 'parsed' folder and writing again.
try :
with open (os. path. join(english_folder2+ r'\parsed' , 'parsed_' + filename), 'w' , encoding= 'utf-8' ) as new_html:
new_html. write(en_html)
except :
os. mkdir(english_folder2+ r'\parsed' )
with open (os. path. join(english_folder2+ r'\parsed' , 'parsed_' + filename), 'w' , encoding= 'utf-8' ) as new_html:
new_html. write(en_html)
else :
# Write parsed_<filename> next to the input files in folder 2.
with open (os. path. join(english_folder2, 'parsed_' + filename), 'w' , encoding= 'utf-8' ) as html:
html. write(en_html)
可选的。这是一个正则表达式,它会修改HTML页面中的“keywords”标记,在每个单词后添加逗号。
使用Notepad++ -> Ctrl + F -> 勾选:正则表达式
SEARCH: (?s)<title>.*?<\/title>.*?<meta\x20name="keywords"\x20content="\K(\w+)|\G[^\w\r\n]+(\w+)
REPLACE BY: ?1\l\1:,\x20\l\2
您也可以尝试这个更复杂的代码版本,它做得更多:从 title 标记中检索数据,去掉连接词后将其复制到 keywords 标记。您可以在此处查看代码:
HTTPS://帕萨特斌.com/JM5政府2QS
import requests
import re
# Path to the working folder. The same folder is used for listing the files,
# for both reads inside the loop, and for the output path.
english_folder2 = r"c:\Folder1"
# Only file names ending with this suffix are processed.
extension_file = ".html"
# Selects which of the two output branches at the end of the loop is used.
use_parse_folder = True
import os
# Both directory handles are built from the same folder; os.listdir() is given
# the bytes form, so the names it yields are decoded with os.fsdecode() later.
en1_directory = os. fsencode(english_folder2)
# NOTE(review): en2_directory is not referenced again in the visible script.
en2_directory = os. fsencode(english_folder2)
# Connection (stop) words that are ignored when building keywords from a title.
LISTA_CUVINTE_LEGATURA = [
    'in', 'la', 'unei', 'si', 'sa', 'se', 'de', 'prin', 'unde', 'care', 'a',
    'al', 'prea', 'lui', 'din', 'ai', 'unui', 'acei', 'un', 'doar', 'tine',
    'ale', 'sau', 'dintre', 'intre', 'cu', 'ce', 'va', 'fi', 'este', 'cand', 'o',
    'cine', 'aceasta', 'ca', 'dar', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII',
    'to', 'was', 'your', 'you', 'is', 'are', 'iar', 'fara', 'aceasta', 'pe', 'tu',
    'nu', 'mai', 'ne', 'le', 'intr', 'cum', 'e', 'for', 'she', 'it', 'esti',
    'this', 'that', 'how', 'can', 't', 'must', 'be', 'the', 'and', 'do', 'so', 'or', 'ori',
    'who', 'what', 'if', 'of', 'on', 'i', 'we', 'they', 'them', 'but', 'where', 'by', 'an',
    'on', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', 'made', 'make', 'my', 'me', '-',
    'vom', 'voi', 'ei', 'cat', 'ar', 'putea', 'poti', 'sunteti', 'inca', 'still', 'noi', 'l',
    'ma', 's', 'dupa', 'after', 'under', 'sub', 'niste', 'some', 'those', 'he',
]

# Lower-cased lookup set, built once so each membership test is O(1) and so
# that title-cased words ("The", "Of", "And") are recognised as stop words.
_CUVINTE_LEGATURA_LOWER = frozenset(
    cuvant.lower() for cuvant in LISTA_CUVINTE_LEGATURA
)


def creeaza_lista_keywords(titlu: str) -> str:
    """Build a comma-separated keyword string from a page title.

    Only the part of the title before the first vertical bar ``|`` is used.
    Words are extracted with a regex, lower-cased, and dropped when they
    appear (case-insensitively) in ``LISTA_CUVINTE_LEGATURA``.

    Args:
        titlu: The title text, e.g. ``"Rhythm Of My Heart | Author"``.

    Returns:
        The remaining words in their original order, lower-cased and joined
        with ``", "``. An empty string when nothing is left.
    """
    # Split the title on the vertical bar and keep the first part only.
    prima_parte_titlu = titlu.split('|')[0]
    # Extract the words; a run of hyphens followed by '!' may be part of a word.
    cuvinte = re.findall(r'(?:\w|-*\!)+', prima_parte_titlu)
    # Lower-case before the stop-word check so the filter also catches
    # capitalised words, then keep everything that is not a connection word.
    cuvinte_mici = (cuvant.lower() for cuvant in cuvinte)
    return ", ".join(
        cuvant for cuvant in cuvinte_mici
        if cuvant not in _CUVINTE_LEGATURA_LOWER
    )
# NOTE(review): this listing carries no indentation, so the nesting of the
# for/if/with/try blocks below cannot be read from the text itself; it must be
# restored before the script can run.
print ('Going through english folder' )
# Counter shown in the per-file progress message; it starts at 1 and is
# incremented after each message.
amount = 1
for file in os. listdir(en1_directory):
filename = os. fsdecode(file )
print (filename)
# These two file names are skipped.
if filename == 'y_key_e479323ce281e459.html' or filename == 'directory.html' :
continue
if filename. endswith(extension_file):
# The same file is read twice: once as the source text (`html`) and once as the
# text that gets modified (`en_html`).
with open (os. path. join(english_folder2, filename), encoding= 'utf-8' ) as html:
html = html. read()
try :
with open (os. path. join(english_folder2, filename), encoding= 'utf-8' ) as en_html:
en_html = en_html. read()
# title to meta
# NOTE(review): the pattern below is the page title as plain text; presumably the
# surrounding tag markup was lost from the literal - confirm against the original.
# If this lookup fails, the bare `except` hides it and `title_content` keeps
# whatever value it had before (or stays undefined); the later blocks that use it
# then fail silently inside their own `except: pass`.
try :
title = re. search('如何使用Python和Regex创建批处理处理器来替换HTML标记(解析)| Neculai Fantanaru. ' , html)[0 ]
title_content = re. search('>(.+)<' , title)[1 ]
except :
pass
# NOTE(review): the search patterns in the next four blocks are a single space,
# so the tag-matching regex appears to have been lost.
try :
meta_og_title = re. search(' ' , en_html)[0 ]
new_meta_og_title = re. sub(r'content=".+"' , f'content="{title_content}"' , meta_og_title)
en_html = en_html. replace(meta_og_title, new_meta_og_title)
except :
pass
# Here the content attribute is filled with the keyword string produced by
# creeaza_lista_keywords() from the title text, not with the title itself.
try :
meta_keywords = re. search(' ' , en_html)[0 ]
keywords = creeaza_lista_keywords(title_content)
new_meta_keywords = re. sub(r'content=".+"' , f'content="{keywords}"' , meta_keywords)
en_html = en_html. replace(meta_keywords, new_meta_keywords)
except :
pass
try :
meta_abstract = re. search(' ' , en_html)[0 ]
new_meta_abstract = re. sub(r'content=".+"' , f'content="{title_content}"' , meta_abstract)
en_html = en_html. replace(meta_abstract, new_meta_abstract)
except :
pass
try :
meta_Subject = re. search(' ' , en_html)[0 ]
new_meta_Subject = re. sub(r'content=".+"' , f'content="{title_content}"' , meta_Subject)
en_html = en_html. replace(meta_Subject, new_meta_Subject)
except :
pass
# Rewrite the `"headline":` entry: the match runs from the key to the end of its
# line, and everything from the first colon on is replaced with the title text.
try :
headline = re. search('"headline":.+' , en_html)[0 ]
new_headline = re. sub(r':.+' , f': "{title_content}",' , headline)
en_html = en_html. replace(headline, new_headline)
except :
pass
# NOTE(review): the search call below is cut off (unterminated string literal,
# no closing parenthesis). It also rebinds `keywords`, which held the generated
# keyword string above, and the replacement uses title_content.
try :
keywords = re. search('"keywords": "如何使用Python和Regex创建批处理处理器来替换HTML标记(解析)| Neculai Fantanaru.",
new_keywords = re. sub(r':.+' , f': "{title_content}",' , keywords)
en_html = en_html. replace(keywords, new_keywords)
except :
pass
# canonical to meta og:url and @id
# NOTE(review): group 1 is requested but the pattern is a single space with no
# capture group, so the original regex appears to have been lost.
try :
canonical_content = re. search(' ' , html)[1 ]
except :
pass
try :
og_url = re. search(' ' , en_html)[0 ]
new_og_url = re. sub(r'content=".+"' , f'content="{canonical_content}"' , og_url)
en_html = en_html. replace(og_url, new_og_url)
except :
pass
# Rewrite the `"@id":` entry with the canonical value.
# Note: the local name `id` shadows the builtin of the same name.
try :
id = re. search('"@id":.+' , en_html)[0 ]
new_id = re. sub(r':.+' , f': "{canonical_content}"' , id )
en_html = en_html. replace(id , new_id)
except :
pass
# meta description to og:description and description
# NOTE(review): both lines below are cut off (unterminated string literals);
# `meta` and `meta_description` are used further down, so these must be restored.
try :
meta = re. search(' ]
meta_description = re. search(' ]
except :
pass
try :
og_description = re. search(' ' , en_html)[0 ]
new_og_description = re. sub(r'content=".+"' , f'content="{meta_description}"' , og_description)
en_html = en_html. replace(og_description, new_og_description)
except :
pass
# NOTE(review): the search call below is cut off (unterminated string literal).
try :
description = re. search('"description": "如何使用Python和Regex创建批处理处理器来替换HTML标记(解析)| Neculai Fantanaru.",
new_description = re. sub(r':.+' , f': "{meta_description}",' , description)
en_html = en_html. replace(description, new_description)
except :
pass
# NOTE(review): the pattern argument below is cut off (unterminated string literal).
try :
en_html = re. sub(' , meta, en_html)
except :
pass
# Substitute the `title` text found in `html` into en_html.
try :
en_html = re. sub('如何使用Python和Regex创建批处理处理器来替换HTML标记(解析)| Neculai Fantanaru. ' , title, en_html)
except :
pass
except FileNotFoundError:
continue
print (f'{filename} parsed ({amount})' )
amount += 1
if use_parse_folder:
# NOTE(review): the folder suffix r'' and the file prefix '' are empty, so the
# output path equals the input path and the file is overwritten in place; the
# fallback os.mkdir() would then target the existing folder - confirm intent.
try :
with open (os. path. join(english_folder2+ r'' , '' + filename), 'w' , encoding= 'utf-8' ) as new_html:
new_html. write(en_html)
except :
os. mkdir(english_folder2+ r'' )
with open (os. path. join(english_folder2+ r'' , '' + filename), 'w' , encoding= 'utf-8' ) as new_html:
new_html. write(en_html)
else :
# Write parsed_<filename> next to the input files.
with open (os. path. join(english_folder2, 'parsed_' + filename), 'w' , encoding= 'utf-8' ) as html:
html. write(en_html)
That's all folks.
If you like my code, please SHARE IT
您还可以查看该代码的PowerShell版本,或其他Python版本:版本3、版本4或版本5。
Latest articles accessed by readers:
An Eye To See And A Mind To Understand
Turn Towards Me With An Eye Full Of Your Own Gaze
The Snapshot Of Magic In God's Universe
Rhythm Of My Heart
* Note: If you want to read all my articles in real time, please check the Romanian version!