如何使用Python和Regex创建批处理处理器来替换HTML标记（解析）

Name: 如何使用Python和Regex创建批处理处理器来替换HTML标记（解析）| Neculai Fantanaru.
Brand: Neculai Fantanaru
SKU: NFL
Availability: OnlineOnly
Rating: 5 (55 reviews)

On June 16, 2021, in Leadership and Attitude, by Neculai Fantanaru

您可以在此处查看完整代码：HTTPS://帕萨特斌.com/我num5q公5

以下是将由Python代码修改的HTML页面的代码示例。请将下面的文本复制到一个.html文件中，并将其保存到文件夹 C:\Folder1

   

<html xmlns="http://www.w3.org/1999/xhtml" dir="ltr" lang="ro">

<title>如何使用Python和Regex创建批处理处理器来替换HTML标记（解析）|  Neculai Fantanaru.</title>
<link rel="canonical" href="https://MY-WEBSITE.COM" />
<meta name="description" content="I LOVE HTML and CSS"/>

<meta name="keywords" content="abordarea frontala a lucrurilor neelucidate"/>
<meta name="abstract" content="My laptop works just fine"/>
<meta name="Subject" content="I think I need a new car."/>
<meta property="og:url" content="https://otherwebsite.com"/>
<meta property="og:title" content="Nobody is here?" />
<meta property="og:description" content="Dance is my passion."/>

下面的PowerShell代码将通过解析数据将HTML标记的内容复制到其他标记。您只需要填写标签四<meta name =“描述”...... />  <div style="background: #ffffff; overflow:auto;width:auto;border:solid gray;border-width:.1em .1em .1em .4em;padding:.2em .6em;"> <pre style="margin: 0; line-height: 125%">import requests import re # Path to english folder 1 english_folder1 = r"c:\Folder1" # Path to english folder 2 english_folder2 = r"c:\Folder1" extension_file = ".html" use_parse_folder = True #Face folder nou daca pui True, iar daca pui False redenumeste fisierele in acelasi folder import os en1_directory = os.fsencode(english_folder1) en2_directory = os.fsencode(english_folder2) print('Going through english folder') for file in os.listdir(en1_directory): filename = os.fsdecode(file) print(filename) if filename == 'y_key_e479323ce281e459.html' or filename == 'TS_4fg4_tr78.html': continue if filename.endswith(extension_file): with open(os.path.join(english_folder1, filename), encoding='utf-8') as html: html = html.read() try: with open(os.path.join(english_folder2, filename), encoding='utf-8') as en_html: en_html = en_html.read() if False: # if True: will Parse also the content that starts from  to  and so on try: comment_body = re.search('.+', html, flags=re.DOTALL)[0] en_html = re.sub('.+', comment_body, en_html, flags=re.DOTALL) except: pass try: comment_body2 = re.search('.+', html, flags=re.DOTALL)[0] en_html = re.sub('.+', comment_body2, en_html, flags=re.DOTALL) except: pass try: comment_body3 = re.search('.+', html, flags=re.DOTALL)[0] en_html = re.sub('.+', comment_body3, en_html, flags=re.DOTALL) except: pass # title to meta try: title = re.search('<title>如何使用Python和Regex创建批处理处理器来替换HTML标记（解析）| Neculai Fantanaru.', html)[0] title_content = re.search('>(.+)<', title)[1] except: pass try: meta_og_title = re.search('', en_html)[0] new_meta_og_title = re.sub(r'content=".+"', f'content="{title_content}"', meta_og_title) en_html = en_html.replace(meta_og_title, new_meta_og_title) except: pass try: meta_keywords = 
re.search('', en_html)[0] new_meta_keywords = re.sub(r'content=".+"', f'content="{title_content}"', meta_keywords) en_html = en_html.replace(meta_keywords, new_meta_keywords) except: pass try: meta_abstract = re.search('', en_html)[0] new_meta_abstract = re.sub(r'content=".+"', f'content="{title_content}"', meta_abstract) en_html = en_html.replace(meta_abstract, new_meta_abstract) except: pass try: meta_Subject = re.search('', en_html)[0] new_meta_Subject = re.sub(r'content=".+"', f'content="{title_content}"', meta_Subject) en_html = en_html.replace(meta_Subject, new_meta_Subject) except: pass try: headline = re.search('"headline":.+', en_html)[0] new_headline = re.sub(r':.+', f': "{title_content}",', headline) en_html = en_html.replace(headline, new_headline) except: pass try: keywords = re.search('"keywords": "如何使用Python和Regex创建批处理处理器来替换HTML标记（解析）| Neculai Fantanaru.", new_keywords = re.sub(r':.+', f': "{title_content}",', keywords) en_html = en_html.replace(keywords, new_keywords) except: pass # canonical to meta og:url and @id try: canonical_content = re.search('', html)[1] except: pass try: og_url = re.search('', en_html)[0] new_og_url = re.sub(r'content=".+"', f'content="{canonical_content}"', og_url) en_html = en_html.replace(og_url, new_og_url) except: pass try: id = re.search('"@id":.+', en_html)[0] new_id = re.sub(r':.+', f': "{canonical_content}"', id) en_html = en_html.replace(id, new_id) except: pass # meta description to og:description and description try: meta = re.search('] meta_description = re.search('] except: pass try: og_description = re.search('', en_html)[0] new_og_description = re.sub(r'content=".+"', f'content="{meta_description}"', og_description) en_html = en_html.replace(og_description, new_og_description) except: pass try: description = re.search('"description": "如何使用Python和Regex创建批处理处理器来替换HTML标记（解析）| Neculai Fantanaru.", new_description = re.sub(r':.+', f': "{meta_description}",', description) en_html = en_html.replace(description, 
new_description) except: pass try: en_html = re.sub(', meta, en_html) except: pass try: en_html = re.sub('如何使用Python和Regex创建批处理处理器来替换HTML标记（解析）| Neculai Fantanaru.', title, en_html) except: pass except FileNotFoundError: continue print(f'{filename} parsed') if use_parse_folder: try: with open(os.path.join(english_folder2+r'\parsed', 'parsed_'+filename), 'w', encoding='utf-8') as new_html: new_html.write(en_html) except: os.mkdir(english_folder2+r'\parsed') with open(os.path.join(english_folder2+r'\parsed', 'parsed_'+filename), 'w', encoding='utf-8') as new_html: new_html.write(en_html) else: with open(os.path.join(english_folder2, 'parsed_'+filename), 'w', encoding='utf-8') as html: html.write(en_html)

可选步骤：下面的正则表达式会修改HTML页面中的“keywords”（关键字）meta标记，在每个单词后添加逗号。

使用Notepad++ -> Ctrl+F -> 勾选：正则表达式

SEARCH: (?s)<title>.*?<\/title>.*?<meta\x20name="keywords"\x20content="\K(\w+)|\G[^\w\r\n]+(\w+)  
REPLACE BY:  ?1\l\1:,\x20\l\2

您可以尝试此复杂版本的代码，这需要更多：从标记中检索数据并将其复制到标记<meta name="keywords" content=" "/> 您可以在此处查看代码： <a href="https://pastebin.com/jM5zf2qS" target="_new">HTTPS://帕萨特斌.com/JM5政府2QS</a> <div style="background: #ffffff; overflow:auto;width:auto;border:solid gray;border-width:.1em .1em .1em .4em;padding:.2em .6em;"> <pre style="margin: 0; line-height: 125%">import requests import re # Path to english folder 1 english_folder2 = r"c:\Folder1" extension_file = ".html" use_parse_folder = True import os en1_directory = os.fsencode(english_folder2) en2_directory = os.fsencode(english_folder2) # These connection words will be ignore when parsing data from <title> tag to <meta keywords> tag LISTA_CUVINTE_LEGATURA = [ 'in', 'la', 'unei', 'si', 'sa', 'se', 'de', 'prin', 'unde', 'care', 'a', 'al', 'prea', 'lui', 'din', 'ai', 'unui', 'acei', 'un', 'doar', 'tine', 'ale', 'sau', 'dintre', 'intre', 'cu','ce', 'va', 'fi', 'este', 'cand', 'o', 'cine', 'aceasta', 'ca', 'dar', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'to', 'was', 'your', 'you', 'is', 'are', 'iar', 'fara', 'aceasta', 'pe', 'tu', 'nu', 'mai', 'ne', 'le', 'intr', 'cum', 'e', 'for', 'she', 'it', 'esti', 'this', 'that', 'how', 'can', 't', 'must', 'be', 'the', 'and', 'do', 'so', 'or', 'ori', 'who', 'what', 'if', 'of', 'on', 'i', 'we', 'they', 'them', 'but', 'where', 'by', 'an', 'on', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', 'made', 'make', 'my', 'me', '-', 'vom', 'voi', 'ei', 'cat', 'ar', 'putea', 'poti', 'sunteti', 'inca', 'still', 'noi', 'l', 'ma', 's', 'dupa', 'after', 'under', 'sub', 'niste', 'some', 'those', 'he' ] def creeaza_lista_keywords(titlu): # imparte titlul in 2 in functie de bara verticala | prima_parte_titlu = titlu.split('|')[0] # extrage toate cuvintele din prima parte a titlului keywords = re.findall(r'(?:\w|-*\!)+', prima_parte_titlu) # extrage keyword-urile care nu se gasesc in lista de cuvinte de legatura keywords_OK = list() for keyword in keywords: if keyword not in LISTA_CUVINTE_LEGATURA: # adauga 
keyword-ul cu litere mici keywords_OK.append(keyword.lower()) # returneaza un string in care toate keyword-urile sunt alaturate prin ', ' return ", ".join(keywords_OK) print('Going through english folder') amount = 1 for file in os.listdir(en1_directory): filename = os.fsdecode(file) print(filename) if filename == 'y_key_e479323ce281e459.html' or filename == 'directory.html': continue if filename.endswith(extension_file): with open(os.path.join(english_folder2, filename), encoding='utf-8') as html: html = html.read() try: with open(os.path.join(english_folder2, filename), encoding='utf-8') as en_html: en_html = en_html.read() # title to meta try: title = re.search('<title>如何使用Python和Regex创建批处理处理器来替换HTML标记（解析）| Neculai Fantanaru.', html)[0] title_content = re.search('>(.+)<', title)[1] except: pass try: meta_og_title = re.search('', en_html)[0] new_meta_og_title = re.sub(r'content=".+"', f'content="{title_content}"', meta_og_title) en_html = en_html.replace(meta_og_title, new_meta_og_title) except: pass try: meta_keywords = re.search('', en_html)[0] keywords = creeaza_lista_keywords(title_content) new_meta_keywords = re.sub(r'content=".+"', f'content="{keywords}"', meta_keywords) en_html = en_html.replace(meta_keywords, new_meta_keywords) except: pass try: meta_abstract = re.search('', en_html)[0] new_meta_abstract = re.sub(r'content=".+"', f'content="{title_content}"', meta_abstract) en_html = en_html.replace(meta_abstract, new_meta_abstract) except: pass try: meta_Subject = re.search('', en_html)[0] new_meta_Subject = re.sub(r'content=".+"', f'content="{title_content}"', meta_Subject) en_html = en_html.replace(meta_Subject, new_meta_Subject) except: pass try: headline = re.search('"headline":.+', en_html)[0] new_headline = re.sub(r':.+', f': "{title_content}",', headline) en_html = en_html.replace(headline, new_headline) except: pass try: keywords = re.search('"keywords": "如何使用Python和Regex创建批处理处理器来替换HTML标记（解析）| Neculai Fantanaru.", new_keywords = re.sub(r':.+', 
f': "{title_content}",', keywords) en_html = en_html.replace(keywords, new_keywords) except: pass # canonical to meta og:url and @id try: canonical_content = re.search('', html)[1] except: pass try: og_url = re.search('', en_html)[0] new_og_url = re.sub(r'content=".+"', f'content="{canonical_content}"', og_url) en_html = en_html.replace(og_url, new_og_url) except: pass try: id = re.search('"@id":.+', en_html)[0] new_id = re.sub(r':.+', f': "{canonical_content}"', id) en_html = en_html.replace(id, new_id) except: pass # meta description to og:description and description try: meta = re.search('] meta_description = re.search('] except: pass try: og_description = re.search('', en_html)[0] new_og_description = re.sub(r'content=".+"', f'content="{meta_description}"', og_description) en_html = en_html.replace(og_description, new_og_description) except: pass try: description = re.search('"description": "如何使用Python和Regex创建批处理处理器来替换HTML标记（解析）| Neculai Fantanaru.", new_description = re.sub(r':.+', f': "{meta_description}",', description) en_html = en_html.replace(description, new_description) except: pass try: en_html = re.sub(', meta, en_html) except: pass try: en_html = re.sub('如何使用Python和Regex创建批处理处理器来替换HTML标记（解析）| Neculai Fantanaru.', title, en_html) except: pass except FileNotFoundError: continue print(f'{filename} parsed ({amount})') amount += 1 if use_parse_folder: try: with open(os.path.join(english_folder2+r'', ''+filename), 'w', encoding='utf-8') as new_html: new_html.write(en_html) except: os.mkdir(english_folder2+r'') with open(os.path.join(english_folder2+r'', ''+filename), 'w', encoding='utf-8') as new_html: new_html.write(en_html) else: with open(os.path.join(english_folder2, 'parsed_'+filename), 'w', encoding='utf-8') as html: html.write(en_html)

That's all folks.

If you like my code, please SHARE IT

您还可以查看此代码的PowerShell版本，或其他Python版本：版本3、版本4或版本5。

如何使用Python和Regex创建批处理处理器来替换HTML标记（解析）

Donate via Paypal

RECURRENT DONATION

SINGLE DONATION

Donate by Bank Transfer