Urugero rwa kode ya HTML ruzahindurwa na kode ya Python. Koporora inyandiko ikurikira muri dosiye .html, uyibike muri C:\Folder1
<html xmlns="http://www.w3.org/1999/xhtml" dir="ltr" lang="ro">
<title>Nigute wakora ibikoresho byateguwe na Python na Regex kugirango usimbuze HTML Tagi (parase)</title>
<link rel="canonical" href="https://MY-WEBSITE.COM"/>
<meta name="description" content="I LOVE HTML and CSS"/>
<meta name="keywords" content="abordarea frontala a lucrurilor neelucidate"/>
<meta name="abstract" content="My laptop works just fine"/>
<meta name="Subject" content="I think I need a new car."/>
<meta property="og:url" content="https://otherwebsite.com"/>
<meta property="og:title" content="Nobody is here?"/>
<meta property="og:description" content="Dance is my passion."/>
Kode ya Python iri hepfo izakoporora ibiri muri tagi ya HTML ibishyire mu yindi tagi binyuze mu gusesengura (parsing). Ukeneye gusa kuzuza tagi.
"""Copy head metadata from one set of HTML pages into another.

For every ``.html`` file in ``english_folder1`` the script opens the file with
the same name in ``english_folder2`` and rewrites these parts of it:

* the page title -> og:title / keywords / abstract / Subject meta tags and the
  JSON-LD ``"headline"`` and ``"keywords"`` lines, plus the ``<title>`` tag;
* the canonical link -> og:url meta tag and the JSON-LD ``"@id"`` line;
* the description meta tag -> og:description, the JSON-LD ``"description"``
  line and the description meta tag itself.

NOTE(review): the published listing lost its HTML-looking regex patterns during
extraction. The tag patterns below were reconstructed from the sample page that
accompanies the script and from the variable names - confirm them against your
own pages before running this on real data.
"""
import os
import re

import requests  # NOTE(review): not used below; kept because the listing imports it

# Path to english folder 1 (pages the values are read from)
english_folder1 = r"c:\Folder1"
# Path to english folder 2 (pages that get rewritten)
english_folder2 = r"c:\Folder1"

extension_file = ".html"

# True: write the results into a new "parsed" sub-folder.
# False: write them next to the originals, with a "parsed_" name prefix.
use_parse_folder = True

# Files that are listed but never processed.
SKIPPED_FILES = {'y_key_e479323ce281e459.html', 'TS_4fg4_tr78.html'}

# The listing ships this feature switched off (`if False:`). When enabled, the
# text between each (start, end) pair of HTML comments is copied across too.
COPY_MARKED_SECTIONS = False
# NOTE(review): the three marker pairs of the original were stripped from the
# listing and cannot be recovered; add your own pairs here, for example
# ("<!-- SECTION START -->", "<!-- SECTION END -->").
SECTION_MARKERS = ()

# Meta tags of the target page whose content="..." receives the source title.
TITLE_TARGET_TAGS = (
    r'<meta property="og:title".*?/>',
    r'<meta name="keywords".*?/>',
    r'<meta name="abstract".*?/>',
    r'<meta name="Subject".*?/>',
)


def _first_match(pattern, text, group=0, flags=0):
    """Return *group* of the first match of *pattern* in *text*, or None."""
    found = re.search(pattern, text, flags)
    return found[group] if found else None


def _set_tag_content(page, tag_pattern, value):
    """Put *value* into content="..." of the first tag matching *tag_pattern*.

    *page* is returned unchanged when the tag is absent or *value* is None.
    """
    if value is None:
        return page
    tag = _first_match(tag_pattern, page)
    if tag is None:
        return page
    # A function replacement keeps backslashes in *value* literal.
    new_tag = re.sub(r'content="[^"]*"', lambda _m: f'content="{value}"', tag)
    return page.replace(tag, new_tag)


def _set_json_value(page, line_pattern, value, suffix):
    """Replace everything after the first colon of the line matching *line_pattern*.

    The new text is ``: "<value>"`` followed by *suffix*. *page* is returned
    unchanged when the line is absent or *value* is None.
    """
    if value is None:
        return page
    line = _first_match(line_pattern, page)
    if line is None:
        return page
    new_line = re.sub(r':.+', lambda _m: f': "{value}"{suffix}', line)
    return page.replace(line, new_line)


def _transfer_tags(html, en_html):
    """Return *en_html* with the title/canonical/description data of *html*.

    All values are looked up afresh on every call, so a page that lacks a tag
    never receives the value found in a previously processed page.
    """
    if COPY_MARKED_SECTIONS:
        for start, end in SECTION_MARKERS:
            pattern = re.escape(start) + '.+' + re.escape(end)
            section = _first_match(pattern, html, flags=re.DOTALL)
            if section is not None:
                en_html = re.sub(pattern, lambda _m, s=section: s,
                                 en_html, flags=re.DOTALL)

    # title to meta
    title = _first_match(r'<title.+/title>', html)
    title_content = None
    if title is not None:
        title_content = _first_match(r'>(.+)<', title, group=1)
    for tag_pattern in TITLE_TARGET_TAGS:
        en_html = _set_tag_content(en_html, tag_pattern, title_content)
    en_html = _set_json_value(en_html, r'"headline":.+', title_content, ',')
    en_html = _set_json_value(en_html, r'"keywords":.+', title_content, ',')

    # canonical to meta og:url and @id
    canonical_content = _first_match(
        r'<link rel="canonical" href="([^"]*)"', html, group=1)
    en_html = _set_tag_content(
        en_html, r'<meta property="og:url".*?/>', canonical_content)
    # The "@id" line is rewritten without a trailing comma, as in the listing.
    en_html = _set_json_value(en_html, r'"@id":.+', canonical_content, '')

    # meta description to og:description and description
    meta = _first_match(r'<meta name="description".*?/>', html)
    meta_description = None
    if meta is not None:
        meta_description = _first_match(r'content="([^"]*)"', meta, group=1)
    en_html = _set_tag_content(
        en_html, r'<meta property="og:description".*?/>', meta_description)
    en_html = _set_json_value(
        en_html, r'"description":.+', meta_description, ',')

    # Finally replace the whole description tag and the whole title tag.
    if meta is not None:
        en_html = re.sub(r'<meta name="description".*?/>',
                         lambda _m: meta, en_html)
    if title is not None:
        en_html = re.sub(r'<title.+/title>', lambda _m: title, en_html)
    return en_html


def _write_result(filename, en_html):
    """Write *en_html* as ``parsed_<filename>`` to the configured location."""
    if use_parse_folder:
        target_dir = english_folder2 + r'\parsed'
        # Create the folder up front instead of retrying after a failed write.
        os.makedirs(target_dir, exist_ok=True)
    else:
        target_dir = english_folder2
    target_path = os.path.join(target_dir, 'parsed_' + filename)
    with open(target_path, 'w', encoding='utf-8') as new_html:
        new_html.write(en_html)


print('Going through english folder')
for filename in os.listdir(english_folder1):
    print(filename)
    if filename in SKIPPED_FILES:
        continue
    if not filename.endswith(extension_file):
        continue

    with open(os.path.join(english_folder1, filename), encoding='utf-8') as source_file:
        html = source_file.read()
    try:
        with open(os.path.join(english_folder2, filename), encoding='utf-8') as target_file:
            en_html = target_file.read()
    except FileNotFoundError:
        # No counterpart in the second folder: nothing to rewrite.
        continue

    en_html = _transfer_tags(html, en_html)
    print(f'{filename} parsed')
    _write_result(filename, en_html)
Bidashoboka. Hano hari imvugo ya regex izahindura "Ijambo ryibanze" kuri page ya HTML, ongeraho koma nyuma ya buri jambo.
Koresha hamwe na Notepad++ -> Ctrl + F -> Reba: Imvugo isanzwe