You can view the full code here: https://pastebin.com/wnuM5Qg5
Here is an example of the HTML pages that the Python code will modify. Copy the text below into an .html file and save it in the folder C:\Folder1
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" dir="ltr" lang="ro">
<head>
<!-- Source tags: the script reads the title, the canonical link and the meta description. -->
<title>YOUR FIRST PAGE</title>
<link rel="canonical" href="https://MY-WEBSITE.COM" />
<meta name="description" content="I LOVE HTML and CSS"/>
<!-- Target tags: the script overwrites the values below with the ones read above. -->
<meta name="keywords" content="abordarea frontala a lucrurilor neelucidate"/>
<meta name="abstract" content="My laptop works just fine"/>
<meta name="Subject" content="I think I need a new car."/>
<meta property="og:url" content="https://otherwebsite.com"/>
<meta property="og:title" content="Nobody is here?" />
<meta property="og:description" content="Dance is my passion."/>
<!-- Schema Org Start -->
<script type="application/ld+json">
{
    "@context":"https://schema.org",
    "@type":"Article",
    "mainEntityOfPage": {
        "@type": "WebPage",
        "@id": "https://books-and-reading.com"
    },
    "headline": "Another glass",
    "keywords": "anything, words",
    "description": "My name is Prince.",
    "image": {
        "@type": "ImageObject",
        "url": "https://website.com/icon-facebook.jpg"
    }
}
</script>
</head>
<body>
</body>
</html>
The Python code below parses each page and copies the contents of some HTML tags into the other tags. You only need to fill in the tags <title> and <meta name="description"... />
"""Copy the title, canonical URL and meta description of HTML pages into
their other meta tags and into the JSON-LD block.

For every ``.html`` file in ``english_folder1`` the script reads the page with
the same name from ``english_folder2``, updates it and writes the result as
``parsed_<name>``.
"""
import json
import os
import re

try:
    # The original script imported requests without ever using it; keep the
    # name available but do not require the package to be installed.
    import requests  # noqa: F401
except ImportError:
    requests = None

# Path to english folder 1 (pages the metadata is read from)
english_folder1 = r"c:\Folder1"
# Path to english folder 2 (pages that are updated)
english_folder2 = r"c:\Folder1"
extension_file = ".html"
# True: write the results into a new "parsed" sub-folder of english_folder2.
# False: write them into english_folder2 itself.
# In both cases the output file is named "parsed_<original name>".
use_parse_folder = True
# True: also copy the comment-delimited page sections listed in
# SECTION_MARKERS from the source page into the target page.
# (This part was switched off in the original script with ``if False:``.)
copy_page_sections = False

# Kept for backward compatibility with the original script.
en1_directory = os.fsencode(english_folder1)
en2_directory = os.fsencode(english_folder2)

# Files that are never processed.
SKIPPED_FILES = ('y_key_e479323ce281e459.html', 'TS_4fg4_tr78.html')

# (start comment, end comment) pairs delimiting the optional page sections.
SECTION_MARKERS = (
    ('<!-- ARTICOL START -->', '<!-- ARTICOL FINAL -->'),
    ('<!-- FLAGS_1 -->', '<!-- FLAGS -->'),
    ('<!-- MENIU BARA SUS -->', '<!-- SFARSIT MENIU BARA SUS -->'),
)

# Remainder of a tag up to its closing ">"; quoted attribute values are
# consumed as a unit so that a ">" inside a value does not end the tag.
_TAG_REST = r'(?:[^>"]|"[^"]*")*>'
_TITLE_TAG = r'<title[^>]*>(.*?)</title>'
_DESCRIPTION_TAG = r'<meta name="description"' + _TAG_REST
# Meta tags whose content is replaced by the page title.
_TITLE_META_TAGS = (
    'property="og:title"',
    'name="keywords"',
    'name="abstract"',
    'name="Subject"',
)


def _first_group(pattern, text):
    """Return group 1 of the first match of *pattern* in *text*, or None."""
    match = re.search(pattern, text)
    return match[1] if match else None


def _set_meta_content(page, attribute, value):
    """Set ``content="..."`` of the first ``<meta ATTRIBUTE ...>`` tag.

    *attribute* is the literal text that follows ``<meta `` (for example
    ``name="keywords"``).  The page is returned unchanged when the tag, or
    its content attribute, is missing.
    """
    tag_pattern = '<meta ' + re.escape(attribute) + _TAG_REST
    # A raw double quote would terminate the attribute value early.
    new_attribute = 'content="' + value.replace('"', '&quot;') + '"'

    def rewrite_tag(tag_match):
        # A function replacement keeps backslashes in the value literal.
        return re.sub(r'content="[^"]*"', lambda _: new_attribute,
                      tag_match[0], count=1)

    return re.sub(tag_pattern, rewrite_tag, page, count=1)


def _set_json_string(page, key, value):
    """Replace the string value of the first ``"key": "..."`` pair.

    Only the value is rewritten, so the spacing and any trailing comma of the
    original line are preserved.  *value* is JSON-escaped.
    """
    pattern = r'("' + re.escape(key) + r'"\s*:\s*)"(?:[^"\\]|\\.)*"'
    encoded = json.dumps(value, ensure_ascii=False)
    return re.sub(pattern, lambda match: match[1] + encoded, page, count=1)


def _copy_section(source_html, target_html, start_marker, end_marker):
    """Copy the text between two marker comments from source to target."""
    pattern = re.escape(start_marker) + '.+' + re.escape(end_marker)
    section = re.search(pattern, source_html, flags=re.DOTALL)
    if section is None:
        return target_html
    return re.sub(pattern, lambda _: section[0], target_html, flags=re.DOTALL)


def sync_metadata(source_html, target_html, copy_sections=False):
    """Return *target_html* with its metadata filled in from *source_html*.

    * The title text goes to the og:title, keywords, abstract and Subject
      meta tags and to the JSON-LD "headline" and "keywords" fields.
    * The canonical URL goes to the og:url meta tag and the JSON-LD "@id".
    * The meta description goes to og:description and the JSON-LD
      "description" field.
    * Finally the description meta tag and the title tag themselves are
      copied from the source page to the target page.

    Every value is looked up afresh for each call, so a tag that is missing
    from the source simply leaves the corresponding targets untouched (the
    original script silently reused the value of the previous file).
    """
    page = target_html
    if copy_sections:
        for start_marker, end_marker in SECTION_MARKERS:
            page = _copy_section(source_html, page, start_marker, end_marker)

    # title to meta
    title_match = re.search(_TITLE_TAG, source_html)
    title_text = title_match[1] if title_match else ''
    if title_text:
        for attribute in _TITLE_META_TAGS:
            page = _set_meta_content(page, attribute, title_text)
        for key in ('headline', 'keywords'):
            page = _set_json_string(page, key, title_text)

    # canonical to meta og:url and @id
    canonical_url = _first_group(r'<link rel="canonical" href="([^"]+)"',
                                 source_html)
    if canonical_url:
        page = _set_meta_content(page, 'property="og:url"', canonical_url)
        page = _set_json_string(page, '@id', canonical_url)

    # meta description to og:description and description
    description_tag = re.search(_DESCRIPTION_TAG, source_html)
    if description_tag:
        description_text = _first_group(r'content="([^"]*)"',
                                        description_tag[0])
        if description_text:
            page = _set_meta_content(page, 'property="og:description"',
                                     description_text)
            page = _set_json_string(page, 'description', description_text)
        page = re.sub(_DESCRIPTION_TAG, lambda _: description_tag[0], page,
                      count=1)

    if title_match:
        page = re.sub(_TITLE_TAG, lambda _: title_match[0], page, count=1)
    return page


def main(source_folder=None, target_folder=None):
    """Process every page of *source_folder* (default: english_folder1)."""
    if source_folder is None:
        source_folder = english_folder1
    if target_folder is None:
        target_folder = english_folder2

    print('Going through english folder')
    for filename in os.listdir(source_folder):
        print(filename)
        if filename in SKIPPED_FILES or not filename.endswith(extension_file):
            continue

        with open(os.path.join(source_folder, filename),
                  encoding='utf-8') as source_file:
            source_html = source_file.read()
        try:
            with open(os.path.join(target_folder, filename),
                      encoding='utf-8') as target_file:
                target_html = target_file.read()
        except FileNotFoundError:
            # No page with this name in the second folder: nothing to update.
            continue

        result = sync_metadata(source_html, target_html,
                               copy_sections=copy_page_sections)
        print(f'{filename} parsed')

        if use_parse_folder:
            output_folder = os.path.join(target_folder, 'parsed')
            os.makedirs(output_folder, exist_ok=True)
        else:
            output_folder = target_folder
        with open(os.path.join(output_folder, 'parsed_' + filename), 'w',
                  encoding='utf-8') as output_file:
            output_file.write(result)


if __name__ == '__main__':
    main()
Optional. Here is a REGEX expression that will change the "KEYWORDS" tag in the html page, adding a comma after each word.
Use it with Notepad++ -> Ctrl+F -> select the search mode "Regular expression"
SEARCH: (?s)<title>.*?<\/title>.*?<meta\x20name="keywords"\x20content="\K(\w+)|\G[^\w\r\n]+(\w+) REPLACE BY: ?1\l\1:,\x20\l\2
You can also try this more complex version of the code, which does more: it takes the text of the <title> tag, drops the connecting words, and copies the remaining words as keywords into the tag <meta name="keywords" content=" "/>. You can see the code here:
"""Fill the meta tags and the JSON-LD block of HTML pages from their own
title, canonical URL and meta description.

Unlike the simpler version, the ``<meta name="keywords">`` tag receives a
keyword list built from the title (see ``creeaza_lista_keywords``) instead of
the raw title text.
"""
import json
import os
import re

try:
    # The original script imported requests without ever using it; keep the
    # name available but do not require the package to be installed.
    import requests  # noqa: F401
except ImportError:
    requests = None

# Path to the folder with the pages to update
english_folder2 = r"c:\Folder1"
extension_file = ".html"
# True: every page is overwritten in place.
# False: the result is written next to the page as "parsed_<original name>".
use_parse_folder = True

# Kept for backward compatibility with the original script.
en1_directory = os.fsencode(english_folder2)
en2_directory = os.fsencode(english_folder2)

# These connecting words are ignored when the <title> text is turned into the
# content of the <meta name="keywords"> tag.
LISTA_CUVINTE_LEGATURA = [
    'in', 'la', 'unei', 'si', 'sa', 'se', 'de', 'prin', 'unde', 'care', 'a',
    'al', 'prea', 'lui', 'din', 'ai', 'unui', 'acei', 'un', 'doar', 'tine',
    'ale', 'sau', 'dintre', 'intre', 'cu','ce', 'va', 'fi', 'este', 'cand', 'o',
    'cine', 'aceasta', 'ca', 'dar', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII',
    'to', 'was', 'your', 'you', 'is', 'are', 'iar', 'fara', 'aceasta',
    'pe', 'tu', 'nu', 'mai', 'ne', 'le', 'intr', 'cum', 'e', 'for', 'she', 'it',
    'esti', 'this', 'that', 'how', 'can', 't', 'must', 'be', 'the', 'and', 'do',
    'so', 'or', 'ori', 'who', 'what', 'if', 'of', 'on', 'i', 'we', 'they', 'them',
    'but', 'where', 'by', 'an', 'on', '1', '2', '3', '4', '5', '6', '7', '8', '9',
    '0', 'made', 'make', 'my', 'me', '-', 'vom', 'voi', 'ei', 'cat', 'ar', 'putea',
    'poti', 'sunteti', 'inca', 'still', 'noi', 'l', 'ma', 's', 'dupa', 'after',
    'under', 'sub', 'niste', 'some', 'those', 'he'
]

# Lower-cased lookup set: titles are usually capitalised ("The", "How"), so
# the comparison has to ignore case.
_CUVINTE_LEGATURA_LOWER = frozenset(
    cuvant.lower() for cuvant in LISTA_CUVINTE_LEGATURA
)

# Files that are never processed.
SKIPPED_FILES = ('y_key_e479323ce281e459.html', 'directory.html')

# Remainder of a tag up to its closing ">"; quoted attribute values are
# consumed as a unit so that a ">" inside a value does not end the tag.
_TAG_REST = r'(?:[^>"]|"[^"]*")*>'
_TITLE_TAG = r'<title[^>]*>(.*?)</title>'
_DESCRIPTION_TAG = r'<meta name="description"' + _TAG_REST


def creeaza_lista_keywords(titlu):
    """Build the content of the keywords meta tag from a page title.

    Only the part of *titlu* before the first vertical bar ``|`` is used.
    Its words are lower-cased, the connecting words listed in
    ``LISTA_CUVINTE_LEGATURA`` are dropped (ignoring case) and the remaining
    words are joined with ``", "``.  An empty string is returned when no
    word is left.
    """
    # split the title at the vertical bar | and keep the first part
    prima_parte_titlu = titlu.split('|')[0]
    # extract all the words of the first part of the title
    cuvinte = re.findall(r'(?:\w|-*\!)+', prima_parte_titlu)
    # keep, in lower case, the words that are not connecting words
    keywords_ok = [
        cuvant.lower() for cuvant in cuvinte
        if cuvant.lower() not in _CUVINTE_LEGATURA_LOWER
    ]
    return ", ".join(keywords_ok)


def _first_group(pattern, text):
    """Return group 1 of the first match of *pattern* in *text*, or None."""
    match = re.search(pattern, text)
    return match[1] if match else None


def _set_meta_content(page, attribute, value):
    """Set ``content="..."`` of the first ``<meta ATTRIBUTE ...>`` tag.

    *attribute* is the literal text that follows ``<meta `` (for example
    ``name="keywords"``).  The page is returned unchanged when the tag, or
    its content attribute, is missing.
    """
    tag_pattern = '<meta ' + re.escape(attribute) + _TAG_REST
    # A raw double quote would terminate the attribute value early.
    new_attribute = 'content="' + value.replace('"', '&quot;') + '"'

    def rewrite_tag(tag_match):
        # A function replacement keeps backslashes in the value literal.
        return re.sub(r'content="[^"]*"', lambda _: new_attribute,
                      tag_match[0], count=1)

    return re.sub(tag_pattern, rewrite_tag, page, count=1)


def _set_json_string(page, key, value):
    """Replace the string value of the first ``"key": "..."`` pair.

    Only the value is rewritten, so the spacing and any trailing comma of the
    original line are preserved.  *value* is JSON-escaped.
    """
    pattern = r'("' + re.escape(key) + r'"\s*:\s*)"(?:[^"\\]|\\.)*"'
    encoded = json.dumps(value, ensure_ascii=False)
    return re.sub(pattern, lambda match: match[1] + encoded, page, count=1)


def sync_metadata(source_html, target_html):
    """Return *target_html* with its metadata filled in from *source_html*.

    * The title text goes to the og:title, abstract and Subject meta tags
      and to the JSON-LD "headline" and "keywords" fields; the keywords meta
      tag receives ``creeaza_lista_keywords(title)``.
    * The canonical URL goes to the og:url meta tag and the JSON-LD "@id".
    * The meta description goes to og:description and the JSON-LD
      "description" field.
    * Finally the description meta tag and the title tag themselves are
      copied from the source page to the target page.

    Every value is looked up afresh for each call, so a tag that is missing
    from the source simply leaves the corresponding targets untouched (the
    original script silently reused the value of the previous file).
    """
    page = target_html

    # title to meta
    title_match = re.search(_TITLE_TAG, source_html)
    title_text = title_match[1] if title_match else ''
    if title_text:
        page = _set_meta_content(page, 'property="og:title"', title_text)
        page = _set_meta_content(page, 'name="keywords"',
                                 creeaza_lista_keywords(title_text))
        page = _set_meta_content(page, 'name="abstract"', title_text)
        page = _set_meta_content(page, 'name="Subject"', title_text)
        page = _set_json_string(page, 'headline', title_text)
        # As in the original script, the JSON-LD keywords get the full title.
        page = _set_json_string(page, 'keywords', title_text)

    # canonical to meta og:url and @id
    canonical_url = _first_group(r'<link rel="canonical" href="([^"]+)"',
                                 source_html)
    if canonical_url:
        page = _set_meta_content(page, 'property="og:url"', canonical_url)
        page = _set_json_string(page, '@id', canonical_url)

    # meta description to og:description and description
    description_tag = re.search(_DESCRIPTION_TAG, source_html)
    if description_tag:
        description_text = _first_group(r'content="([^"]*)"',
                                        description_tag[0])
        if description_text:
            page = _set_meta_content(page, 'property="og:description"',
                                     description_text)
            page = _set_json_string(page, 'description', description_text)
        page = re.sub(_DESCRIPTION_TAG, lambda _: description_tag[0], page,
                      count=1)

    if title_match:
        page = re.sub(_TITLE_TAG, lambda _: title_match[0], page, count=1)
    return page


def main(folder=None):
    """Update every page found in *folder* (default: english_folder2)."""
    if folder is None:
        folder = english_folder2

    print('Going through english folder')
    amount = 1
    for filename in os.listdir(folder):
        print(filename)
        if filename in SKIPPED_FILES or not filename.endswith(extension_file):
            continue

        # Source and target are the same page in this version of the script.
        with open(os.path.join(folder, filename),
                  encoding='utf-8') as page_file:
            page = page_file.read()
        result = sync_metadata(page, page)

        print(f'{filename} parsed ({amount})')
        amount += 1

        output_name = filename if use_parse_folder else 'parsed_' + filename
        with open(os.path.join(folder, output_name), 'w',
                  encoding='utf-8') as output_file:
            output_file.write(result)


if __name__ == '__main__':
    main()
That's all folks.
Also, see this VERSION 2 or VERSION 3 or VERSION 4 or VERSION 5 or VERSION 6 or VERSION 7