Nigute wakora ibikoresho byateguwe na Python na Regex kugirango usimbuze HTML Tagi (Parsing)

Name: Nigute wakora ibikoresho byateguwe na Python na Regex kugirango usimbuze HTML Tagi (parase)
Brand: Neculai Fantanaru
SKU: NFL
Availability: OnlineOnly
Rating: 5 (55 reviews)

On Iunie 16, 2021, in Python Scripts Examples, by Neculai Fantanaru

Urashobora kureba kode yuzuye hano:HTTPS: // passatbin.com/my num5q igitsina gabo 5

Urugero rwa kode ya HTML ruzahindurwa na kode ya Python. Gukoporora inyandiko yavuzwe haruguru kuri dosiye .html, uyikize ahantu C:\Folder1

   

<html xmlns="http://www.w3.org/1999/xhtml" dir="ltr" lang="ro">

<title>Nigute wakora ibikoresho byateguwe na Python na Regex kugirango usimbuze HTML Tagi (parase)</title>
<link rel="canonical" href="https://MY-WEBSITE.COM" />
<meta name="description" content="I LOVE HTML and CSS"/>

<meta name="keywords" content="abordarea frontala a lucrurilor neelucidate"/>
<meta name="abstract" content="My laptop works just fine"/>
<meta name="Subject" content="I think I need a new car."/>
<meta property="og:url" content="https://otherwebsite.com"/>
<meta property="og:title" content="Nobody is here?" />
<meta property="og:description" content="Dance is my passion."/>

Kode ya Python hepfo izakoporora ibiri muri tagi ya html kurindi tagi mugusenya amakuru. Ukeneye gusa kuzuza tagina

import requests
import re

# Path to english folder 1
english_folder1 = r"c:\Folder1"

# Path to english folder 2
english_folder2 = r"c:\Folder1"

extension_file = ".html"

use_parse_folder = True #Face folder nou daca pui True, iar daca pui False redenumeste fisierele in acelasi folder

import os

en1_directory = os.fsencode(english_folder1)
en2_directory = os.fsencode(english_folder2)

# Patterns for the tags that are read from the source page.
# NOTE(review): the published listing lost the HTML inside its regex literals (several were
# empty or unterminated strings). These patterns are rebuilt from the attribute names shown in
# the article's sample page and from the variable names - confirm them against your pages.
TITLE_PATTERN = r'<title[^>]*>(.+?)</title>'
CANONICAL_PATTERN = r'<link\s+rel="canonical"\s+href="([^"]+)"[^>]*>'
DESCRIPTION_PATTERN = r'<meta\s+name="description"\s+content="([^"]*)"[^>]*>'

# Files that are never processed.
SKIPPED_FILES = ('y_key_e479323ce281e459.html', 'TS_4fg4_tr78.html')

# Optional extra step, off by default (the original guarded it with `if False:`): copy whole
# comment-delimited sections from the source page into the target page.
# NOTE(review): the start/end comment markers of the section patterns were lost from the
# published listing (only '.+' survived); supply real patterns before enabling.
COPY_SECTIONS = False
SECTION_PATTERNS = ()


def _set_meta_content(page, attribute, name, value):
    """Return *page* with the content="..." of the first <meta attribute="name"> tag set to *value*.

    The page is returned unchanged when no such tag exists.
    """
    tag = re.search(rf'<meta\s+{attribute}="{re.escape(name)}"[^>]*>', page)
    if tag is None:
        return page
    # A raw double quote would terminate the attribute early.
    safe_value = value.replace('"', '&quot;')
    # A callable replacement keeps backslashes in the value from being parsed as escapes.
    new_tag = re.sub(r'content="[^"]*"', lambda _m: f'content="{safe_value}"', tag[0])
    return page.replace(tag[0], new_tag)


def _set_json_field(page, key, value, trailing_comma=True):
    """Return *page* with the first `"key": ...` line rewritten to hold *value*.

    Everything from the key to the end of its line is replaced, as in the original
    script; *trailing_comma* controls whether the rewritten line ends with a comma.
    """
    line = re.search(rf'"{re.escape(key)}":.+', page)
    if line is None:
        return page
    safe_value = value.replace('\\', '\\\\').replace('"', '\\"')
    ending = ',' if trailing_comma else ''
    return page.replace(line[0], f'"{key}": "{safe_value}"{ending}')


def copy_head_tags(html, en_html):
    """Copy title, canonical URL and description from *html* into *en_html*.

    Each value is looked up fresh in *html*; when a source tag is missing, the
    steps that depend on it are skipped instead of reusing a value left over
    from a previously processed file.

    Returns the updated target page text.
    """
    if COPY_SECTIONS:
        for section_pattern in SECTION_PATTERNS:
            section = re.search(section_pattern, html, flags=re.DOTALL)
            if section:
                en_html = re.sub(section_pattern, lambda _m, body=section[0]: body,
                                 en_html, flags=re.DOTALL)

    # title to meta
    title_match = re.search(TITLE_PATTERN, html)
    if title_match:
        title_content = title_match[1]
        en_html = _set_meta_content(en_html, 'property', 'og:title', title_content)
        en_html = _set_meta_content(en_html, 'name', 'keywords', title_content)
        en_html = _set_meta_content(en_html, 'name', 'abstract', title_content)
        en_html = _set_meta_content(en_html, 'name', 'Subject', title_content)
        en_html = _set_json_field(en_html, 'headline', title_content)
        en_html = _set_json_field(en_html, 'keywords', title_content)

    # canonical to meta og:url and @id
    canonical_match = re.search(CANONICAL_PATTERN, html)
    if canonical_match:
        canonical_content = canonical_match[1]
        en_html = _set_meta_content(en_html, 'property', 'og:url', canonical_content)
        en_html = _set_json_field(en_html, '@id', canonical_content, trailing_comma=False)

    # meta description to og:description and description
    description_match = re.search(DESCRIPTION_PATTERN, html)
    if description_match:
        meta_description = description_match[1]
        en_html = _set_meta_content(en_html, 'property', 'og:description', meta_description)
        en_html = _set_json_field(en_html, 'description', meta_description)
        # Finally carry the source <meta name="description"> tag itself over.
        en_html = re.sub(DESCRIPTION_PATTERN, lambda _m: description_match[0], en_html)

    # ... and the source <title> tag.
    if title_match:
        en_html = re.sub(TITLE_PATTERN, lambda _m: title_match[0], en_html)

    return en_html


print('Going through english folder')
for file in os.listdir(en1_directory):
    filename = os.fsdecode(file)
    print(filename)
    if filename in SKIPPED_FILES or not filename.endswith(extension_file):
        continue

    with open(os.path.join(english_folder1, filename), encoding='utf-8') as source_file:
        html = source_file.read()

    # A page without a counterpart in the second folder is skipped.
    try:
        with open(os.path.join(english_folder2, filename), encoding='utf-8') as target_file:
            en_html = target_file.read()
    except FileNotFoundError:
        continue

    en_html = copy_head_tags(html, en_html)
    print(f'{filename} parsed')

    if use_parse_folder:
        # Write into a "parsed" sub-folder, creating it on first use.
        output_folder = os.path.join(english_folder2, 'parsed')
        os.makedirs(output_folder, exist_ok=True)
    else:
        output_folder = english_folder2
    with open(os.path.join(output_folder, 'parsed_' + filename), 'w', encoding='utf-8') as new_html:
        new_html.write(en_html)

Bidashoboka. Hano hari imvugo ya regex izahindura "Ijambo ryibanze" kuri page ya HTML, ongeraho koma nyuma ya buri jambo.

Koresha hamwe na Notepad++ -> Ctrl + F -> Reba: Imvugo isanzwe (Regular expression)

SEARCH: (?s)<title>.*?<\/title>.*?<meta\x20name="keywords"\x20content="\K(\w+)|\G[^\w\r\n]+(\w+)  
REPLACE BY:  ?1\l\1:,\x20\l\2

Urashobora kugerageza iyi verisiyo igoye ya code, ikora byinshi: Kugarura amakuru kuva Tag hanyuma uyikoporore kuri tagi Urashobora kubona kode hano:

HTTPS: // Passatbin.com/jm5 Guverinoma 2QS

import requests
import re

# Path to english folder 1

english_folder2 = r"c:\Folder1"

extension_file = ".html"

use_parse_folder = True

import os

en1_directory = os.fsencode(english_folder2)
en2_directory = os.fsencode(english_folder2)

# Connecting words (Romanian and English stop words, Roman numerals, digits) that are
# ignored when the <meta keywords> content is built from the <title> text.
LISTA_CUVINTE_LEGATURA = [
    'in', 'la', 'unei', 'si', 'sa', 'se', 'de', 'prin', 'unde', 'care', 'a',
    'al', 'prea', 'lui', 'din', 'ai', 'unui', 'acei', 'un', 'doar', 'tine',
    'ale', 'sau', 'dintre', 'intre', 'cu', 'ce', 'va', 'fi', 'este', 'cand', 'o',
    'cine', 'aceasta', 'ca', 'dar', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII',
    'to', 'was', 'your', 'you', 'is', 'are', 'iar', 'fara', 'aceasta', 'pe', 'tu',
    'nu', 'mai', 'ne', 'le', 'intr', 'cum', 'e', 'for', 'she', 'it', 'esti',
    'this', 'that', 'how', 'can', 't', 'must', 'be', 'the', 'and', 'do', 'so', 'or', 'ori',
    'who', 'what', 'if', 'of', 'on', 'i', 'we', 'they', 'them', 'but', 'where', 'by', 'an',
    'on', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', 'made', 'make', 'my', 'me', '-',
    'vom', 'voi', 'ei', 'cat', 'ar', 'putea', 'poti', 'sunteti', 'inca', 'still', 'noi', 'l',
    'ma', 's', 'dupa', 'after', 'under', 'sub', 'niste', 'some', 'those', 'he',
]


def creeaza_lista_keywords(titlu: str) -> str:
    """Build the comma-separated keyword string for a page title.

    Only the part of *titlu* before the first vertical bar ``|`` is used.
    Words listed in LISTA_CUVINTE_LEGATURA are dropped; the remaining words
    are lowercased and joined with ``', '``.
    """
    # split the title at the vertical bar | and keep the first part
    prima_parte_titlu = titlu.split('|')[0]
    # extract every word from the first part of the title
    keywords = re.findall(r'(?:\w|-*\!)+', prima_parte_titlu)
    # The stop-word test runs on the word as written (before lowercasing),
    # so it is case-sensitive: 'the' is dropped, 'The' is kept.
    return ", ".join(
        keyword.lower() for keyword in keywords if keyword not in LISTA_CUVINTE_LEGATURA
    )


# Patterns for the tags that are read from each page.
# NOTE(review): the published listing lost the HTML inside its regex literals (several were
# empty or unterminated strings) and carried syntax-highlighter markup. These patterns are
# rebuilt from the attribute names shown in the article's sample page and from the variable
# names - confirm them against your pages.
TITLE_PATTERN = r'<title[^>]*>(.+?)</title>'
CANONICAL_PATTERN = r'<link\s+rel="canonical"\s+href="([^"]+)"[^>]*>'
DESCRIPTION_PATTERN = r'<meta\s+name="description"\s+content="([^"]*)"[^>]*>'

# Files that are never processed.
SKIPPED_FILES = ('y_key_e479323ce281e459.html', 'directory.html')


def _set_meta_content(page, attribute, name, value):
    """Return *page* with the content="..." of the first <meta attribute="name"> tag set to *value*.

    The page is returned unchanged when no such tag exists.
    """
    tag = re.search(rf'<meta\s+{attribute}="{re.escape(name)}"[^>]*>', page)
    if tag is None:
        return page
    # A raw double quote would terminate the attribute early.
    safe_value = value.replace('"', '&quot;')
    # A callable replacement keeps backslashes in the value from being parsed as escapes.
    new_tag = re.sub(r'content="[^"]*"', lambda _m: f'content="{safe_value}"', tag[0])
    return page.replace(tag[0], new_tag)


def _set_json_field(page, key, value, trailing_comma=True):
    """Return *page* with the first `"key": ...` line rewritten to hold *value*.

    Everything from the key to the end of its line is replaced, as in the original
    script; *trailing_comma* controls whether the rewritten line ends with a comma.
    """
    line = re.search(rf'"{re.escape(key)}":.+', page)
    if line is None:
        return page
    safe_value = value.replace('\\', '\\\\').replace('"', '\\"')
    ending = ',' if trailing_comma else ''
    return page.replace(line[0], f'"{key}": "{safe_value}"{ending}')


def refresh_head_tags(page):
    """Propagate a page's own title, canonical URL and description to its other tags.

    In this version of the script the source and the target are the same file,
    so the <title> and <meta name="description"> tags themselves are left as
    they are. Steps whose source tag is missing are skipped rather than reusing
    a value from a previously processed file.

    Returns the updated page text.
    """
    updated = page

    # title to meta
    title_match = re.search(TITLE_PATTERN, page)
    if title_match:
        title_content = title_match[1]
        updated = _set_meta_content(updated, 'property', 'og:title', title_content)
        # The keywords meta tag gets the title words with the connecting words removed.
        updated = _set_meta_content(updated, 'name', 'keywords',
                                    creeaza_lista_keywords(title_content))
        updated = _set_meta_content(updated, 'name', 'abstract', title_content)
        updated = _set_meta_content(updated, 'name', 'Subject', title_content)
        updated = _set_json_field(updated, 'headline', title_content)
        updated = _set_json_field(updated, 'keywords', title_content)

    # canonical to meta og:url and @id
    canonical_match = re.search(CANONICAL_PATTERN, page)
    if canonical_match:
        canonical_content = canonical_match[1]
        updated = _set_meta_content(updated, 'property', 'og:url', canonical_content)
        updated = _set_json_field(updated, '@id', canonical_content, trailing_comma=False)

    # meta description to og:description and description
    description_match = re.search(DESCRIPTION_PATTERN, page)
    if description_match:
        meta_description = description_match[1]
        updated = _set_meta_content(updated, 'property', 'og:description', meta_description)
        updated = _set_json_field(updated, 'description', meta_description)

    return updated


print('Going through english folder')
amount = 1
for file in os.listdir(en1_directory):
    filename = os.fsdecode(file)
    print(filename)
    if filename in SKIPPED_FILES or not filename.endswith(extension_file):
        continue

    # Source and target are the same file here, so it is read once.
    with open(os.path.join(english_folder2, filename), encoding='utf-8') as page_file:
        html = page_file.read()

    en_html = refresh_head_tags(html)
    print(f'{filename} parsed ({amount})')
    amount += 1

    # With use_parse_folder set, the original joined empty folder/prefix strings, i.e. the
    # page is overwritten in place; otherwise a 'parsed_' copy is written next to it.
    output_name = filename if use_parse_folder else 'parsed_' + filename
    with open(os.path.join(english_folder2, output_name), 'w', encoding='utf-8') as new_html:
        new_html.write(en_html)

That's all folks.

If you like my code, please SHARE IT

Urashobora kandi kubona verisiyo ya kode muri Ububasha cyangwa izindi python Verisiyo ya 3 cyangwa Verisiyo ya 4 cyangwa Verisiyo ya 5

Donate via Paypal

RECURRENT DONATION

Donate monthly to support
the NeculaiFantanaru.com project

SINGLE DONATION

Donate the desired amount to support
the NeculaiFantanaru.com project

Donate by Bank Transfer

Account Ron: RO34INGB0000999900448439

Open account at ING Bank

* Note: If you want to read all my articles in real time, please check the romanian version !