Puteţi vizualiza întregul cod aici: https://pastebin.com/iwyRQhGm
# Translate every document in Folder1 with googletrans.
#   read_tags TRUE  -> .html files get a tag-aware pass (title/meta/p ...)
#   read_tags FALSE -> files are treated as plain text only
# Put all the documents to translate in Folder1; output lands in
# Translated_Folder.
# BeautifulSoup reference: https://beautiful-soup-4.readthedocs.io/en/latest/
import os
import re
import textwrap

from bs4 import BeautifulSoup, NavigableString
from googletrans import Translator
import pprint  # unused, kept from the original file

base_path = "Folder1"  # folder holding the original files

# TRUE -> tag-aware html translation, FALSE -> plain-text translation
read_tags = input("Want to read tags: ").lower() == "true"
input_lang = input("Enter language in which you want to translate: ")
input_extension = input("Enter file extension .txt, .html, etc: ")

# googletrans rejects over-long strings, so text is sent in bounded chunks.
TXT_CHUNK = 4820    # request size used by the .txt code path
HTML_CHUNK = 4800   # request size used by the .html code path
OUT_WIDTH = 120     # wrap width of the translated .txt output


def recursively_translate(translator, node, input_lang):
    """Translate, in place, every non-blank text node under *node*.

    Walks the BeautifulSoup tree depth-first; each NavigableString is
    replaced by its translation.  A failed request is logged and skipped so
    one bad string does not abort the rest of the subtree.
    """
    for entry in node.contents:
        if isinstance(entry, NavigableString):
            contents = entry.string
            if contents.strip() != '':
                try:
                    translation = translator.translate(contents, dest=input_lang)
                    entry.replace_with(translation.text)
                except Exception as e:
                    print("Got error during rec translation {}".format(e))
        elif entry is not None:
            recursively_translate(translator, entry, input_lang)


def remove_tags(data):
    """Return a BeautifulSoup built from *data* with markup crudely stripped.

    <html>/<body> wrappers become newlines, then the inner text of every run
    found between "<" and "</" is blanked out.
    NOTE(review): this regex-based stripping is fragile on nested or odd
    markup; behaviour is kept, but degenerate matches are now skipped (the
    original raised IndexError on "<x</" and, for empty tag bodies, called
    str.replace("", "\n") which inserts a newline between every character).
    """
    for wrapper in ("<html>", "</html>", "<body>", "</body>"):
        data = data.replace(wrapper, "\n")
    for match in re.findall("<(.*?)</", data):
        parts = match.split('>', 1)
        if len(parts) < 2 or parts[1] == '':
            continue  # BUG FIX: avoid IndexError / empty-string replace blowup
        data = data.replace(parts[1], "\n")
    return BeautifulSoup(data, 'lxml')


def traducere_v1_txt(translator, file):
    """Translate *file* paragraph by paragraph (slow, keeps layout).

    Each paragraph is split into TXT_CHUNK-sized pieces, translated,
    re-wrapped at OUT_WIDTH and written to
    Translated_Folder/<name>_<lang>.txt, one paragraph per output block.
    """
    with open(f"{base_path}/{file}", "r", encoding='utf8', errors='ignore') as open_file:
        data = open_file.readlines()
    if len(data) == 0:
        print("{} este gol".format(file))
        return
    file_name = file.replace(".txt", "")
    with open(f"Translated_Folder/{file_name}_{input_lang}.txt", "w",
              encoding='utf8') as translation_file:
        for i, paragraph in enumerate(data):
            print("Traducere paragraf {}".format(i))
            for line in textwrap.wrap(paragraph, TXT_CHUNK, break_long_words=False):
                try:
                    translated_line = translator.translate(line, dest=input_lang)
                    translation_file.writelines(
                        textwrap.wrap(translated_line.text, OUT_WIDTH,
                                      break_long_words=False))
                except Exception as e:
                    # BUG FIX: the original `return`ed here, silently
                    # truncating the output at the first failed request.
                    print(e)
            translation_file.write("\n")


def traducere_v2_txt(translator, file):
    """Translate *file* as one blob split into TXT_CHUNK-sized pieces (fast)."""
    with open(f"{base_path}/{file}", "r", encoding='utf8', errors='ignore') as open_file:
        data = open_file.read()
    if data == "":
        print("{} este gol".format(file))
        return
    chunks = textwrap.wrap(data, TXT_CHUNK, break_long_words=False)
    file_name = file.replace(".txt", "")
    with open(f"Translated_Folder/{file_name}_{input_lang}.txt", "w",
              encoding='utf8') as translation_file:
        for i, chunk in enumerate(chunks):
            print("Traducere linia {}".format(i))
            try:
                translated = translator.translate(chunk, dest=input_lang)
                translation_file.writelines(
                    textwrap.wrap(translated.text, OUT_WIDTH,
                                  break_long_words=False))
                translation_file.write("\n")
            except Exception as e:
                # BUG FIX: was `return`, which dropped all remaining chunks.
                print(e)


def _substitute_tag_translation(tag, markup, missing_msg):
    """Translate tag.text and substitute it inside *markup*; warn if absent."""
    if tag is None:
        print(missing_msg)
        return markup
    translated = translator.translate(tag.text, dest=input_lang)
    return markup.replace(tag.text, translated.text)


def _translate_visible_text(data, markup, warn_if_empty):
    """Translate the tag-stripped text of *data* chunk by chunk, substituting
    each translated chunk back into the markup string *markup*.

    Replaces the duplicated copy-pasted sequence the original carried in
    both branches of the .html path.
    """
    text = remove_tags(data).text
    text = text.replace("\ufeff", " ").replace("\n", " ")
    text = re.sub(' +', ' ', text)
    chunks = textwrap.wrap(text, HTML_CHUNK, break_long_words=False)
    if len(chunks) == 1 and chunks[0] == '':
        if warn_if_empty:
            print("No text found")
        return markup
    for chunk in chunks:
        try:
            translated = translator.translate(chunk, dest=input_lang)
            markup = markup.replace(chunk, translated.text)
        except Exception as e:
            print(e)
    return markup


translator = Translator()

# names of all files in base_path with the requested extension
files_to_translate = [name for name in os.listdir(base_path)
                      if name.endswith(input_extension)]

# ROBUSTNESS: the original crashed if the output folder did not exist
os.makedirs("Translated_Folder", exist_ok=True)

for file in files_to_translate:
    print(f"Translating {file} ..... \n")
    if file.endswith(".txt"):
        # Swap in traducere_v1_txt for paragraph-identical (slower) output.
        traducere_v2_txt(translator, file)
        print("{} a fost tradus".format(file))
    elif file.endswith(".html"):
        with open(f"{base_path}/{file}", "r", encoding='utf8',
                  errors='ignore') as open_file:
            data = open_file.read()
        if data == "":
            print("{} este gol".format(file))
            continue
        # lxml1 is the output markup; translations are string-substituted in.
        lxml1 = str(BeautifulSoup(data, 'lxml')).replace("\ufeff", " ")
        if read_tags:
            soup = BeautifulSoup(data, 'lxml')
            lxml1 = _substitute_tag_translation(
                soup.find("p", class_="JAGAAA"), lxml1,
                "<p class='JAGAAA' /> not found")
            lxml1 = _substitute_tag_translation(
                soup.find("title"), lxml1, "Title tag does not found")
            meta_tag = soup.find("meta")
            # BUG FIX: use .get() - the first <meta> may lack a "content"
            # attribute (e.g. <meta charset=...>), which raised KeyError.
            if meta_tag is None or meta_tag.get("content") is None:
                print("meta tag does not found")
            else:
                translated_meta = translator.translate(meta_tag["content"],
                                                       dest=input_lang)
                lxml1 = lxml1.replace(meta_tag["content"], translated_meta.text)
            lxml1 = _substitute_tag_translation(
                soup.find("p", class_="text_obisnuit2"), lxml1,
                "<p class='text_obisnuit2' /> not found")
            to_p_tags = soup.find_all('p', class_='text_obisnuit')
            if len(to_p_tags) == 0:
                print("<p class='text_obisnuit' /> not found")
            else:
                # NOTE(review): these translations land only in `soup`, which
                # is never serialized back into lxml1; the visible-text pass
                # below is what actually reaches the output file.  Confirm
                # this is intended.
                for p in to_p_tags:
                    recursively_translate(translator, p, input_lang)
            lxml1 = _substitute_tag_translation(
                soup.select_one("div.news_desc > h3"), lxml1,
                "<h3 /> not found")
            lxml1 = _substitute_tag_translation(
                soup.find("p", class_="donoo"), lxml1,
                "<p class='donoo' /> not found")
        # translate whatever visible text remains, tag-aware or not
        lxml1 = _translate_visible_text(data, lxml1,
                                        warn_if_empty=not read_tags)
        file_name = file.replace(".html", "")
        with open(f"Translated_Folder/{file_name}_{input_lang}.html", "w",
                  encoding='utf8') as htmlfile:
            htmlfile.write(lxml1)
        print("{} a fost tradus".format(file))
    # files with any other extension are listed but not processed,
    # matching the original behaviour
That's all folks.
If you like my code, then do me a favor: translate your website into Romanian, "ro".
Also, you can see other Python code samples HERE.