Neculai Fantanaru

Everything Depends On The Leader

Python: (Split) Breaks The Text Into Blocks During Translation

On March 21, 2023, in Python Scripts Examples, by Neculai Fantanaru

You can view the full code here: https://pastebin.com/iwyRQhGm

# to translate tags, answer TRUE
# to translate TEXT (FALSE - LANGUAGE (en) - TXT)
# PUT ALL THE DOCUMENTS TO BE TRANSLATED IN FOLDER1
# https://beautiful-soup-4.readthedocs.io/en/latest/   css_soup.find_all("p", class_="strikeout")


import os
from bs4 import BeautifulSoup, NavigableString
import re
import textwrap
from googletrans import Translator
import pprint

# Folder that holds the original files to be translated.
base_path = "Folder1"

# Only the answer "true" (in any letter case) enables tag mode; every other
# answer leaves read_tags False.
read_tags = input("Want to read tags: ").lower() == "true"

# Destination language, passed as `dest` to the translator.
input_lang = input("Enter language in which you want to translate: ")

# Suffix used to pick which files from base_path are processed.
input_extension = input("Enter file extension .txt, .html, etc: ")

def recursively_translate(translator, node, input_lang):
    """Translate, in place, every non-blank text node found under ``node``.

    Each child of ``node`` that is a ``NavigableString`` with non-whitespace
    content is passed to ``translator.translate`` and replaced by the ``.text``
    of the result. Every other child is descended into recursively.

    Args:
        translator: object exposing ``translate(text, dest=...)`` whose return
            value has a ``.text`` attribute.
        node: parse-tree element exposing ``.contents``.
        input_lang: destination language, forwarded as ``dest``.

    Returns:
        None. The tree is mutated in place. A failed translation is reported
        on stdout and the affected text node is left as it was.
    """
    # Iterate over a snapshot: replace_with() edits node.contents, and looping
    # over a list that is being modified is fragile.
    for entry in list(node.contents):
        if isinstance(entry, NavigableString):
            # NOTE(review): subclasses of NavigableString (e.g. HTML comments)
            # also take this branch and get translated - confirm that is wanted.
            contents = entry.string
            if contents.strip() == '':
                continue
            try:
                translation = translator.translate(contents, dest=input_lang)
                entry.replace_with(translation.text)
            except Exception as e:
                # Best effort: keep going with the remaining nodes.
                print("Got error during rec translation {}".format(e))
        elif entry is not None:
            recursively_translate(translator, entry, input_lang)

def remove_tags(data):
    """Blank out element text in ``data`` and return the parsed result.

    Steps, in order:
      1. The literal ``<html>``, ``</html>``, ``<body>`` and ``</body>``
         markers are replaced by newlines.
      2. For every non-greedy ``<...</`` regex match, the part that follows
         the first ``>`` (the text between an opening tag and the next
         closing tag on the same line) is replaced by a newline everywhere
         it occurs in ``data``.
      3. What is left is parsed with the ``lxml`` parser.

    Args:
        data: HTML source as a string.

    Returns:
        A ``BeautifulSoup`` object built from the modified markup.
    """
    for wrapper in ("<html>", "</html>", "<body>", "</body>"):
        data = data.replace(wrapper, "\n")

    # The matches are collected once, before any of the replacements below.
    for match in re.findall("<(.*?)</", data):
        _, separator, inner_text = match.partition('>')
        # No '>' in the match: there is no inner text to remove (indexing the
        # split result used to raise IndexError here).
        # Empty inner text (e.g. "<p></p>"): str.replace("", "\n") would insert
        # a newline between every character of the document.
        if not separator or not inner_text:
            continue
        data = data.replace(inner_text, "\n")

    soup = BeautifulSoup(data, 'lxml')
    return soup


translator = Translator()

# Names of the entries in base_path that end with the requested extension
# (file names only, not sub-folders, despite the variable name).
subfolders = [
    entry for entry in os.listdir(base_path) if entry.endswith(input_extension)
]

## New code that translates the text piece by piece
def traducere_v1_txt(translator, file):
    """Translate a text file line by line, keeping one output block per input line.

    Every line read from ``{base_path}/{file}`` is treated as a paragraph. It
    is split into chunks of at most 4820 characters, each chunk is translated
    to ``input_lang``, and the translation is written re-wrapped at 120
    columns to ``Translated_Folder/{name}_{input_lang}.txt``. Blank input
    lines are written out as blank lines.

    Args:
        translator: object exposing ``translate(text, dest=...)`` whose return
            value has a ``.text`` attribute.
        file: name of the file inside ``base_path``.

    Returns:
        None. On the first translation error the exception is printed and the
        function returns, leaving the output file partially written.
    """
    with open(f"{base_path}/{file}", "r", encoding='utf8', errors='ignore') as open_file:
        data = open_file.readlines()
    if not data:
        print("{} este gol".format(file))
        return

    file_name = file.replace(".txt", "")
    # The output folder is not created anywhere else; without this the open()
    # below fails on a fresh checkout.
    os.makedirs("Translated_Folder", exist_ok=True)
    with open(f"Translated_Folder/{file_name}_{input_lang}.txt", "w", encoding='utf8') as translation_file:
        for i, paragraph in enumerate(data):
            print("Traducere paragraf {}".format(i))
            # presumably 4820 keeps each request under the translation
            # service's per-call size limit - TODO confirm
            chunks = textwrap.wrap(paragraph, 4820, break_long_words=False)
            if not chunks:
                # Blank input line: keep it as a blank line in the output.
                translation_file.write("\n")
                continue
            for chunk in chunks:
                try:
                    translated_line = translator.translate(chunk, dest=input_lang)
                    translated_lines = textwrap.wrap(translated_line.text, 120, break_long_words=False)
                except Exception as e:
                    print(e)
                    return
                # writelines() adds no separators, so the wrapped lines used to
                # be glued together and the words at each wrap point fused.
                translation_file.write("\n".join(translated_lines) + "\n")

def traducere_v2_txt(translator, file):
    """Translate a whole text file in large chunks.

    The full content of ``{base_path}/{file}`` is split by ``textwrap.wrap``
    into chunks of at most 4820 characters (which also turns the original
    line breaks into spaces). Each chunk is translated to ``input_lang`` and
    written, re-wrapped at 120 columns, to
    ``Translated_Folder/{name}_{input_lang}.txt``.

    Args:
        translator: object exposing ``translate(text, dest=...)`` whose return
            value has a ``.text`` attribute.
        file: name of the file inside ``base_path``.

    Returns:
        None. On the first translation error the exception is printed and the
        function returns, leaving the output file partially written.
    """
    with open(f"{base_path}/{file}", "r", encoding='utf8', errors='ignore') as open_file:
        data = open_file.read()
    if data == "":
        print("{} este gol".format(file))
        return

    # presumably 4820 keeps each request under the translation service's
    # per-call size limit - TODO confirm
    chunks = textwrap.wrap(data, 4820, break_long_words=False)
    file_name = file.replace(".txt", "")
    # The output folder is not created anywhere else; without this the open()
    # below fails on a fresh checkout.
    os.makedirs("Translated_Folder", exist_ok=True)
    with open(f"Translated_Folder/{file_name}_{input_lang}.txt", "w", encoding='utf8') as translation_file:
        for i, chunk in enumerate(chunks):
            print("Traducere linia {}".format(i))
            try:
                translated_line = translator.translate(chunk, dest=input_lang)
                translated_lines = textwrap.wrap(translated_line.text, 120, break_long_words=False)
            except Exception as e:
                print(e)
                return
            # writelines() adds no separators, so the wrapped lines used to be
            # glued together and the words at each wrap point fused.
            translation_file.write("\n".join(translated_lines) + "\n")

def _replace_translated(markup, source_text, translator, dest):
    """Return ``markup`` with each occurrence of ``source_text`` replaced by its translation.

    Missing or blank text is skipped: ``str.replace`` with an empty search
    string would insert the replacement between every character of the markup.
    A failed translation is printed and leaves ``markup`` unchanged.
    """
    if not source_text or not source_text.strip():
        return markup
    try:
        translated = translator.translate(source_text, dest=dest)
        return markup.replace(source_text, translated.text)
    except Exception as e:
        print(e)
        return markup


def _translate_known_tags(markup, html_source, translator, dest):
    """Translate the specific tags this script knows about and return the updated markup.

    The tags are looked up in a fresh parse of ``html_source``; their text is
    then replaced inside the serialized ``markup`` string. The order of the
    replacements is significant because each one rewrites ``markup``.
    """
    soup = BeautifulSoup(html_source, 'lxml')
    title_tag = soup.find("title")
    desc_tag = soup.select_one("div.news_desc > h3")
    to_p_tag = soup.find_all('p', class_='text_obisnuit')
    ist_p_tag = soup.find("p", class_="text_obisnuit2")
    second_p_tag = soup.find("p", class_="donoo")
    ist3_p_tag = soup.find("p", class_="JAGAAA")
    meta_tag = soup.find("meta")

    if ist3_p_tag is None:
        print("<p class='JAGAAA' /> not found")
    else:
        markup = _replace_translated(markup, ist3_p_tag.text, translator, dest)

    if title_tag is None:
        print("Title tag does not found")
    else:
        markup = _replace_translated(markup, title_tag.text, translator, dest)

    if meta_tag is None:
        print("meta tag does not found")
    else:
        # The first <meta> may have no "content" attribute (for example
        # <meta charset="utf-8">); .get() avoids the KeyError and the helper
        # skips the missing value.
        markup = _replace_translated(markup, meta_tag.get("content"), translator, dest)

    if ist_p_tag is None:
        print("<p class='text_obisnuit2' /> not found")
    else:
        markup = _replace_translated(markup, ist_p_tag.text, translator, dest)

    if len(to_p_tag) == 0:
        print("<p class='text_obisnuit' /> not found")
    else:
        for p in to_p_tag:
            # recursively_translate() only mutates the parsed tree, while the
            # file is written from `markup`. Swap the paragraph's serialized
            # form before/after translation so the result is not discarded.
            # The BOM replacement mirrors the one applied to `markup`.
            before = str(p).replace("\ufeff", " ")
            recursively_translate(translator, p, dest)
            after = str(p).replace("\ufeff", " ")
            if after != before:
                markup = markup.replace(before, after)

    if desc_tag is None:
        print("<h3   /> not found")
    else:
        markup = _replace_translated(markup, desc_tag.text, translator, dest)

    if second_p_tag is None:
        print("<p class='donoo' /> not found")
    else:
        markup = _replace_translated(markup, second_p_tag.text, translator, dest)

    return markup


def _translate_text_chunks(markup, html_source, translator, dest):
    """Translate the text returned by ``remove_tags`` in chunks of up to 4800 characters.

    Returns a ``(markup, found_text)`` tuple: the markup after every chunk
    that could be translated was replaced, and whether any chunk existed.
    A chunk whose translation fails is printed and skipped.
    """
    text = remove_tags(html_source).text
    text = text.replace("\ufeff", " ")
    text = text.replace("\n", " ")
    text = re.sub(' +', ' ', text)
    chunks = textwrap.wrap(text, 4800, break_long_words=False)
    for chunk in chunks:
        try:
            translated_line = translator.translate(chunk, dest=dest)
            markup = markup.replace(chunk, translated_line.text)
        except Exception as e:
            print(e)
            continue
    return markup, bool(chunks)


for file in subfolders:
    print(f"Translating {file} ..... \n")
    if file.endswith(".txt"):
        # For the paragraph-preserving (slower) variant, call
        # traducere_v1_txt here instead of traducere_v2_txt.
        traducere_v2_txt(translator, file)
        print("{} a fost tradus".format(file))
    elif file.endswith(".html"):
        with open(f"{base_path}/{file}", "r", encoding='utf8', errors='ignore') as open_file:
            data = open_file.read()
        if data == "":
            print("{} este gol".format(file))
            continue

        # Work on the parser-normalized serialization of the document.
        lxml1 = str(BeautifulSoup(data, 'lxml'))
        lxml1 = lxml1.replace("\ufeff", " ")

        if read_tags:
            lxml1 = _translate_known_tags(lxml1, data, translator, input_lang)

        lxml1, found_text = _translate_text_chunks(lxml1, data, translator, input_lang)
        # The previous check compared against [''], a value textwrap.wrap never
        # returns, so this message could not be printed. The output file is
        # still written in that case, as before.
        if not found_text and not read_tags:
            print("No text found")

        file_name = file.replace(".html", "")
        # The output folder is not created anywhere else in the script.
        os.makedirs("Translated_Folder", exist_ok=True)
        with open(f"Translated_Folder/{file_name}_{input_lang}.html", "w", encoding='utf8') as htmlfile:
            htmlfile.write(lxml1)
        print("{} a fost tradus".format(file))
    else:
        # Neither .txt nor .html: nothing in this script can process it.
        print("{} skipped: unsupported extension".format(file))


#dt1 = translator.detect(text)



That's all folks.


Also, see my other Python Scripts ---HERE---

Alatura-te Comunitatii Neculai Fantanaru
The 63 Greatest Qualities of a Leader
Cele 63 de calităţi ale liderului

Why read this book? Because it is critical to optimizing your performance. Because it reveals the main coordinates on which the character and skills of leaders are built, highlighting what is important for them in order to increase their influence.

Leadership - Magic of Mastery
Atingerea maestrului

The essential characteristic of this book in comparison with others on the market in the same domain is that it describes through examples the ideal competences of a leader. I never claimed that it's easy to become a good leader, but if people will...

The Master Touch
Leadership - Magia măiestriei

For some leaders, "leading" resembles a chess game, a game of cleverness and perspicacity; for others it is a game of chance, a game they think they can win every time by risking and betting everything on a single card.

Leadership Puzzle
Leadership Puzzle

I wrote this book, which combines personal development with leadership in a simple way, just like a puzzle in which you have to match all the given pieces in order to recompose the overall image.

Performance in Leading
Leadership - Pe înţelesul tuturor

The aim of this book is to offer you information through concrete examples and to show you how to obtain the capacity to make others see things from the same angle as you.

Leadership for Dummies
Leadership - Pe înţelesul tuturor

Without considering it a concord, the book represents the attempt of an ordinary man - the author - who, through simple words, facts and everyday examples, instills in the ordinary man courage and optimism in his own quest to be his own master and, who knows... maybe even a leader.