You can view the full code here: https://pastebin.com/eBmEEFkR
Install Python.
VARIANT 1: The code below deletes every .html/.htm file in the given folder whose content is shorter than 250 characters.
import os
import re
import random

import unidecode
import nltk
from nltk import tokenize
# nltk.download('punkt')
import requests
from usp.tree import sitemap_tree_for_homepage


def read_text_from_file(file_path):
    """Return the content of a file as text.

    Args:
        file_path: Path of the file to read. The file is decoded as UTF-8.

    Returns:
        The whole file content as one string.
    """
    # The ``with`` block closes the file on exit; no explicit close() needed.
    with open(file_path, encoding='utf8') as f:
        return f.read()


def delete_short_html_files(folder, min_length=250):
    """Delete the .html/.htm files in ``folder`` that are too short.

    A file is deleted when its decoded text has fewer than ``min_length``
    characters. Only the entries directly inside ``folder`` are examined.

    Args:
        folder: Directory to scan.
        min_length: Minimum number of characters a file must contain to be
            kept.

    Returns:
        The number of files that were deleted.
    """
    deleted = 0
    for name in os.listdir(folder):
        if not name.endswith(('.html', '.htm')):
            continue
        filepath = os.path.join(folder, name)
        # os.listdir() also returns sub-directories; a directory whose name
        # ends in .html/.htm cannot be read as a file, so skip it.
        if not os.path.isfile(filepath):
            continue
        if len(read_text_from_file(filepath)) < min_length:
            os.remove(filepath)
            deleted += 1
    return deleted


FOLDER_LOCAL = 'd:\\Folder1'

counter_sterse = delete_short_html_files(FOLDER_LOCAL)
# The message is Romanian for "{} files were deleted".
print("S-au sters {} fisiere".format(counter_sterse))
VARIANT 2: The code below deletes every .html/.htm file in which the text between the following two markers is shorter than 2000 characters (the threshold used in the code):
<-- START -->
words words words ...etc
<-- FINAL -->
You can view the full code here: https://pastebin.com/au4XD6ce
import os
import re
import random

import unidecode
import nltk
from nltk import tokenize
# nltk.download('punkt')
import requests
from usp.tree import sitemap_tree_for_homepage


def read_text_from_file(file_path):
    """Return the content of a file as text.

    Args:
        file_path: Path of the file to read. The file is decoded as UTF-8.

    Returns:
        The whole file content as one string.
    """
    # The ``with`` block closes the file on exit; no explicit close() needed.
    with open(file_path, encoding='utf8') as f:
        return f.read()


def write_to_file(text, file_path):
    """Write a text to a file, replacing any previous content.

    Args:
        text: The text to write. It is encoded as UTF-8; characters that
            cannot be encoded are dropped (``'ignore'`` error handler).
        file_path: Path of the file to write.
    """
    with open(file_path, 'wb') as f:
        f.write(text.encode('utf8', 'ignore'))


FOLDER_LOCAL = 'd:\\Folder1'

# Captures (non-greedily) everything between the START and FINAL markers.
# Raw string: "\s" / "\S" in a normal literal are invalid escape sequences.
page_text_pattern = re.compile(r'<-- START -->([\s\S]*?)<-- FINAL -->')


def delete_files_with_short_section(folder, min_length=2000):
    """Delete .html/.htm files whose marked section is too short.

    For every .html/.htm file directly inside ``folder`` the text between
    the first ``<-- START -->`` / ``<-- FINAL -->`` pair is extracted; the
    file is deleted when that text has fewer than ``min_length`` characters.

    Args:
        folder: Directory to scan.
        min_length: Minimum number of characters the marked section must
            contain for the file to be kept.

    Returns:
        The number of files that were deleted.
    """
    deleted = 0
    for name in os.listdir(folder):
        if not name.endswith(('.html', '.htm')):
            continue
        filepath = os.path.join(folder, name)
        # os.listdir() also returns sub-directories; a directory whose name
        # ends in .html/.htm cannot be read as a file, so skip it.
        if not os.path.isfile(filepath):
            continue
        match = page_text_pattern.search(read_text_from_file(filepath))
        if match is None:
            # NOTE(review): the original paste lost its indentation, so it is
            # unclear whether files without the markers were meant to be
            # deleted. They are left untouched here (the non-destructive
            # reading) -- confirm against the intended behavior.
            continue
        if len(match.group(1)) < min_length:
            os.remove(filepath)
            deleted += 1
    return deleted


counter_sterse = delete_files_with_short_section(FOLDER_LOCAL)
# The message is Romanian for "{} files were deleted".
print("S-au sters {} fisiere".format(counter_sterse))
That's all folks.
If you like my code, then do me a favor: translate your website into Romanian, "ro".
Also, there are other versions of this code: VERSION 2, VERSION 3, VERSION 4, VERSION 5 and VERSION 6.