import os
import re
import codecs
import mmap
from HTMLParser import HTMLParser
DEFAULT_BLOCK_SIZE = 7500

# Wikipedia XML dump to process (the original listing pointed at
# https://dumps.wikimedia.org/itwiki/20151202/ as its source).
INPUT_PATH = "wiki.xml"
TEXT_OUTPUT_PATH = "f_wiki.txt"
CONTROL_OUTPUT_PATH = "control.txt"

# Opening tag of a text element: bare, with attributes, or self-closing.
# "[^<>]*" keeps a match from ever spanning another "<", which is what makes
# the chunk-boundary handling in extract_text_sections() safe.
_TEXT_OPEN_RE = re.compile(br"<text(?:\s[^<>]*)?>")
_TEXT_CLOSE = b"</text>"


def extract_text_sections(source, text_out, control_out, unescape,
                          block_size=DEFAULT_BLOCK_SIZE):
    """Split an XML byte stream into <text> bodies and everything else.

    The stream is read in chunks of ``block_size`` bytes.  The body of each
    ``<text ...>...</text>`` element is decoded as UTF-8, passed through
    ``unescape`` and written to ``text_out``.  Every other byte (markup
    outside the bodies, including the opening and closing text tags) is
    written unchanged to ``control_out``.  Tags that straddle a chunk
    boundary are handled by holding back the undecided tail of each chunk.

    Args:
        source: Binary file-like object; ``read(n)`` must return bytes and
            an empty value at end of input.
        text_out: Object whose ``write`` accepts the (unicode) text that
            ``unescape`` returns.
        control_out: Object whose ``write`` accepts raw bytes.
        unescape: Callable mapping a decoded body to the text to store
            (e.g. an HTML entity unescaper).
        block_size: Number of bytes requested per read.

    Returns:
        True when the input ended outside a text element, False when it
        ended inside an unterminated one.  In the latter case the partial
        body is written to ``control_out`` instead of ``text_out``.

    Raises:
        UnicodeDecodeError: If a text body is not valid UTF-8.
    """
    pending = b""     # bytes read but not yet classified
    body = []         # pieces of the text body currently being collected
    inside = False    # True while between an opening and a closing tag
    consumed = 0      # total bytes read so far (used for diagnostics)

    while True:
        chunk = source.read(block_size)
        consumed += len(chunk)
        pending += chunk

        # Consume every complete tag currently available in the buffer.
        while True:
            if inside:
                close_at = pending.find(_TEXT_CLOSE)
                if close_at < 0:
                    break
                body.append(pending[:close_at])
                raw = b"".join(body)
                try:
                    decoded = raw.decode("utf-8")
                except UnicodeDecodeError:
                    offset = consumed - len(pending) + close_at
                    print("Invalid UTF-8 in the <text> body ending at "
                          "byte offset %d" % offset)
                    raise
                text_out.write(unescape(decoded))
                control_out.write(_TEXT_CLOSE)
                pending = pending[close_at + len(_TEXT_CLOSE):]
                body = []
                inside = False
            else:
                opening = _TEXT_OPEN_RE.search(pending)
                if opening is None:
                    break
                control_out.write(pending[:opening.end()])
                pending = pending[opening.end():]
                # A self-closing <text ... /> has no body to collect.
                inside = not opening.group().endswith(b"/>")

        if not chunk:
            break  # end of input

        if inside:
            # Hold back only what could be the start of a split "</text>".
            keep = len(_TEXT_CLOSE) - 1
            safe = max(len(pending) - keep, 0)
            body.append(pending[:safe])
            pending = pending[safe:]
        else:
            # Hold back a trailing tag that has not been closed yet; it may
            # turn out to be a <text ...> tag once more data arrives.
            cut = pending.rfind(b"<")
            if cut < 0 or b">" in pending[cut:]:
                cut = len(pending)
            control_out.write(pending[:cut])
            pending = pending[cut:]

    if inside:
        # Unterminated element: keep the data, but flag the run as incomplete.
        body.append(pending)
        control_out.write(b"".join(body))
        return False

    control_out.write(pending)
    return True


def main():
    """Extract the text bodies of INPUT_PATH into TEXT_OUTPUT_PATH.

    Everything that is not part of a text body goes to CONTROL_OUTPUT_PATH,
    written as raw bytes so that non-ASCII markup is copied verbatim.
    """
    unescape = HTMLParser().unescape
    with open(INPUT_PATH, "rb") as source, \
            codecs.open(TEXT_OUTPUT_PATH, "w", "utf-8") as text_out, \
            open(CONTROL_OUTPUT_PATH, "wb") as control_out:
        complete = extract_text_sections(source, text_out, control_out,
                                         unescape)
    if not complete:
        # NOTE: under Python 2 this non-ASCII literal needs a source
        # encoding declaration in the first two lines of the file.
        print("Il programma è terminato senza completare l'elaborazione del file!")


if __name__ == "__main__":
    main()