# -*- coding: utf-8 -*-
'''
Created on 24.07.2009
Паук, v.1.2
Программа выполняет следующие функции:
1) сканирует найденные онлайнером хосты и забивает в базу найденные файлы
2) проверяет их наличие в базе
v 1.1:
1) количество запросов снизилось в 6-7 раз
2) если файл удален на ftp, он удален и в базе
v 1.2
1) переписано 60% кода
2) паук научился нормально забивать русские слова
3) разделены базы для более быстрого поиска
4) добавлены новые форматы
5) исправлены ошибки с удалением несуществующих файлов
6) добавлены обработчики потенциально уязвимых для падения мест
@author: yaroslav
'''
import MySQLdb
from string import lower
from string import find
import urllib2
from urllib import urlretrieve
import time
import ftplib
import datetime
import re
import chardet
def parsequery(s):
    """Build the search mask for a file name.

    The byte string *s* is decoded (UTF-8 when chardet reports UTF-8,
    cp1251 otherwise), split into words of 3-100 Latin/Cyrillic letters or
    digits, and every contiguous fragment of each word that is at least
    three characters long is collected.

    Returns the distinct fragments, in order of first appearance, joined
    with single spaces and encoded as UTF-8.  Returns '' when *s* contains
    no word of three or more characters.
    """
    detected = chardet.detect(s)['encoding']
    # chardet may report None (undetected) or a unicode object; anything
    # that is not UTF-8 is treated as cp1251 ('1251' is a codec alias).
    if str(detected).lower() == 'utf-8':
        encoding = 'utf-8'
    else:
        encoding = '1251'
    text = s.decode(encoding, 'ignore')

    word_re = re.compile(u'[A-Za-z0-9а-яА-Я]{3,100}')

    # A set gives an exact "already emitted" test.  The previous substring
    # search for ' word ' never matched the fragment added last (no trailing
    # space yet), so immediate repeats were emitted twice.
    seen = set()
    fragments = []
    for word in word_re.findall(text):
        # Shrink the word from the right, and for each such prefix drop
        # characters from the left, stopping at three characters.
        for end in range(len(word), 2, -1):
            for start in range(0, end - 2):
                fragment = word[start:end]
                if fragment not in seen:
                    seen.add(fragment)
                    fragments.append(fragment)
    return u' '.join(fragments).encode('utf-8', 'ignore')
def encode1(str):
    """Return *str* with '%', single quotes and double quotes replaced.

    '%' becomes '%1', "'" becomes '%2' and '"' becomes '%3'.  The percent
    sign is handled first so the markers produced for the quotes are not
    themselves rewritten.
    """
    escaped = str
    for raw, marker in (('%', '%1'), ("'", '%2'), ('"', '%3')):
        escaped = escaped.replace(raw, marker)
    return escaped
def mon(month):
    """Map a three-letter English month abbreviation to its number.

    Returns the zero-padded two-digit string ('01' .. '12') for 'Jan' ..
    'Dec' (case-sensitive), and None for anything else.
    """
    names = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
             'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')
    if month in names:
        return '%02d' % (names.index(month) + 1)
    return None
# Extension groups, checked in this order by getgroup().
_EXT_GROUPS = (
    ('video', ('.avi', '.264', '.xvid', '.wtv', '.wmv', '.vid', '.rv',
               '.mv4', '.mpg4', '.mpg', '.mpeg', '.mp4', '.mov', '.mkv',
               '.m4v', '.hdmov', '.flv', '.divx', '.bik', '.3gp')),
    ('music', ('.ac3', '.cda', '.fla', '.flac', '.mp3', '.mpa', '.ogg',
               '.ram', '.wav', '.wma', '.mid', '.aac')),
    ('guitar', ('.gp3', '.gp4')),
    ('sub', ('.srt', '.smi', '.smil', '.s2k', '.ssa', '.ass')),
    ('text', ('.txt', '.log', '.ini', '.cfg')),
    ('excel', ('.xis', '.xl', '.xls', '.xlt', '.sxc')),
    ('doc', ('.doc', '.docx', '.rtf', '.odt')),
    ('exec', ('.com', '.msi', '.exe')),
    ('img', ('.jpg', '.bmp', '.jpeg', '.png', '.gif', '.psd')),
    # One-element tuple: the former ('.pdf') was a plain string, which made
    # the test a substring match ('.', '.p', '.pd' and '' all hit 'pdf').
    ('pdf', ('.pdf',)),
    ('archive', ('.7zip', '.rar', '.zip', '.7-zip', '.bz2', '.gzip',
                 '.jar', '.pak', '.pk3', '.pk4', '.tar', '.gz')),
    ('disk', ('.iso', '.mdf', '.mds', '.nrv', '.nrg', '.dmg')),
)


def getgroup(ext):
    """Return the format group name for a file extension.

    *ext* must include the leading dot and is compared case-sensitively
    against lower-case extensions, so callers pass it lower-cased.
    Returns 'other' when the extension belongs to no known group.
    """
    for group, extensions in _EXT_GROUPS:
        if ext in extensions:
            return group
    return 'other'
def reconnect(host, user, passwd):
    """Replace the global ``ftp`` connection with a fresh one.

    Any existing connection is closed first (best effort).  Up to two
    connection attempts are made, sleeping one second after each failure.

    Returns True when the global ``ftp`` now refers to a logged-in
    connection, False when both attempts failed.
    """
    global ftp
    try:
        ftp.close()
    except Exception:
        # No previous connection (NameError) or an already dead socket:
        # either way there is nothing left to release.
        pass
    for _attempt in range(2):
        try:
            # The 4th positional argument of FTP() is ``acct``, not a port,
            # so the 3-second timeout is passed by keyword.
            ftp = ftplib.FTP(host, user, passwd, timeout=3)
            # FTP() logs in by itself when a user name is given.  Calling
            # login() again would re-authenticate as anonymous, so it is
            # only done for anonymous hosts (same rule as the main loop).
            if user == '':
                ftp.login()
            return True
        except Exception:
            time.sleep(1)
    return False
def mysql_reconnect():
    """Re-open the global MySQL connection ``db`` and its ``cursor``.

    The previous connection, if any, is closed first so it is not leaked.
    Up to three connection attempts are made, sleeping one second after
    each failure.

    Returns True when ``db`` and ``cursor`` were replaced, False when all
    attempts failed.
    """
    global db
    global cursor
    try:
        db.close()
    except Exception:
        # No previous connection (NameError) or one that is already gone.
        pass
    for _attempt in range(3):
        try:
            # NOTE(review): credentials are hard-coded here and in the main
            # loop; consider moving them to configuration.
            db = MySQLdb.connect('192.168.78.103', 'root', 'rfnf255', 'nws')
            cursor = db.cursor()
            return True
        except Exception:
            time.sleep(1)
    return False
# Timestamp stored when the date of a listing line cannot be parsed.
_FALLBACK_DATE = '1976-06-06 06:06:06'


def _parse_dos_listing(line):
    """Parse a listing line that starts with the date (MM-DD-YY HH:MM[AM|PM])."""
    # Split on runs of whitespace instead of fixed columns so that one or
    # two spaces between date and time are both accepted.
    date_token, time_token, rest = (line.split(None, 2) + ['', ''])[:3]
    try:
        month, day, year = date_token.split('-')
        if len(year) == 2:
            # Two-digit years: 00-49 -> 20xx, 50-99 -> 19xx.
            if int(year) < 50:
                year = '20' + year
            else:
                year = '19' + year
        hour, _sep, tail = time_token.partition(':')
        minute = tail[:2]
        meridiem = tail[2:4]
        if meridiem == 'PM':
            # 12:xx PM is already noon; only 01-11 PM move to 13-23.
            if int(hour) != 12:
                hour = str(int(hour) + 12)
        elif meridiem == 'AM' and int(hour) == 12:
            hour = '00'
        date = (year + '-' + month + '-' + day + ' '
                + hour.zfill(2) + ':' + minute + ':00')
    except ValueError:
        # Wrong number of date parts or a non-numeric year/hour.
        date = _FALLBACK_DATE

    if rest.startswith('<DIR>'):
        return [rest[5:].strip(), date, '0', 'dir']

    size_token, _sep, name = rest.partition(' ')
    # A leading '-' is dropped from the size before it is validated.
    size_token = size_token.lstrip('-')
    if size_token.isdigit() and int(size_token):
        size = size_token
    else:
        size = '0'
    return [name.lstrip(), date, size, 'file']


def _parse_unix_listing(line):
    """Parse a listing line that has the date in the middle (after the size)."""
    # With large file sizes the date column shifts, so the fields are
    # skipped token by token rather than by fixed offsets: the first four
    # space-separated fields are dropped, then the size is read.
    rest = line
    for _field in range(4):
        rest = rest[rest.find(' ') + 1:].lstrip()
    size = rest[:rest.find(' ')]
    rest = rest[rest.find(' '):].lstrip()
    stamp = rest[:12]
    name = rest[rest.find(' ', 12) + 1:]

    month = mon(stamp[0:3])
    if stamp[9:10] == ':':
        # 'Mon DD HH:MM' - no year in the listing; the current year is used.
        # NOTE(review): entries from late last year get this year's date.
        day = ('0' + stamp[4:6].strip())[-2:]
        year = str(datetime.date.today().year)
        clock = stamp[7:9] + ':' + stamp[10:12] + ':00'
    else:
        # 'Mon DD  YYYY' - no time of day in the listing.
        day = ('0' + stamp[3:6].strip())[-2:]
        year = stamp[6:12].strip()
        clock = '00:00:00'
    if month is None:
        date = _FALLBACK_DATE
    else:
        date = year + '-' + month + '-' + day + ' ' + clock

    if line[:1] == 'd':
        kind = 'dir'
    else:
        kind = 'file'
    return [name, date, size, kind]


def parseftpfile(file):
    """Parse one line of an FTP directory listing.

    Two layouts are recognised: lines whose first two characters form an
    integer carry the date at the start; all other lines carry it in the
    middle, after the size.

    Returns the list ``[name, date, size, type]`` where *date* is a
    'YYYY-MM-DD HH:MM:SS' string ('1976-06-06 06:06:06' when it cannot be
    parsed), *size* is a string and *type* is 'dir' or 'file'.
    """
    try:
        int(file[:2])
    except ValueError:
        return _parse_unix_listing(file)
    return _parse_dos_listing(file)
def readdir(host, user, passwd, dir, k):
    """Synchronise one FTP directory (and its subdirectories) with the DB.

    The directory is listed over the global ``ftp`` connection, compared
    with the ``filebase`` rows stored for the same host/user/password/dir,
    and then:

    * entries present on the server but not in the DB are inserted into
      ``filebase`` (plus their search mask into ``mask``);
    * subdirectories are processed recursively before their own row is
      inserted;
    * rows whose entry is no longer listed are deleted from both tables.

    Parameters:
        host, user, passwd -- FTP host and credentials; also stored in and
            matched against ``filebase`` rows.
        dir -- remote path of the directory ('' for the starting directory).
        k -- recursion depth; at depth 0 no ``cwd`` is issued.

    Failures are reported through the global ``broken`` flag: when the
    listing or the DB lookup cannot be obtained even after one reconnect,
    ``broken`` is set and nothing is inserted or deleted for this directory.

    NOTE(review): no explicit commit is issued; this relies on the tables
    or connection not needing one - confirm.
    """
    global broken

    # 1. Raw listing, with a single reconnect-and-retry on failure.
    listing = []
    try:
        if k != 0:
            ftp.cwd(dir)
        ftp.dir(listing.append)
    except Exception:
        print('%s no!!!' % dir)
        # Discard a partial listing so the retry cannot duplicate lines.
        listing = []
        if reconnect(host, user, passwd):
            try:
                if k != 0:
                    ftp.cwd(dir)
                ftp.dir(listing.append)
            except Exception:
                broken = True
        else:
            broken = True

    # 2. Entries already stored for this directory (values are encode1()-ed).
    known_sql = ("select filename,ext,format from filebase "
                 "where host = %s and user=%s and password=%s and dir=%s")
    known_args = (host, user, passwd, encode1(dir))
    rows = ()  # stays empty when the lookup fails, instead of being unbound
    try:
        cursor.execute(known_sql, known_args)
        rows = cursor.fetchall()
    except Exception:
        if mysql_reconnect():
            try:
                cursor.execute(known_sql, known_args)
                rows = cursor.fetchall()
            except Exception:
                broken = True
        else:
            broken = True
    # A list (not a set): duplicate rows must stay individually removable.
    files_base = list(rows)

    print('%s %s' % (host, dir))

    # 3. Insert what is new; tick off what is already known.
    if not broken:
        for line in listing:
            # A failure inside one subdirectory must not stop the remaining
            # entries of this directory from being tried.
            broken = False
            name, date, size, kind = parseftpfile(line)
            ext = ''
            filegroup = 'other'
            new = True
            if kind == 'dir':
                if name in ('.', '..'):
                    new = False
                else:
                    filegroup = 'dir'
                    readdir(host, user, passwd, dir + '/' + name, k + 1)
            else:
                dot = name.rfind('.')
                if dot != -1:
                    ext = name[dot:]
                    name = name[:dot]
                    filegroup = getgroup(ext.lower())

            key = (encode1(name), encode1(ext), filegroup)
            if new and key in files_base:
                files_base.remove(key)
                new = False
            if not new:
                continue

            # Next free index.  max() is NULL on an empty table, in which
            # case numbering starts at 1 (previously nothing was inserted).
            try:
                cursor.execute("select max(indx) from filebase")
                highest = cursor.fetchall()[0][0]
            except Exception as exc:
                print('%s/%s: no index available, entry skipped (%s)'
                      % (dir, name, exc))
                continue
            if highest is None:
                indx = 1
            else:
                indx = highest + 1

            try:
                size_value = int(size)
            except ValueError:
                # A non-numeric size means the line was not parsed properly.
                print('%s/%s: bad size %r, entry skipped' % (dir, name, size))
                continue

            print('key')
            try:
                cursor.execute(
                    "Insert into filebase values "
                    "(%s,%s,%s,%s,%s,%s,%s,0,%s,0,%s,%s,0,0)",
                    (indx, host, encode1(dir), encode1(name), date,
                     encode1(ext), size_value, filegroup, user, passwd))
                cursor.execute("Insert into mask values (%s,%s)",
                               (parsequery(name), indx))
            except Exception as exc:
                print('%s/%s: insert failed (%s)' % (dir, name, exc))

    # 4. Whatever is left in files_base is no longer on the server.
    if files_base and not broken:
        for stale in files_base:
            # stale[0] / stale[1] come from the DB and are already encoded,
            # so they must not be passed through encode1() again.
            if stale[2] != 'dir':
                stale_sql = ("select indx from filebase where host = %s "
                             "and user=%s and password=%s and dir=%s "
                             "and filename=%s and ext=%s and format=%s "
                             "LIMIT 0,1")
                stale_args = (host, user, passwd, encode1(dir),
                              stale[0], stale[1], stale[2])
            else:
                # Removes the directory row and everything below it.
                # NOTE(review): the prefix match also hits siblings whose
                # name starts with this one, and '%'/'_' in the encoded path
                # act as LIKE wildcards.
                stale_sql = ("select indx from filebase where host = %s "
                             "and user=%s and password=%s "
                             "and concat(dir,'/',filename,ext) like %s")
                stale_args = (host, user, passwd,
                              encode1(dir) + '/' + stale[0] + '%')
            try:
                print('%s %r' % (stale_sql, stale_args))
                cursor.execute(stale_sql, stale_args)
                for row in cursor.fetchall():
                    cursor.execute("delete from filebase where indx = %s",
                                   (row[0],))
                    cursor.execute("delete from mask where indx = %s",
                                   (row[0],))
            except Exception as exc:
                print('%s: cleanup failed (%s)' % (dir, exc))

    if broken:
        print('---------------')
        print('host is broken')
    time.sleep(0.1)
# Main loop: repeatedly read the list of online hosts and crawl each one.

# The first pass only takes hosts whose IP compares >= this value; it is
# cleared once that pass has fetched its host list.
# NOTE(review): the comparison is done by SQL on the IP column as stored;
# if that column is text the ordering is lexical, not numeric.
Startpoint = '192.168.70.141'
n = 0
# n is never advanced, so the crawl repeats until the process is stopped.
while n < 1:
    # mysql_reconnect() sets the globals ``db`` and ``cursor`` and already
    # retries on its own.
    if not mysql_reconnect():
        time.sleep(1)
        continue

    if Startpoint != '':
        host_sql = "Select * from online where IP >= %s"
        host_args = (Startpoint,)
    else:
        host_sql = "Select * from online"
        host_args = None
    try:
        cursor.execute(host_sql, host_args)
        answer = cursor.fetchall()
    except Exception:
        # Never fall through with an unbound or stale host list.
        answer = ()
    else:
        # Only forget the resume point once a host list was really read.
        Startpoint = ''

    for item in answer:
        # Row layout used here: [1] host address, [5] FTP user, [6] password.
        host, user, passwd = item[1], item[5], item[6]
        broken = True
        for _attempt in range(3):
            try:
                # 4th positional argument of FTP() is ``acct``, not a port;
                # the 3-second timeout is therefore passed by keyword.
                ftp = ftplib.FTP(host, user, passwd, timeout=3)
                # FTP() logs in itself when a user name is given.
                if user == '':
                    ftp.login()
                broken = False
                break
            except Exception:
                pass
        if broken:
            print('%s: FTP connection failed' % host)
            continue
        readdir(host, user, passwd, '', 0)
        try:
            # Release the connection before moving on to the next host.
            ftp.close()
        except Exception:
            pass

    try:
        db.close()
    except Exception:
        pass
    if not answer:
        # Nothing to crawl (or the query failed): avoid a busy loop.
        time.sleep(1)