Bjoern Olausson

monitor_website_links.py
Monday, 23 May 2011 18:25

I recently wanted to check a website on a regular basis for new links that match a regular expression and, for convenience, receive the results via e-mail.

Thanks to Python, it took me just a few minutes to hack together this little script, which monitors a website for new links matching a regular expression.

Click "Read more..." to see and download the script.


 

monitor_website_links.py Version:1.0
Monitor a website for new links which match a regular expression and receive the results via e-mail.
 GNU/GPL    2011-05-23   English   Linux  4.37 KB  232

 

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
#--------------------------------------------------------------------------------
#monitor_website_links.py v1.1, Copyright Bjoern Olausson
#--------------------------------------------------------------------------------
#This program is free software; you can redistribute it and/or modify
#it under the terms of the GNU General Public License as published by
#the Free Software Foundation; either version 2 of the License, or
#(at your option) any later version.
#
#This program is distributed in the hope that it will be useful,
#but WITHOUT ANY WARRANTY; without even the implied warranty of
#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#GNU General Public License for more details.
#
#To view the license visit
#http://www.gnu.org/licenses/old-licenses/gpl-2.0.html
#or write to
#Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
#--------------------------------------------------------------------------------
#--------------------------------------------------------------------------------
import time, random, mechanize, sys, smtplib, os, re
from datetime import timedelta, datetime
from email.MIMEMultipart import MIMEMultipart
from email.MIMEBase import MIMEBase
from email.MIMEText import MIMEText
from email import Encoders
 
# ---------------------------------------------------------------------------
# User configuration
# ---------------------------------------------------------------------------
# URL to monitor
URL = "http://www.example.com/"
# Compiled regular expression that is searched for in the link text
# (case-insensitive)
SEARCH_FOR_REGEXP = re.compile(r"RFC", re.I)
# Every match of this regex is removed from a found link's URL, e.g. to strip
# a session ID.
# NOTE(review): ".*" is greedy, so everything from the first "s=" up to the
# LAST "&" is removed - confirm this is what you want for your site.
REMOVE_FROM_URL = re.compile(r"s=.*&")
# Run the script for X days
MONITOR_FOR_DAYS = 30
# How long to sleep between checks (in seconds)
SLEEP_SECONDS_BETWEEN_CHECKS = 60*30
# Set to False if you do not want to store the results in a file
# Otherwise enter a path to a file e.g. "/home/USER/search_results.txt"
SAVE_TO = False
# Set to False if you do not want to mail the results
# Otherwise enter your mail address e.g. "user@example.com" (placeholder)
MAIL_TO = False
# GMAIL username and password (used to log in to smtp.gmail.com)
gmail_user = ""
gmail_pwd = ""
 
def mail(text, attach="false"):
	'''Send ``text`` to MAIL_TO through GMail's SMTP server.

	Based on
	http://kutuma.blogspot.com/2007/08/sending-emails-via-gmail-with-python.html

	Parameters:
	  text   -- body of the mail. An exception instance is accepted as well
	            and is converted to its string representation.
	  attach -- path of a file to attach. The default "false" (as well as
	            None, False or an empty string) means "no attachment".

	Nothing is sent when MAIL_TO is not configured (False/empty).
	Errors raised while reading the attachment or by smtplib are propagated
	to the caller.
	'''
	# Without a recipient there is nothing to do; this also avoids a
	# pointless login attempt with empty credentials.
	if not MAIL_TO:
		return

	# Callers hand over caught exceptions directly; MIMEText needs a string.
	if isinstance(text, Exception):
		text = str(text)

	msg = MIMEMultipart()
	msg['From'] = gmail_user
	msg['To'] = MAIL_TO
	msg['Subject'] = "Found the following for your search"
	msg.attach(MIMEText(text))

	if attach and attach != "false":
		part = MIMEBase('application', 'octet-stream')
		# Close the attachment file deterministically after reading it.
		with open(attach, 'rb') as attachment:
			part.set_payload(attachment.read())
		Encoders.encode_base64(part)
		part.add_header('Content-Disposition',
			'attachment; filename="%s"' % os.path.basename(attach))
		msg.attach(part)

	mailServer = smtplib.SMTP("smtp.gmail.com", 587)
	try:
		mailServer.ehlo()
		mailServer.starttls()
		# A second EHLO is sent after switching to TLS.
		mailServer.ehlo()
		mailServer.login(gmail_user, gmail_pwd)
		mailServer.sendmail(gmail_user, MAIL_TO, msg.as_string())
	finally:
		# Should be mailServer.quit(), but the original author reported that
		# it crashes, so the connection is simply closed - also on errors.
		mailServer.close()
 
# ---------------------------------------------------------------------------
# Main script: poll URL until MONITOR_FOR_DAYS have passed and report every
# change in the set of links whose text matches SEARCH_FOR_REGEXP.
# ---------------------------------------------------------------------------

# The result file is only opened when saving is enabled; it is closed in the
# ``finally`` clause at the end of the script.
f = None
if SAVE_TO:
	f = open(SAVE_TO, 'w')

# Prefix for the found links: URL up to (not including) its last "/".
# If the only slashes belong to the "scheme://" part (or there is none at
# all), URL has no path and is used unchanged - plain slicing would yield
# e.g. "http:/" for "http://www.example.com".
_last_slash = URL.rfind("/")
_scheme_end = URL.find("://")
if _last_slash == -1 or (_scheme_end != -1 and _last_slash <= _scheme_end + 2):
	base_url = URL
else:
	base_url = URL[:_last_slash]

start_time = datetime.now()
stop_time = start_time + timedelta(days=int(MONITOR_FOR_DAYS))

br = mechanize.Browser()
br.set_handle_gzip(False)
br.set_handle_referer(True)
br.set_handle_redirect(True)
br.set_handle_equiv(True)
br.set_handle_robots(False)
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

# User-Agent strings; one of them is picked at random for every request.
UA_DICT = {
	"OPERA": "Opera/9.80 (X11; Linux x86_64; U; Opera Next; en) Presto/2.8.131 Version/11.50 Gentoo",
	"KONQUEROR": "Mozilla/5.0 (compatible; Konqueror/4.5; FreeBSD) KHTML/4.5.4 (like Gecko)",
	"MICROB": "Mozilla/5.0 (X11; U; Linux armv7l; en-GB; rv:1.9.2a1pre) Gecko/20090514 Firefox/3.0 Tablet browser 0.9.7 RX-34",
}

# Last reported result; used to report only when the matching links change.
MESSAGE = ""
try:
	while datetime.now() < stop_time:
		br.addheaders = [('User-agent', random.choice(list(UA_DICT.values())))]
		try:
			br.open(URL)
			# Collect the links inside the try block so that errors raised
			# while the page is parsed are reported as well.
			LINKS_FOUND = list(br.links(text_regex=SEARCH_FOR_REGEXP))
		except Exception as e:
			# Report the problem once and retry after the regular sleep
			# instead of searching a page that could not be loaded.
			print(e)
			if MAIL_TO:
				mail(str(e))
		else:
			# One "text\nurl" entry per link, with REMOVE_FROM_URL matches
			# stripped from the link's URL.
			LINKTEXT = ["\n".join([LINK.text, base_url+"/"+REMOVE_FROM_URL.sub("", LINK.url)]) for LINK in LINKS_FOUND]
			RESULT = "\n".join(LINKTEXT)
			if LINKTEXT and RESULT != MESSAGE:
				MESSAGE = RESULT
				if f is not None:
					f.write(str(datetime.now())+"\n"+MESSAGE+"\n\n")
					f.flush()
				if MAIL_TO:
					mail(str(datetime.now())+"\n"+MESSAGE)
		print("sleeping: %s" % SLEEP_SECONDS_BETWEEN_CHECKS)
		time.sleep(float(SLEEP_SECONDS_BETWEEN_CHECKS))
finally:
	# Close the result file even if the script is interrupted or crashes.
	if f is not None:
		f.close()
 
Last Updated ( Thursday, 09 June 2011 11:09 )
 

Add comment


Security code
Refresh

Comments

Qt Ambassador

Qt Ambassador

www. is deprecated

Banner

Play OGG

Banner

Gixen

web2sms

Banner