U E D R , A S I H C RSS

권영기/web crawler



1.


Python Web Crawler Python , 한 Web Crawler . (툰( , 탑...), 트, ..)


2.


3.



3.2.1.




import urllib
import urllib2

# Fetch one page and save its raw HTML to test1.html (Python 2, urllib2).
req = urllib2.Request('http://9632024.tistory.com/974')
try:
    # Open the URL once and keep the response, instead of probing it here
    # and requesting the same page again for the actual download.
    response = urllib2.urlopen(req)
except urllib2.URLError as e:
    # URLError must be qualified with its module: the bare name is never
    # imported, so the old handler raised NameError instead of reporting.
    print(e.reason)
else:
    # Only write the file when the request succeeded.
    try:
        with open("test1.html", "w") as fo:
            fo.write(response.read())
    finally:
        response.close()

3.2.2. URL


import urllib
import urllib2
import string

# Copy the first double-quoted http(s) URL found on each line of test1.html
# into test2.html, one URL per line.
with open("test1.html", "r") as fo1, open("test2.html", "w") as fo2:
    for line in fo1:
        start = line.find('"http')
        # Compare by value: `is not -1` only worked because CPython happens
        # to cache small integers.
        if start == -1:
            continue
        start += 1  # step over the opening quote
        end = line.find('"', start)
        if end == -1:
            # No closing quote on this line: keep the rest of the line as is.
            fo2.write(line[start:])
        else:
            fo2.write(line[start:end] + "\n")

3.2.3.

import urllib
import urllib2


# Download every URL listed in test2.html (one per line) into the current
# directory, naming each file after the last path segment of its URL.
with open("test2.html", "r") as fo:
    for line in fo:
        # Lines read from a file keep their trailing newline; without
        # stripping it the newline became part of the saved file name.
        url = line.strip()
        if not url:
            continue  # skip blank lines
        name = url.split('/')[-1]
        if not name:
            # URL ends with '/': there is no segment to name the file after.
            continue
        urllib.urlretrieve(url, name)



Python 2.7.2+ (default, Oct  4 2011, 20:03:08) 
[GCC 4.6.1] on linux2
Type "help", "copyright", "credits" or "license" for more information.
>>> first = 1
>>> second = 2
>>> first, second = second, first
>>> print first
2
>>> print second
1
>>> first, second = second, first
>>> third = 3
>>> first, second, third = third, first, second
>>> print first, second, third
3 1 2


3.2.4.


import os

# Work inside the 'folder' sub-directory of the current working directory.
os.chdir('%s/folder' % os.getcwd())
def create_dir(folder):
    """Create a sub-directory of the current working directory if missing.

    ``folder`` is appended verbatim to ``os.getcwd()``, so it must start
    with a path separator (for example ``'/mp3'``). The resulting path is
    printed. An existing directory is left untouched. Returns None.
    """
    mdir = os.getcwd() + folder
    print(mdir)
    # `not ...` instead of `is False`, and the 0o755 octal spelling
    # (rwxr-xr-x), which is valid on Python 2.6+ and Python 3 alike.
    if not os.path.isdir(mdir):
        os.mkdir(mdir, 0o755)

# One sub-directory per file type. Each entry starts with '/' because
# create_dir appends it directly to the current working directory.
# Named `subdirs` rather than `type` so the builtin is not shadowed.
subdirs = ['/mp3', '/jpg', '/txt']
for subdir in subdirs:
    create_dir(subdir)


  • os.chdir(path) - Change the current working directory to path.
  • os.getcwd() - Return a string representing the current working directory.
  • os.path.isdir(path) - Return True if path is an existing directory.
  • os.mkdir(path, mode) - Create a directory named path with numeric mode mode. If the directory already exists, OSError is raised.

    http://docs.python.org/library/os.html
    http://docs.python.org/library/os.path.html#module-os.path

  • mode -
    d -
    r -
    w -
    x -

    d / rwx / r-x / r-x

r(4)w(2)x(1)
755 -> drwxr-xr-x
http://snowbora.com/343

3.2.5. 툰()



import prepare
import os

# Process episodes 1-20 of one Naver webtoon, each inside its own
# sub-directory of ./luckyzzang.
path = os.getcwd() + '/luckyzzang'
if not os.path.isdir(path):
    os.mkdir(path, 0o755)  # rwxr-xr-x

os.chdir(path)
# Base directory that holds the per-episode directories.
currentpath = os.getcwd()

for i in range(1, 21):
    episode = str(i)
    url = 'http://comic.naver.com/webtoon/detail.nhn?titleId=449854&no=' + episode + '&weekday=wed'

    path = currentpath + '/' + episode
    if not os.path.isdir(path):
        os.mkdir(path, 0o755)

    # The prepare helpers are called with bare file names, so switch into
    # the episode directory first.
    os.chdir(path)

    # NOTE(review): `prepare` is not shown here; presumably these fetch the
    # page, extract the image URLs and download them - confirm against it.
    prepare.readpage(url, episode + '.html')
    prepare.extractwt(episode + '.html', episode + 'file.html')
    prepare.download(episode + 'file.html')

3.2.6. wxPython


. GUI 함. ...

3.2.8. Eclipse + PyDev + wxPython + pywin32


1. Eclipse
2. Eclipse, Help > Install New Software > Add > PyDev, http://pydev.org/updates
3. Aptana ,
4. Window > Preferences > PyDev > Interpreter - Python > Auto Config
5. New Folder > /usr/lib/python2.7/dist-packages/wx-2.8-gtk2-unicode

  • Auto Config하 ..

3.3.



Valid XHTML 1.0! Valid CSS! powered by MoniWiki
last modified 2021-02-07 05:28:46
Processing time 0.0457 sec