Skip to content

Commit 51b7de2

Browse files
committed
add crawler for open data api
1 parent b108b8f commit 51b7de2

File tree

1 file changed

+353
-0
lines changed

1 file changed

+353
-0
lines changed

data/crawler/getInfo.py

Lines changed: 353 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,353 @@
1+
2+
#-*- coding: utf-8 -*-
3+
4+
5+
import sys
6+
import pandas as pd
7+
import requests
8+
import urllib.request
9+
from bs4 import BeautifulSoup
10+
import pymysql
11+
import time
12+
import os
13+
14+
# Blank out the lowercase http_proxy variable before any request is issued.
# NOTE(review): presumably this keeps the urllib calls below from being routed
# through a proxy configured in the environment — confirm this is still wanted.
os.environ['http_proxy']=''
15+
16+
17+
18+
19+
20+
21+
def getaptdata(address, yearmonth, key, target, retry_delay=100):
    """Download records for one region/period from the open API and split them.

    Args:
        address: Region code. Sent as ``LAWD_CD`` for the trade/rent targets
            and as ``region`` for the ``*index`` targets.
        yearmonth: Sent as ``DEAL_YMD`` for the trade/rent targets. For the
            ``*index`` targets "01" and "12" are appended to it to build the
            ``startmonth`` / ``endmonth`` query parameters.
        key: Service key, appended verbatim as ``serviceKey``.
        target: One of "apt", "land", "rent", "villa", "villarent", "studio",
            "studiorent", "single", "landindex", "aptindex", "rentindex".
        retry_delay: Seconds to sleep before the single retry that follows a
            failed request (defaults to the original 100).

    Returns:
        A list with one entry per ``<item>`` element of the response. Each
        entry is the item's text with newlines removed, split on ">".

    Raises:
        ValueError: If ``target`` is not one of the supported names. (The
            original fell through with ``url`` unbound and failed later.)
    """
    molit = ("http://openapi.molit.go.kr"
             "/OpenAPI_ToolInstallPackage/service/rest/RTMSOBJSvc/")
    molit_8081 = ("http://openapi.molit.go.kr:8081"
                  "/OpenAPI_ToolInstallPackage/service/rest/RTMSOBJSvc/")
    kab = "http://openapi.kab.co.kr/OpenAPI_ToolInstallPackage/service/rest/"

    # Endpoints queried with LAWD_CD / DEAL_YMD. Some are served on port 8081
    # and some on the default port, exactly as in the original URLs.
    trade_endpoints = {
        "apt": molit_8081 + "getRTMSDataSvcAptTrade",
        "land": molit + "getRTMSDataSvcLandTrade",
        "rent": molit_8081 + "getRTMSDataSvcAptRent",
        "villa": molit_8081 + "getRTMSDataSvcRHTrade",
        "villarent": molit_8081 + "getRTMSDataSvcRHRent",
        "studio": molit + "getRTMSDataSvcOffiTrade",
        "studiorent": molit + "getRTMSDataSvcOffiRent",
        "single": molit_8081 + "getRTMSDataSvcSHTrade",
    }
    # Endpoints queried with startmonth / endmonth / region.
    index_endpoints = {
        "landindex": kab + "LfrPrcIndexSvc/getLfrPrcIndex",
        "aptindex": kab + "AptRealPrcIndexSvc/getAptRealPrcIndex",
        "rentindex": kab + "RentPrcIndexSvc/getRentPrcIndex",
    }

    if target in trade_endpoints:
        url = (trade_endpoints[target]
               + "?LAWD_CD=" + str(address)
               + "&DEAL_YMD=" + str(yearmonth)
               + "&serviceKey=" + key)
    elif target in index_endpoints:
        startmonth = str(yearmonth) + "01"
        endmonth = str(yearmonth) + "12"
        url = (index_endpoints[target]
               + "?startmonth=" + startmonth
               + "&endmonth=" + endmonth
               + "&region=" + str(address)
               + "&serviceKey=" + key)
    else:
        raise ValueError("unsupported target: %r" % (target,))

    print(url)
    try:
        response = urllib.request.urlopen(url)
    except Exception as e:
        # Deliberate best-effort behaviour kept from the original: report the
        # failure, wait, and try exactly once more (a second failure raises).
        print('Fail ' + str(e))
        time.sleep(retry_delay)
        response = urllib.request.urlopen(url)

    # The context manager closes the response even if read()/decode() raises.
    with response:
        payload = response.read().decode("utf8")

    soup = BeautifulSoup(payload, "lxml")
    return [item.get_text().replace('\n', '').split(">")
            for item in soup.find_all("item")]
62+
63+
def _parse_index_header(head):
    """Split the first row's key into (area code, area name, year, month).

    The key is laid out as <5-char area code><area name><YYYY><MM>. The width
    of the area name depends on the area code.
    """
    area_code = head[:5]
    if area_code == "A2000":
        # Original comment: Seoul metropolitan area ("수도권"), 3-character name.
        name_end = 8
    elif area_code in ("11A01", "11A02"):
        # Original comment: Gangbuk area / Gangnam area, 4-character name.
        name_end = 9
    else:
        name_end = 7
    return (area_code,
            head[5:name_end],
            head[name_end:name_end + 4],
            head[name_end + 4:name_end + 6])


def updateIndexTable(aptdata, conn, target, code):
    """Insert price-index values parsed from API items into ``priceIndex``.

    Only the first element of each record is used. It is split on "|" into
    rows of the form "<key>,<index value>". The first row's key carries the
    area code, area name, year and month; every later row's key is "YYYYMM".
    One row is inserted per index value, dated the first day of that month.

    Args:
        aptdata: Iterable of records as returned by ``getaptdata``.
        conn: Open pymysql connection; committed once after all inserts.
        target: Index type stored in the ``type`` column. Nothing is inserted
            unless the string contains "index".
        code: Kept for interface compatibility. As in the original, the area
            code that is stored is re-derived from each record's first row.

    Rows that carry no index value (for example an empty segment produced by
    a stray "|") are reported and skipped. Previously they raised an
    IndexError that the pymysql handler below does not catch, aborting the
    whole batch before the commit.
    """
    insert_sql = """insert into priceIndex (`year`, `month`, `date`, `areacode`, `areacity`, `indexvalue`, `type`) VALUES ( %s, %s, %s, %s, %s, %s, %s)"""
    try:
        with conn.cursor() as curs:
            if "index" in target:
                for record in aptdata:
                    print(record)
                    for pos, row in enumerate(str(record[0]).split('|')):
                        units = row.split(',')
                        ym = units[0]
                        print(ym)
                        if pos == 0:
                            area_code, area, year, month = _parse_index_header(ym)
                        else:
                            year = ym[:4]
                            month = ym[4:6]
                        if len(units) < 2:
                            print("skip index row without value: %r" % (row,))
                            continue
                        date = str(year) + "-" + str(month) + "-01"
                        params = (str(year), str(month), str(date),
                                  area_code, area, str(units[1]), target)
                        curs.execute(insert_sql, params)
                        print(insert_sql, params)
        conn.commit()
    except pymysql.InternalError as error:
        # Distinct names so the error number does not shadow the `code` argument.
        errno, message = error.args
        print(errno)
        print(message)
104+
105+
def updateBasicTable(aptdata, conn):
    """Print every record and its field count, then commit the connection.

    No SQL statement is issued here; a DictCursor is only opened and closed
    around the loop. A pymysql.InternalError is reported by printing its
    error number and message instead of being propagated.
    """
    try:
        with conn.cursor(pymysql.cursors.DictCursor):
            for record in aptdata:
                print(record)
                print(len(record))
        conn.commit()
    except pymysql.InternalError as error:
        errno, message = error.args
        for part in (errno, message):
            print(part)
116+
117+
def _clean_amount(text):
    """Trim surrounding whitespace and drop thousands separators."""
    return text.strip().replace(',', '')


def checkfield(field, landtype):
    """Translate one raw field into a ``(column name, value)`` pair.

    A field is a value immediately followed by its Korean label. The label
    is recognised by substring match and then cut off by its length. The
    first matching rule wins, so longer labels must be tested before any
    shorter label they contain.

    Args:
        field: Raw field text, e.g. "82,500거래금액".
        landtype: Crawl target. Decides whether "단지" / "아파트" are treated
            as a name label, and whether "법정동" maps to "area" (for "apt")
            or "areadong" (everything else).

    Returns:
        None for an empty field; ("error", "error") when no rule matches;
        ("check field error", "check field error") when evaluating the rules
        raised; otherwise the (column name, value) pair.
    """
    try:
        if len(field) == 0:
            return None

        if "거래금액" in field:
            return ("price", _clean_amount(field[:-4]))
        # "보증금액" contains "보증금", so it has to be checked first. The
        # original order made this branch unreachable and stripped only three
        # characters, leaving a stray "보" in the value.
        if "보증금액" in field:
            return ("deposit", _clean_amount(field[:-4]))
        if "보증금" in field:
            return ("deposit", _clean_amount(field[:-3]))
        if "월세금액" in field:
            return ("rentprice", _clean_amount(field[:-4]))
        if "월세" in field:
            return ("rentprice", _clean_amount(field[:-2]))
        if "건축년도" in field:
            return ("constructionyear", field[:-4])
        # Four characters plus the "년" label.
        if len(field) == 5 and "년" in field:
            return ("year", field[:-1])
        if "단지" in field and landtype in ("studio", "studiorent"):
            return ("name", field[:-2].strip())
        if "아파트" in field and landtype in ("apt", "rent"):
            return ("name", field[:-3].strip())
        if "연립다세대" in field:
            return ("name", field[:-5].strip())
        if "법정동" in field:
            area = field[:-3].strip()
            return ("area", area) if landtype == "apt" else ("areadong", area)
        if "시군구" in field:
            return ("areagu", field[:-3])
        if len(field) in (2, 3) and "월" in field:
            return ("month", field[:-1])
        day_ranges = ("1~10일", "11~20일", "21~30일",
                      "21~28일", "21~29일", "21~31일")
        if any(label in field for label in day_ranges):
            return ("day", field[:-1])
        if "거래면적" in field:
            return ("landarea", _clean_amount(field[:-4]))
        if "대지권면적" in field:
            return ("landrightarea", field[:-5])
        if "대지면적" in field:
            return ("landarea", field[:-4])
        if "연면적" in field:
            return ("totalgroundarea", field[:-3])
        if "주택유형" in field:
            return ("housetype", field[:-4])
        if "전용면적" in field:
            return ("exclusiveusearea", field[:-4])
        if "지역코드" in field:
            return ("areacode", field[:-4])
        if len(field) in (2, 3) and "층" in field:
            return ("floor", field[:-1])
        if "지번" in field:
            return ("lotnumber", field[:-2])
        if "구분" in field:
            return ("shares", field[:-2])
        if "지목" in field:
            return ("category", field[:-2])
        if "용도지역" in field:
            return ("subcategory", field[:-4])
        if "건축유형" in field:
            return ("housetype", field[:-4])

        # The original passed `field` as a second print argument, so the
        # "%s" placeholder was printed literally.
        print("out of scope:%s" % (field,))
        return ("error", "error")
    except Exception as e:
        # Callers unpack a 2-tuple, so failures are reported as a marker pair.
        print('Fail ' + str(e))
        return ("check field error", "check field error")
191+
192+
193+
def getDate(year, month, day, interval):
    """Build a "year-month-day" string from a trade's date parts.

    Args:
        year: Year as a string or int.
        month: Month as a string or int; used as given (no zero padding).
        day: Day, or a "start~end" range of which only the start is used.
        interval: When equal to "365" (string or int) the year is moved
            forward by one; any other value leaves the year unchanged.

    Returns:
        The date as "<year>-<month>-<day>".
    """
    first_day = str(day).split('~', 1)[0]
    # Compare and join via str() so integer arguments behave like their string
    # forms instead of raising TypeError / being silently ignored.
    if str(interval) == "365":
        year = int(year) + 1
    return "-".join((str(year), str(month), first_day))
199+
200+
201+
202+
203+
204+
def updateTable(aptdata, conn, target):
    """Convert raw records into column/value entries and store them.

    Every non-empty field of a record is passed through ``checkfield``.
    Records containing a field that ``checkfield`` could not map ("error")
    or failed on ("check field error") are reported and skipped. The
    connection is committed once after all records have been handled.

    Args:
        aptdata: Iterable of records as returned by ``getaptdata``.
        conn: Open pymysql connection.
        target: Crawl target, forwarded to ``checkfield``.

    NOTE(review): the visible code executes no SELECT or INSERT statement;
    only the "insert query here" marker remains. Presumably the queries were
    removed before publishing — they must be restored for this function to
    store anything.
    """
    try:
        with conn.cursor() as curs:
            for record in aptdata:
                print("\n")
                print(record)
                entry = {}
                for field in record:
                    if len(field) == 0:
                        continue
                    fieldname, value = checkfield(field, target)
                    entry[fieldname] = value
                if 'error' in entry or 'check field error' in entry:
                    # Interpolate the record; the original printed a literal "%s".
                    print("Data error:%s" % (record,))
                    continue

                # Fetch data
                print(curs.rowcount)
                # _last_executed is a private cursor attribute that may not
                # exist; reading it directly raised AttributeError and aborted
                # the whole batch through the generic handler below.
                print(getattr(curs, "_last_executed", None))
                curs.fetchall()
                if curs.rowcount == 0:
                    # insert query here
                    print(getattr(curs, "_last_executed", None))
        conn.commit()
    except pymysql.InternalError as error:
        errno, message = error.args
        print(errno)
        print(message)
    except Exception as e:
        print('Update Table Data Error:' + str(e))
234+
235+
236+
def getQuery(entry, target):
    """Build a parameterised INSERT statement for one entry.

    Args:
        entry: Mapping of column name to value; iteration order determines
            both the column order and the order of the returned values.
        target: Not used; the table name is the fixed placeholder
            "tablename".

    Returns:
        ``(sql, values)`` where ``sql`` has one backtick-quoted column and one
        ``%s`` placeholder per entry key, and ``values`` is a tuple holding
        the ``str()`` of each entry value. Both are also printed.
    """
    columns = ",".join("`" + name + "`" for name in entry)
    placeholders = ",".join(["%s"] * len(entry))
    insertvalue = ("insert into " + "tablename" + "(" + columns + ")"
                   + " VALUES (" + placeholders + ")")
    datavalues = tuple(str(entry[name]) for name in entry)
    print(insertvalue)
    print(datavalues)
    return (insertvalue, datavalues)
253+
254+
255+
def getDataFrame(aptdata):
    """Turn raw records into a DataFrame with eleven string columns.

    For each record the first eleven fields are taken and a fixed number of
    trailing characters is cut from each (4, 4, 1, 3, 3, 1, 1, 4, 2, 4, 1).
    The results are assigned, by position, to the columns
    '일', '거래금액', '법정동', '년', '월', '건축년도', '전용면적', '아파트',
    '지번', '층', '지역코드'.

    The original built the frame from a dict and then overwrote the column
    labels positionally, so which data ended up under which label depended
    on how pandas ordered the dict keys. The frame is now built positionally
    with explicit columns, which matches the insertion-order behaviour.

    NOTE(review): the cut widths do not match the lengths of the labels
    assigned at the same positions (position 0 cuts four characters but is
    labelled '일') — confirm the intended field order against a live response.

    Args:
        aptdata: Iterable of records as returned by ``getaptdata``.

    Returns:
        pandas.DataFrame with one row per record.

    Raises:
        IndexError: If a record has fewer than eleven fields.
    """
    suffix_widths = (4, 4, 1, 3, 3, 1, 1, 4, 2, 4, 1)
    columns = ['일', '거래금액', '법정동', '년', '월', '건축년도',
               '전용면적', '아파트', '지번', '층', '지역코드']
    rows = [
        [record[pos][:-width] for pos, width in enumerate(suffix_widths)]
        for record in aptdata
    ]
    return pd.DataFrame(rows, columns=columns)
284+
285+
286+
# Region codes iterated by the price-index crawl.
ilocCodes = [
    "11000", "26000", "27000", "28000", "29000", "30000", "31000", "41000",
    "42000", "43000", "44000", "45000", "46000", "47000", "48000", "49000",
]

# All codes (iterated by the trade/rent crawl), grouped by two-digit prefix.
clocCodes = [
    # 11xxx
    "11110", "11140", "11170", "11200", "11215", "11230", "11260", "11290",
    "11305", "11320", "11350", "11380", "11410", "11440", "11470", "11500",
    "11530", "11545", "11560", "11590", "11620", "11650", "11680", "11710",
    "11740",
    # 26xxx
    "26110", "26140", "26170", "26200", "26230", "26260", "26290", "26320",
    "26350", "26380", "26410", "26440", "26470", "26500", "26530", "26710",
    # 27xxx
    "27110", "27140", "27170", "27200", "27230", "27260", "27290", "27710",
    # 28xxx
    "28110", "28140", "28170", "28185", "28200", "28237", "28245", "28260",
    "28710", "28720",
    # 29xxx
    "29110", "29140", "29155", "29170", "29200",
    # 30xxx
    "30110", "30140", "30170", "30200", "30230",
    # 31xxx
    "31110", "31140", "31170", "31200", "31710",
    # 36xxx
    "36110",
    # 41xxx
    "41111", "41113", "41115", "41117", "41131", "41133", "41135", "41150",
    "41171", "41173", "41190", "41210", "41220", "41250", "41271", "41273",
    "41281", "41285", "41287", "41290", "41310", "41360", "41370", "41390",
    "41410", "41430", "41450", "41461", "41463", "41465", "41480", "41500",
    "41550", "41570", "41590", "41610", "41630", "41650", "41670", "41800",
    "41820", "41830",
    # 42xxx
    "42110", "42130", "42150", "42170", "42190", "42210", "42230", "42720",
    "42730", "42750", "42760", "42770", "42780", "42790", "42800", "42810",
    "42820", "42830",
    # 43xxx
    "43111", "43112", "43113", "43114", "43130", "43150", "43720", "43730",
    "43740", "43745", "43750", "43760", "43770", "43800",
    # 44xxx
    "44131", "44133", "44150", "44180", "44200", "44210", "44230", "44250",
    "44270", "44710", "44760", "44770", "44790", "44800", "44810", "44825",
    # 45xxx
    "45111", "45113", "45130", "45140", "45180", "45190", "45210", "45710",
    "45720", "45730", "45740", "45750", "45770", "45790", "45800",
    # 46xxx
    "46110", "46130", "46150", "46170", "46230", "46710", "46720", "46730",
    "46770", "46780", "46790", "46800", "46810", "46820", "46830", "46840",
    "46860", "46870", "46880", "46890", "46900", "46910",
    # 47xxx
    "47111", "47113", "47130", "47150", "47170", "47190", "47210", "47230",
    "47250", "47280", "47290", "47720", "47730", "47750", "47760", "47770",
    "47820", "47830", "47840", "47850", "47900", "47920", "47930", "47940",
    # 48xxx
    "48121", "48123", "48125", "48127", "48129", "48170", "48220", "48240",
    "48250", "48270", "48310", "48330", "48720", "48730", "48740", "48820",
    "48840", "48850", "48860", "48870", "48880", "48890",
    # 50xxx
    "50110", "50130",
]

# Default crawl range: every month of 2006 through 2017, as strings.
years = [str(year) for year in range(2006, 2018)]
months = ["%02d" % month for month in range(1, 13)]
293+
def _crawl_trades(conn, key, target, code, yearmonths, pause=0):
    """Fetch and store the records of one region for each given period."""
    for ym in yearmonths:
        aptdata = getaptdata(code, ym, key, target)
        updateTable(aptdata, conn, target)
        if pause:
            time.sleep(pause)


# Command-line driver.
#   target code period   crawl one region for one month (YYYYMM) or year (YYYY)
#   target code          crawl one region for every month in `years`
#   target               crawl every region for the months listed below; for
#                        "*index" targets crawl the index API instead
# Host, credentials, database name and service key are placeholders that have
# to be filled in before running.
conn = pymysql.connect(host='localhost', user='userid', password='userpwd',
                       db='dbname', charset='utf8')
try:
    print(len(sys.argv))
    key = "add key here"

    if len(sys.argv) == 4:
        target = sys.argv[1]
        code = sys.argv[2]
        period = sys.argv[3]

        if len(period) == 6:
            print("get data by yearmn")
            _crawl_trades(conn, key, target, code, [period])
        elif len(period) == 4:
            print("get data by year")
            _crawl_trades(conn, key, target, code,
                          [str(period) + m for m in months])
        else:
            print("usage: code period")
    elif len(sys.argv) == 3:
        target = sys.argv[1]
        code = sys.argv[2]
        _crawl_trades(conn, key, target, code,
                      [str(y) + m for y in years for m in months])
    elif len(sys.argv) == 2:
        target = sys.argv[1]
        print(target)
        # With only a target given, the crawl is limited to these months.
        years = ["2017"]
        months = ["10", "11", "12"]
        locCodes = clocCodes
        if "index" in target:
            if target == "rentindex":
                ilocCodes = ["A2000", "11000", "11A01", "11A02", "41000",
                             "28000", "26000", "27000", "29000", "30000",
                             "31000"]

            # The index API is called with its own key. It was re-assigned on
            # every loop iteration in the original; hoisted here.
            # NOTE(review): "H**D" looks like a masked value — replace it with
            # the real key before running.
            index_key = "H**D"
            for code in ilocCodes:
                for y in years:
                    aptdata = getaptdata(code, y, index_key, target)
                    updateIndexTable(aptdata, conn, target, code)
                    time.sleep(5)
        else:
            for code in locCodes:
                for y in years:
                    # One-second pause after every month, plus one per year.
                    _crawl_trades(conn, key, target, code,
                                  [str(y) + m for m in months], pause=1)
                    time.sleep(1)
    else:
        print("usage: target code period(Ym or Y) / target code / target ")
finally:
    # Close the connection even when a crawl step raises.
    conn.close()

0 commit comments

Comments
 (0)