# www.5118.com 适合密度计算python程序
import requests
import chardet
import urllib3
import os
import json
import jieba
import re
from bs4 import BeautifulSoup
urllib3.disable_warnings()
from flask import request
from gevent import pywsgi
from flask import Flask
# Flask application object; the '/midu' route handler is registered on it.
app = Flask(__name__)
# Upper bound, in seconds, on every outbound HTTP request made by the handler.
# Without it a single unresponsive host would block the request indefinitely.
_REQUEST_TIMEOUT = 10


def _decode_html(raw):
    """Decode raw response bytes to text using chardet's detected encoding.

    Falls back to UTF-8 when chardet cannot name an encoding, and drops
    undecodable bytes instead of raising.
    """
    encoding = chardet.detect(raw)['encoding'] or 'utf-8'
    return str(raw, encoding, errors='ignore')


def _visible_text(page_html):
    """Return the <body> content of *page_html* with markup and whitespace removed.

    Relies on the module-level patterns bodyReg, aReg, scriptReg, styleReg,
    clearReg and clearReg1. Returns '' when the page has no <body> element.
    """
    match = bodyReg.search(page_html)
    if match is None:
        return ''
    text = match.group(1)
    # Order matters: whole <a>/<script>/<style> elements are removed together
    # with their content first, then any leftover tags, then all whitespace.
    for pattern in (aReg, scriptReg, styleReg, clearReg, clearReg1):
        text = pattern.sub('', text)
    return text.strip()


def _page_density(redirect_url, keyword_len):
    """Fetch one search result and return its density percentage.

    The value is ``keyword_len`` divided by the number of jieba tokens in the
    page's visible text, times 100, rounded to one decimal place.

    Returns None when the redirect link has no Location header, the target
    page does not answer with status 200, or the page has no visible text.
    Network and decoding errors propagate to the caller.
    """
    # requests.head() does not follow redirects, so the Location header
    # exposes the real target behind the search engine's redirect link.
    head = requests.head(redirect_url, verify=False, timeout=_REQUEST_TIMEOUT)
    target = head.headers.get('Location')
    if not target:
        return None
    page = requests.get(target, verify=False, timeout=_REQUEST_TIMEOUT)
    if page.status_code != 200:
        return None
    text = _visible_text(_decode_html(page.content))
    if not text:
        return None
    body_words = jieba.lcut(text, cut_all=False)
    return round(keyword_len / float(len(body_words)) * 100, 1)


@app.route('/midu', methods=['Post'])
def relatewords():
    """Handle POST /midu: average keyword density over Baidu results for ``k``.

    Reads the keyword from the ``k`` request value, queries Baidu for up to 50
    results, measures each result page with ``_page_density`` and averages the
    values that are below 5.

    Returns:
        A JSON string ``{"density": "<value>%"}`` on success, otherwise one of
        two plain-text messages (no results / error while reading Baidu).

    NOTE(review): the per-page value is the keyword's token count divided by
    the page's token count; it does not count keyword occurrences in the page.
    Confirm this is the intended definition of density.
    """
    # A missing parameter must not be turned into the literal string 'None'
    # and searched for; an empty keyword cannot produce results either.
    keyword = (request.values.get('k') or '').strip()
    if not keyword:
        return '没有找到任何搜索结果'

    try:
        # Passing the keyword via params lets requests percent-encode it, so
        # '&', '#', spaces and non-ASCII text no longer corrupt the query.
        response = requests.get(
            'http://www.baidu.com/s',
            params={'wd': keyword, 'rn': 50},
            timeout=_REQUEST_TIMEOUT,
        )
        soup = BeautifulSoup(_decode_html(response.content), 'html.parser')
        items = soup.select('h3 a')
        if not items:
            return '没有找到任何搜索结果'

        keyword_len = len(jieba.lcut(keyword, cut_all=False))
        density_total = 0
        density_count = 0
        for item in items:
            # Anchors without an href are skipped instead of aborting the
            # whole request with a KeyError.
            href = item.get('href', '')
            if not href.startswith(('http://', 'https://')):
                continue
            try:
                density = _page_density(href, keyword_len)
            except Exception:
                # Best effort: one unreachable or malformed page must not
                # fail the whole calculation.
                continue
            # Only pages with a density below 5 contribute to the average.
            if density is not None and density < 5:
                print(density)
                density_total += density
                density_count += 1

        if density_count == 0:
            # No page could be measured. This case used to surface as a
            # swallowed ZeroDivisionError with this same response; the text
            # is kept unchanged for existing clients.
            return '读取百度发生错误'

        average = round(density_total / density_count, 1)
        return json.dumps({"density": str(average) + "%"}, ensure_ascii=False)
    except Exception:
        # Top-level boundary: the endpoint always answers with a message
        # rather than an HTTP 500. SystemExit/KeyboardInterrupt still propagate.
        return '读取百度发生错误'
# Patterns the '/midu' handler uses to reduce a fetched page to plain text.
# They are compiled at import time rather than under the __main__ guard, so
# the handler also finds them when this module is imported by another WSGI
# server instead of being run as a script.
aReg = re.compile(r'<a[^>]*>([\s|\S]*?)</a>', re.S)              # <a> elements with content
styleReg = re.compile(r'<style[^>]*>([\s|\S]*?)</style>', re.S)    # <style> elements with content
scriptReg = re.compile(r'<script[^>]*>([\s|\S]*?)</script>', re.S)  # <script> elements with content
bodyReg = re.compile(r'<body[^>]*>([\s|\S]*)</body>', re.S)        # group 1: everything inside <body>
clearReg = re.compile(r'<[^>]+>', re.S)                            # any remaining tag
clearReg1 = re.compile(r'\s*', re.S)                               # whitespace runs

if __name__ == '__main__':
    # Optional custom jieba dictionary, currently disabled:
    # rootPath = os.path.dirname(os.path.realpath(__file__))
    # userDictPath = os.path.join(rootPath, 'jiebadic.csv')
    # jieba.load_userdict(userDictPath)
    # Initialise jieba before the server starts accepting requests.
    jieba.initialize()
    print('模型加载完毕,建立服务器')
    # Serve the Flask app with gevent's WSGI server on all interfaces, port 5162.
    server = pywsgi.WSGIServer(('0.0.0.0', 5162), app)
    server.serve_forever()