# 优化适合密度python计算程序-5118营销头条

# www.5118.com 适合密度计算python程序

import requests

import chardet

import urllib3

import os

import json

import jieba

import re

from bs4 import BeautifulSoup

urllib3.disable_warnings()

from flask import request

from gevent import pywsgi

from flask import Flask

# Flask application object; the /midu route below is registered on it and it
# is handed to the gevent WSGI server in the __main__ block.
app = Flask(__name__)

# Seconds to wait on any single outbound HTTP request before giving up, so a
# stalled remote site cannot hang the request handler indefinitely.
_REQUEST_TIMEOUT = 10

# Only pages whose density (in percent) is below this value are averaged.
_MAX_PAGE_DENSITY = 5

# Patterns used to reduce a fetched page to its bare body text. They are
# compiled at import time so the view works both when this file is run as a
# script and when it is imported by a WSGI server. ``\b`` keeps ``<a`` from
# also matching tags such as ``<article>`` or ``<aside>``.
_BODY_RE = re.compile(r'<body\b[^>]*>(.*)</body>', re.S | re.I)
_ANCHOR_RE = re.compile(r'<a\b[^>]*>.*?</a>', re.S | re.I)
_SCRIPT_RE = re.compile(r'<script\b[^>]*>.*?</script>', re.S | re.I)
_STYLE_RE = re.compile(r'<style\b[^>]*>.*?</style>', re.S | re.I)
_TAG_RE = re.compile(r'<[^>]+>')
_WHITESPACE_RE = re.compile(r'\s+')


def _decode(raw):
    """Decode *raw* bytes with chardet's guessed encoding (UTF-8 fallback).

    Undecodable bytes are dropped rather than raising, because scraped pages
    frequently contain a few invalid sequences.
    """
    encoding = chardet.detect(raw)['encoding'] or 'utf-8'
    return str(raw, encoding, errors='ignore')


def _extract_body_text(page_html):
    """Return the text of ``<body>`` with links, scripts, styles, tags and
    all whitespace removed, or ``''`` when the page has no ``<body>``."""
    match = _BODY_RE.search(page_html)
    if match is None:
        return ''
    text = match.group(1)
    # Order matters: whole <a>/<script>/<style> elements (including their
    # content) are dropped before the remaining bare tags are stripped.
    for pattern in (_ANCHOR_RE, _SCRIPT_RE, _STYLE_RE, _TAG_RE,
                    _WHITESPACE_RE):
        text = pattern.sub('', text)
    return text


def _page_density(redirect_url, keyword_len):
    """Return the density percentage for the page behind a search-result link.

    The link is expected to answer a HEAD request with a ``Location`` header
    pointing at the real page. Returns ``None`` when there is no redirect
    target, the page does not answer 200, or no body text can be extracted.

    NOTE(review): the value is the keyword's token count divided by the
    page's token count; it does not count occurrences of the keyword in the
    page. Confirm this is the intended metric.
    """
    # TLS verification is disabled on purpose: arbitrary result sites often
    # have broken certificates and only their text is read.
    head = requests.head(redirect_url, verify=False, timeout=_REQUEST_TIMEOUT)
    target = head.headers.get('Location')
    if target is None:
        return None
    page = requests.get(target, verify=False, timeout=_REQUEST_TIMEOUT)
    if page.status_code != 200:
        return None
    body_text = _extract_body_text(_decode(page.content))
    if not body_text:
        return None
    body_words = jieba.lcut(body_text, cut_all=False)
    return round(keyword_len / float(len(body_words)) * 100, 1)


@app.route('/midu', methods=['POST'])
def relatewords():
    """Estimate the average keyword density of Baidu's top results.

    Reads the keyword from the ``k`` request value, fetches up to 50 Baidu
    results for it, computes a density percentage for each result page and
    averages the pages whose density is below ``_MAX_PAGE_DENSITY``.

    Returns:
        A JSON string ``{"density": "<value>%"}`` on success, a plain-text
        message when nothing usable was found or Baidu could not be read,
        or a 400 response when ``k`` is missing or blank.
    """
    # ``str(None)`` would otherwise turn a missing parameter into a search
    # for the literal word "None".
    keyword = (request.values.get('k') or '').strip()
    if not keyword:
        return '缺少关键词参数 k', 400

    try:
        # ``params`` URL-encodes the keyword; concatenating it into the URL
        # breaks on characters such as '&', '#' or '+'.
        response = requests.get(
            'http://www.baidu.com/s',
            params={'wd': keyword, 'rn': 50},
            timeout=_REQUEST_TIMEOUT,
        )
        soup = BeautifulSoup(_decode(response.content), 'html.parser')
        items = soup.select('h3 a')
        if not items:
            return '没有找到任何搜索结果'

        keyword_len = len(jieba.lcut(keyword, cut_all=False))
        densities = []
        for item in items:
            href = item.attrs.get('href', '')
            if 'http://' not in href and 'https://' not in href:
                continue
            try:
                density = _page_density(href, keyword_len)
            except Exception:
                # Best effort: one unreachable or malformed page must not
                # fail the whole request.
                app.logger.debug('skipping result %s', href, exc_info=True)
                continue
            if density is not None and density < _MAX_PAGE_DENSITY:
                app.logger.debug('density %s for %s', density, href)
                densities.append(density)

        if not densities:
            # No page qualified; avoid dividing by zero and reporting that
            # as a Baidu read error.
            return '没有找到任何搜索结果'

        average = round(sum(densities) / len(densities), 1)
        return json.dumps({"density": str(average) + "%"}, ensure_ascii=False)
    except Exception:
        # Request boundary: report the failure to the client but keep the
        # traceback in the server log instead of discarding it.
        app.logger.exception('failed to read Baidu results for %r', keyword)
        return '读取百度发生错误'

# Patterns used by the /midu view to strip a fetched page down to its text.
# They are defined at module level (not inside the __main__ guard) so the
# view can find them when this module is imported by a WSGI server instead
# of being run as a script.
# ``\b`` keeps the anchor pattern from also matching <article>, <aside>, etc.
aReg = re.compile(r'<a\b[^>]*>([\s|\S]*?)</a>', re.S)
styleReg = re.compile(r'<style[^>]*>([\s|\S]*?)</style>', re.S)
scriptReg = re.compile(r'<script[^>]*>([\s|\S]*?)</script>', re.S)
# Greedy on purpose: captures everything up to the last </body>.
bodyReg = re.compile(r'<body[^>]*>([\s|\S]*)</body>', re.S)
# Any remaining tag.
clearReg = re.compile(r'<[^>]+>', re.S)
# Every run of whitespace (substituting '' removes all of it).
clearReg1 = re.compile(r'\s*', re.S)

if __name__ == '__main__':
    # Optional custom jieba dictionary, currently disabled:
    # rootPath = os.path.dirname(os.path.realpath(__file__))
    # userDictPath = os.path.join(rootPath, 'jiebadic.csv')
    # jieba.load_userdict(userDictPath)

    # Load the jieba dictionary up front so the first request does not pay
    # the initialization cost.
    jieba.initialize()
    print('模型加载完毕，建立服务器')

    # Serve the Flask app with gevent on all interfaces, port 5162.
    server = pywsgi.WSGIServer(('0.0.0.0', 5162), app)
    server.serve_forever()

# 系统通知