Five Minutes of Python (Part 1): Getting Started with Scraping Images, Text, Video, and Audio

Reader submission 832 2025-03-31

Scraping the weather forecast and storing it in a database

#!/usr/bin/python
# -*- coding: utf-8 -*-
import pymysql
import requests
from bs4 import BeautifulSoup

# Open the MySQL connection that the scraper writes into
db = pymysql.connect(
    host='localhost',
    port=3306,
    user='root',
    password='root',
    database='mysql',
    use_unicode=True,
    charset='utf8'
)
cursor = db.cursor()

def downdata(url):
    hd = {
        'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}
    req = requests.get(url, headers=hd)
    # req.encoding = 'utf-8'
    soup = BeautifulSoup(req.text, 'html.parser')
    # Each <li class="ndays-item png-fix cf"> holds one day's forecast
    da_new = soup.find_all('li', class_='ndays-item png-fix cf')
    for da in da_new:
        day = da.find('div', class_='td td2').find('p', class_='p1')
        week = da.find('div', class_='td td2').find('p', class_='p2')
        wd = da.find('div', class_='td td5').find('p', class_='p1')
        fl = da.find('div', class_='td td5').find('p', class_='p2')
        f2 = da.find('div', class_='td td3').find('div')['title']
        print('Date ' + day.text + ', weekday ' + week.text + ', temperature ' + wd.text +
              ', wind ' + fl.text + ', weather ' + f2)
        sql = "INSERT INTO tianiq(day1, week1, wd, fl, air) VALUES ('%s','%s','%s','%s','%s')" % \
              (day.text, week.text, wd.text, fl.text, f2)
        print(sql)
        cursor.execute(sql)
        db.commit()

downdata('http://tianqi.sogou.com/shenyang/15/')
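The INSERT above splices the scraped strings into the SQL with %-formatting, so a stray quote in the page text breaks the statement and the query is open to SQL injection. A minimal safer sketch using pymysql's parameterized execute (same tianiq table and loop variables as above):

# Let pymysql escape the values instead of formatting them into the string
sql = "INSERT INTO tianiq(day1, week1, wd, fl, air) VALUES (%s, %s, %s, %s, %s)"
cursor.execute(sql, (day.text, week.text, wd.text, fl.text, f2))
db.commit()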

Scraping comics

#!/usr/bin/python
# -*- coding: UTF-8 -*-
import re
import urllib.request

def gethtml(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
    req = urllib.request.Request(url=url, headers=headers)
    html = urllib.request.urlopen(req).read()
    return html

def getimg(html):
    # Match every src="....jpg" attribute on the page
    reg = r'src="(.*?\.jpg)"'
    img = re.compile(reg)
    html = html.decode('utf-8')  # Python 3: decode the bytes before matching
    imglist = re.findall(img, html)
    x = 0
    for imgurl in imglist:
        # Save each image as D:0.jpg, D:1.jpg, ...
        urllib.request.urlretrieve(imgurl, 'D:%s.jpg' % x)
        x = x + 1

html = gethtml("http://www.tuku.cc/")
getimg(html)
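One caveat: the regex captures exactly what sits inside src="...", and on many pages that is a relative path, which urlretrieve cannot fetch on its own. A minimal sketch of resolving a match against the page URL with urllib.parse.urljoin (the relative path below is a made-up example):

from urllib.parse import urljoin

base = 'http://www.tuku.cc/'
imgurl = urljoin(base, '/upload/cover/1.jpg')  # hypothetical relative src value
print(imgurl)  # http://www.tuku.cc/upload/cover/1.jpg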

Using the database

#!/usr/bin/python
# -*- coding: UTF-8 -*-
import pymysql

# Open the database connection
db = pymysql.connect(host="localhost", user="root", password="root", database="mysql")
# Obtain a cursor with cursor()
cursor = db.cursor()
# SQL INSERT statement
sql = "INSERT INTO tianiq(day1, \
       week1, wd, fl, air) \
       VALUES ('Mac', 'Mohan', 'M', 'M', 'M')"
try:
    # Execute the SQL statement
    cursor.execute(sql)
    # Commit the transaction
    db.commit()
    print("insert ok")
except:
    # Roll back on error
    db.rollback()
# Close the database connection
db.close()
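To confirm that the inserts actually landed, the same cursor API can read the rows back. A minimal sketch, assuming the tianiq table used above:

import pymysql

db = pymysql.connect(host="localhost", user="root", password="root", database="mysql")
cursor = db.cursor()
cursor.execute("SELECT day1, week1, wd, fl, air FROM tianiq")
for row in cursor.fetchall():  # each row comes back as a tuple of column values
    print(row)
db.close()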

Scraping video

#!/usr/bin/python
# -*- coding: UTF-8 -*-
import requests

def download(url):
    dz = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'}
    # .content is the raw bytes of the response body
    req = requests.get(url, headers=dz).content
    with open('qq.mp4', 'wb') as fp:
        fp.write(req)

download('http://video.study.163.com/edu-video/nos/mp4/2017/04/01/1006064693_cc2842f7dc8b410c96018ec618f37ef6_sd.mp4?ak=d2e3a054a6a144f3d98805f49b4f04439064ce920ba6837d89a32d0b0294ad3c1729b01fa6a0b5a3442ba46f5001b48b1ee2fb6240fc719e1b3940ed872a11f180acad2d0d7744336d03591c3586614af455d97e99102a49b825836de913910ef0837682774232610f0d4e39d8436cb9a153bdeea4a2bfbae357803dfb6768a742fe395e87eba0c3e30b7b64ef1be06585111bf60ea26d5dad1f891edd9e94a8e167e0b04144490499ffe31e0d97a0a1babcbd7d2e007d850cc3bf7aa697e8ff')
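requests.get(url, headers=dz).content pulls the whole video into memory before anything is written to disk. For large files, a streamed download is gentler; a minimal sketch with the same requests API (the file name and chunk size are arbitrary choices):

import requests

def download_stream(url, filename='qq.mp4'):
    hd = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'}
    # stream=True defers the body; iter_content yields it chunk by chunk
    with requests.get(url, headers=hd, stream=True) as r:
        with open(filename, 'wb') as fp:
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                fp.write(chunk)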

Scraping audio

#!/usr/bin/python
# -*- coding: UTF-8 -*-
import json
import requests

def download(url):
    hd = {
        'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}
    req = requests.get(url, headers=hd)
    reps = req.text
    # The album endpoint returns JSON; the track list sits under data.tracksAudioPlay
    result = json.loads(reps)
    datap = result['data']['tracksAudioPlay']
    for index in datap:
        title = index['trackName']
        print(index['src'])
        data = requests.get(index['src'], headers=hd).content
        try:
            with open('%s.mp3' % title, 'wb') as f:
                f.write(data)
        except BaseException:
            print('failed to save %s' % title)

download('http://www.ximalaya.com/revision/play/album?albumId=7371372&pageNum=1&sort=-1&pageSize=30')
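The try/except above guards the file write; one common failure is a track name containing characters that are not legal in a file name (for example '/' or '?'). A minimal sketch that sanitizes the title before saving (the underscore replacement is an arbitrary choice):

import re

def safe_filename(title):
    # Replace characters that Windows and most filesystems reject in file names
    return re.sub(r'[\\/:*?"<>|]', '_', title)

# usage inside the loop above:
# with open('%s.mp3' % safe_filename(title), 'wb') as f:
#     f.write(data)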

Scraping text

#!/usr/bin/python
# -*- coding: UTF-8 -*-
import requests
from bs4 import BeautifulSoup

def get_h(url):
    response = requests.get(url)
    response.encoding = 'utf-8'
    return response.text

def get_c(html):
    soup = BeautifulSoup(html, 'html.parser')
    # Take the text of the first <div class="content"> on the page
    joke_content = soup.select('div.content')[0].getText()
    return joke_content

url_joke = "https://www.qiushibaike.com"
html = get_h(url_joke)
joke_content = get_c(html)
print(joke_content)
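soup.select('div.content')[0] only returns the first joke on the page. A minimal sketch that collects every match instead (same BeautifulSoup usage, hypothetical helper name):

def get_all(html):
    soup = BeautifulSoup(html, 'html.parser')
    # One entry per <div class="content"> found on the page
    return [div.getText().strip() for div in soup.select('div.content')]

# jokes = get_all(get_h(url_joke))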

Scraping images

#!/usr/bin/python
# -*- coding: UTF-8 -*-
import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}
url = 'http://www.ivsky.com/'
start_html = requests.get(url, headers=headers)
Soup = BeautifulSoup(start_html.text, 'html.parser')
# Each <div class="syl_pic"> on the front page links to an album
all_div = Soup.find_all('div', class_='syl_pic')
for lsd in all_div:
    lsds = 'http://www.ivsky.com' + lsd.find('a')['href']
    title = lsd.find('a').get_text()
    print(lsds)
    html = requests.get(lsds, headers=headers)
    Soup_new = BeautifulSoup(html.text, 'html.parser')
    # Each <div class="il_img"> in the album links to a single picture page
    app = Soup_new.find_all('div', class_='il_img')
    for app_new in app:
        apptwo = 'http://www.ivsky.com' + app_new.find('a')['href']
        htmlthree = requests.get(apptwo, headers=headers)
        Soupthree = BeautifulSoup(htmlthree.text, 'html.parser')
        appthree = Soupthree.find('div', class_='pic')
        appf = appthree.find('img')['src']
        name = appf[-9:-4]
        img = requests.get(appf, headers=headers)
        f = open(name + '.jpg', 'ab')  # media files must be opened in binary mode
        f.write(img.content)           # use .content (bytes) for media files
        f.close()
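name = appf[-9:-4] slices five characters out of the image URL to use as the file name, so different images can easily collide. A minimal sketch that uses the URL's basename instead (same appf and headers variables as above):

import os

file_name = os.path.basename(appf)  # e.g. '.../img/xxxx.jpg' -> 'xxxx.jpg'
with open(file_name, 'wb') as f:    # binary mode for image bytes
    f.write(requests.get(appf, headers=headers).content)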

Scraping a novel

#!/usr/bin/python
# -*- coding: UTF-8 -*-
from urllib import request
from bs4 import BeautifulSoup
import sys

if __name__ == "__main__":
    # Create the output txt file
    file = open('一念永恒.txt', 'w', encoding='utf-8')
    # Table-of-contents URL of the novel
    target_url = 'http://www.biqukan.com/1_1094/'
    # User-Agent
    head = {}
    head['User-Agent'] = 'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19'
    target_req = request.Request(url=target_url, headers=head)
    target_response = request.urlopen(target_req)
    target_html = target_response.read().decode('gbk', 'ignore')
    # Create a BeautifulSoup object
    listmain_soup = BeautifulSoup(target_html, 'html.parser')
    # Search the document tree for every div whose class is "listmain"
    chapters = listmain_soup.find_all('div', class_='listmain')
    # Build another BeautifulSoup object from the result and keep parsing
    download_soup = BeautifulSoup(str(chapters), 'html.parser')
    # Count the chapters
    numbers = (len(download_soup.dl.contents) - 1) / 2 - 8
    index = 1
    # Flag marking where the main-text links start; the "latest chapters" list is skipped
    begin_flag = False
    # Iterate over all children of the dl tag
    for child in download_soup.dl.children:
        # Skip newlines
        if child != '\n':
            # When the main-text section heading is reached, enable the flag
            if child.string == u"《一念永恒》正文卷":
                begin_flag = True
            # Collect the chapter links and download their contents
            if begin_flag == True and child.a != None:
                download_url = "http://www.biqukan.com" + child.a.get('href')
                download_req = request.Request(url=download_url, headers=head)
                download_response = request.urlopen(download_req)
                download_html = download_response.read().decode('gbk', 'ignore')
                download_name = child.string
                soup_texts = BeautifulSoup(download_html, 'html.parser')
                texts = soup_texts.find_all(id='content', class_='showtxt')
                soup_text = BeautifulSoup(str(texts), 'html.parser')
                write_flag = True
                file.write(download_name + '\n\n')
                # Write the scraped chapter to the file, character by character
                for each in soup_text.div.text.replace('\xa0', ''):
                    # Stop writing at the first 'h' (used to cut off the trailing link text)
                    if each == 'h':
                        write_flag = False
                    if write_flag == True and each != ' ':
                        file.write(each)
                    if write_flag == True and each == '\r':
                        file.write('\n')
                file.write('\n\n')
                # Print download progress
                sys.stdout.write("Downloaded: %.3f%%" % (index / numbers * 100) + '\r')
                sys.stdout.flush()
                index += 1
    file.close()
