You can use PHP's Goutte library or Python's BeautifulSoup4 library. Both let you locate elements with CSS selectors (Goutte also accepts XPath expressions), so pick whichever you are more comfortable with.
Here are some simple examples to get started.
PHP Goutte:
// Load Composer's autoloader so the Goutte\Client class can be resolved.
require_once 'vendor/autoload.php';
use Goutte\Client;
// NOTE: $url is not defined in this snippet; set it to the page address before running.
$client = new Client();
// Issue a GET request for $url; $resp holds whatever request() returns
// (presumably a DomCrawler Crawler over the fetched page - confirm against the Goutte docs).
$resp = $client->request('GET', $url);
// Iterate over every node matched by the CSS selector passed to filter().
foreach ($resp->filter(' your css selector here') as $li) {
// your logic here
}
Python BeautifulSoup example:
import random
import time

import requests
from bs4 import BeautifulSoup
# Timeout passed to every requests.get() call made by tryAgain.
timeout_time = 30
def tryAgain(passed_url, max_retries=None, retry_delay=20):
    """Download ``passed_url`` and return the response body, retrying on failure.

    Every attempt sends a randomly chosen entry of the module-level ``header``
    list as the request headers and uses the module-level ``timeout_time`` as
    the request timeout.

    Args:
        passed_url: URL to fetch.
        max_retries: Maximum number of retries after the first failed attempt.
            ``None`` (the default) keeps retrying until a request succeeds.
        retry_delay: Seconds to sleep after a failed retry before the next
            one. The first retry happens immediately, without sleeping.

    Returns:
        The ``.text`` of the first successful response.

    Raises:
        requests.exceptions.RequestException: The last request error, raised
            only when ``max_retries`` is set and has been exhausted.
    """
    retries = 0
    while True:
        if retries:
            print("Trying again the URL:")
            print(passed_url)
        try:
            page = requests.get(
                passed_url,
                headers=random.choice(header),
                timeout=timeout_time,
            ).text
        except requests.exceptions.RequestException:
            # Only request failures are retried; programming errors (for
            # example an empty ``header`` list) propagate instead of being
            # retried forever.
            if max_retries is not None and retries >= max_retries:
                raise
            if retries:
                # Back off between consecutive retries; the very first retry
                # is issued right away.
                time.sleep(retry_delay)
            retries += 1
            continue
        if retries:
            print("-------------------------------------")
            print("---- URL was successfully scraped ---")
            print("-------------------------------------")
        return page
# User-Agent strings to choose from; each becomes one headers dict in ``header``.
_USER_AGENTS = (
    "Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20100101 Firefox/14.0.1",
    "Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201",
    "Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25",
)

# List of request-header dicts; tryAgain picks one at random per request.
header = [{"User-Agent": agent} for agent in _USER_AGENTS]
# Page to scrape; replace the placeholder with a real URL.
main_url = " your URL here "
# Download the page HTML via tryAgain and parse it with the built-in parser.
main_page_html = tryAgain(main_url)
main_page_soup = BeautifulSoup(main_page_html, "html.parser")
# For every element matched by the outer selector, print the text of the
# first element matched by the nested selector.
for a in main_page_soup.select(' css selector here '):
    matches = a.select(' your css selector here ')
    # Skip elements with no nested match instead of raising IndexError.
    if matches:
        # A parenthesised single-argument print runs on Python 2 and Python 3.
        print(matches[0].text)