Monday 1 January 2018

Natural Language Toolkit (NLTK) Python setup for Windows

How to install and configure NLTK on a Windows system. You can follow the steps below:

First of all, check which Python version is installed



If you haven't configured it yet, you can download Python (https://www.python.org/downloads) and install it first from the given path.

Let's start the installation process. Install numpy using the following command



Then, install NLTK using the following command



Now, download the NLTK packages using the command below

>>> import nltk



Once you run the download command, it will open an installer dialog from which you have to select the packages to install. You can select "all" to download all packages.




Once the download is complete, it will show like this



Once all packages are installed, click Close to return to the command prompt.

The prompt will show True

Now, you can verify whether it was successfully installed or not.



Yeah! It works great! You can post your queries or installation issues!

4 comments:

  1. Important Links

    Use for grammar detection
    http://rwet.decontextualize.com/book/textblob/

    Beautiful parse
    https://www.dataquest.io/blog/web-scraping-tutorial-python/

    Beautiful Soup Collecting data
    https://www.dataquest.io/blog/web-scraping-beautifulsoup/

    ReplyDelete
  2. Sample for website scraping

    import urllib3
    import nltk
    from bs4 import BeautifulSoup
    from nltk.collections import *
    import requests
    import dateparser

    prefixes = ["jan.", "feb.", "mar.", "apr.", "may.", "jun.", "jul.", "sept.", "oct.", "nov.", "dec.", "january", "february", "march", "april", "may", "june", "july", "august", "september", "october", "november", "december"]

    #Fucaton will check wheter it's month valid data or not eg. "Feb. 1:"
    #Return 1 = success, 0 = fail
    def check_start_with_syntax(p_tags_text):
    if p_tags_text.lower().startswith(tuple(prefixes)):
    return 1
    else:
    return 0

    #Fuction will parse the date e.g Feb.1, July 29/30
    #return start date yyyy/mm/dd hh:mm:ss end date yyyy/mm/dd hh:mm:ss
    def prase_event_date(p_tags_text):
    event_parse_dt = 'un-formated'
    event_parse_dt = p_tags_text[0:p_tags_text.index(':')]
    #Check for start date & end date
    #Let check date with this formate 29/30
    if(event_parse_dt.find('/')==-1):
    return dateparser.parse(event_parse_dt), dateparser.parse(event_parse_dt)
    else:
    event_parse_month_temp = str(event_parse_dt[0:p_tags_text.index(' ')]).strip()
    event_parse_days_temp = str(event_parse_dt.replace(event_parse_month_temp,'')).strip()
    event_parse_day_split = event_parse_days_temp.split('/')
    return dateparser.parse(str(event_parse_month_temp+' '+event_parse_day_split[0])), dateparser.parse(str(event_parse_month_temp+' '+event_parse_day_split[1]))


    url = "https://calendar.html"
    request = requests.get(url)

    #This will print
    #print(request)

    #This will print the status code >> 200
    #print(request.status_code)

    reponse_data = request.text
    soup = BeautifulSoup(reponse_data,"html.parser")

    #Get full article
    article_containers = soup.find_all('div',class_ = 'article-content')
    print(len(article_containers))

    #Get all praragraph
    article_all_paragraph = soup.find_all('p')
    print(len(article_all_paragraph))

    #Find all P
    for p_tags in soup.find_all('p'):
    #result = check_start_with_syntax(p_tags.get_text())
    if(check_start_with_syntax(str(p_tags.get_text()).strip())==1):
    print('\n')
    start_date, end_date = prase_event_date(str(p_tags.get_text()).strip())
    print(start_date)
    print(end_date)
    print(p_tags.get_text())
    else:
    #Nothing to do
    nothing = 'nothing to do'















    ReplyDelete
  3. import urllib3
    import nltk
    from bs4 import BeautifulSoup
    from nltk.collections import *
    import requests
    import dateparser

    prefixes = ["jan.", "feb.", "mar.", "apr.", "may.", "jun.", "jul.", "sept.", "oct.", "nov.", "dec.", "january", "february", "march", "april", "may", "june", "july", "august", "september", "october", "november", "december"]

    #Fucaton will check wheter it's month valid data or not eg. "Feb. 1:"
    #Return 1 = success, 0 = fail
    def check_start_with_syntax(p_tags_text):
    if p_tags_text.lower().startswith(tuple(prefixes)):
    return 1
    else:
    return 0

    #Fuction will parse the date e.g Feb.1, July 29/30
    #return start date yyyy/mm/dd hh:mm:ss end date yyyy/mm/dd hh:mm:ss
    def prase_event_date(p_tags_text):
    event_parse_dt = 'un-formated'
    print(p_tags_text)
    if(p_tags_text.find(':')==-1):
    return dateparser.parse(event_parse_dt), dateparser.parse(event_parse_dt)
    else:
    event_parse_dt = p_tags_text[0:p_tags_text.find(':')]
    #Check for start date & end date
    #Let check date with this formate 29/30
    if(event_parse_dt.find('/')==-1):
    return dateparser.parse(event_parse_dt), dateparser.parse(event_parse_dt)
    else:
    event_parse_month_temp = str(event_parse_dt[0:p_tags_text.index(' ')]).strip()
    event_parse_days_temp = str(event_parse_dt.replace(event_parse_month_temp,'')).strip()
    event_parse_day_split = event_parse_days_temp.split('/')
    return dateparser.parse(str(event_parse_month_temp+' '+event_parse_day_split[0])), dateparser.parse(str(event_parse_month_temp+' '+event_parse_day_split[1]))





    #Let code begin
    filepath = 'InputText.txt'
    with open(filepath) as fp:
    line = fp.readline()
    while line:
    line = fp.readline()
    if(len(line)!=0):
    #print(line.strip())
    if(check_start_with_syntax(str(line).strip())==1):
    print('\n')
    start_date, end_date = prase_event_date(str(line).strip())
    print(start_date)
    print(end_date)
    print(str(line).strip())
    else:
    #Nothing to do
    nothing = 'nothing to do'



    url = "https://www.space.com/32286-space-calendar.html"
    request = requests.get(url)

    #This will print
    print(request)

    #This will print the status code >> 200
    #print(request.status_code)

    reponse_data = request.text
    soup = BeautifulSoup(reponse_data,"html.parser")

    #Get full article
    article_containers = soup.find_all('div',class_ = 'article-content')
    print(len(article_containers))

    #Get all praragraph
    article_all_paragraph = soup.find_all('p')
    print(len(article_all_paragraph))

    #Find all P
    #for p_tags in soup.find_all('p'):
    # #result = check_start_with_syntax(p_tags.get_text())
    # if(check_start_with_syntax(str(p_tags.get_text()).strip())==1):
    # print('\n')
    # start_date, end_date = prase_event_date(str(p_tags.get_text()).strip())
    # print(start_date)
    # print(end_date)
    # print(p_tags.get_text())
    # else:
    # #Nothing to do
    # nothing = 'nothing to do'















    ReplyDelete
  4. How to parse words

    import nltk
    from nltk.util import ngrams
    from collections import Counter
    from itertools import chain

    wordSet="gmt a.m at"
    n = 1
    ngrams1= ngrams(wordSet.split(" "), n)

    #Let code begin
    filepath = 'InputText.txt'
    with open(filepath) as fp:
    line = fp.readline()
    while line:
    line = fp.readline()
    if(len(line)!=0):
    #print(line.strip())
    ngrams2= ngrams(line.strip().split(" "), n)
    counter= Counter(chain(ngrams2,ngrams1))
    print([k[0] for k,v in counter.items() if v>1])















    ReplyDelete