WSMA Lab Manual 2
Course Objectives: Exposure to various web and social media analytic techniques.
Course Outcomes:
1. Knowledge of decision support systems.
2. Apply natural language processing concepts on text analytics.
3. Understand sentiment analysis.
4. Knowledge of search engine optimization and web analytics.
List of Experiments
1. Preprocessing a text document using NLTK in Python
a. Stopword elimination
b. Stemming
c. Lemmatization
d. POS tagging
e. Lexical analysis
2. Sentiment analysis on customer reviews of products
3. Web analytics
a. Web usage data (web server log data, clickstream analysis)
b. Hyperlink data
4. Search engine optimization: implement spamdexing
5. Use Google Analytics tools to implement the following
a. Conversion Statistics
b. Visitor Profiles
6. Use Google Analytics tools to analyze Traffic Sources.
Resources:
1. Stanford CoreNLP package
2. google.com/analytics
TEXT BOOKS:
1. Ramesh Sharda, Dursun Delen, Efraim Turban, "Business Intelligence and Analytics: Systems for Decision Support", Pearson Education.
REFERENCE BOOKS:
1. Rajiv Sabherwal, Irma Becerra-Fernandez, "Business Intelligence: Practices, Technologies, and Management", John Wiley, 2011.
2. Larissa T. Moss, Shaku Atre, "Business Intelligence Roadmap", Addison-Wesley Information Technology Series.
3. Yuli Vasiliev, "Oracle Business Intelligence: The Condensed Guide to Analysis and Reporting", SPD/Shroff, 2012.
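The NLTK experiments below depend on corpora and models that are installed separately from the library itself. A one-time setup sketch (the identifiers are NLTK's standard resource names):

import nltk

# One-time downloads for the experiments in this manual
for resource in ['stopwords',                  # stopword elimination (1a)
                 'punkt',                      # word_tokenize, used throughout
                 'wordnet', 'omw-1.4',         # lemmatization (1c)
                 'averaged_perceptron_tagger', # POS tagging (1d, 1e)
                 'vader_lexicon']:             # sentiment analysis (2)
    nltk.download(resource)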
1. Preprocessing a text document using NLTK in Python
a. Stopword elimination
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def stopword_elimination(text):
    # Tokenize first: iterating over a raw string would yield single characters
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    filtered_words = [word for word in words if word not in stop_words]
    return filtered_words

if __name__ == '__main__':
    text = "This is a sample text with stopwords."
    filtered_words = stopword_elimination(text)
    print(filtered_words)
Output
['This', 'sample', 'text', 'stopwords', '.']
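Note that 'This' survives because NLTK's stopword list is all lowercase. A minimal case-insensitive variant lowercases each token before the membership test:

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))
words = word_tokenize("This is a sample text with stopwords.")
# Compare the lowercased token, so 'This' is dropped along with 'is', 'a' and 'with'
filtered = [w for w in words if w.lower() not in stop_words]
print(filtered)  # ['sample', 'text', 'stopwords', '.']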
b. Stemming
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

def stemming(text):
    # Stem each token with the Porter algorithm (it lowercases by default)
    stemmer = PorterStemmer()
    stemmed_words = []
    for word in word_tokenize(text):
        stemmed_words.append(stemmer.stem(word))
    return stemmed_words

if __name__ == '__main__':
    text = "This is a sample text with stemming."
    stemmed_words = stemming(text)
    print(stemmed_words)
Output
python stemming.py
['thi', 'is', 'a', 'sampl', 'text', 'with', 'stem', '.']
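NLTK ships several stemmers, and their aggressiveness differs. A short comparison sketch of Porter against the newer Snowball ("Porter2") stemmer, which usually produces cleaner stems (e.g. 'fairly' becomes 'fair' rather than 'fairli'):

from nltk.stem import PorterStemmer, SnowballStemmer

porter = PorterStemmer()
snowball = SnowballStemmer('english')
for word in ['running', 'generously', 'fairly']:
    # Print both stems side by side to compare the two algorithms
    print(word, '->', porter.stem(word), '/', snowball.stem(word))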
c. Lemmatization
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

def lemmatization(text):
    # Look up each token's lemma in WordNet (tokens are treated as nouns by default)
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = []
    for word in word_tokenize(text):
        lemmatized_words.append(lemmatizer.lemmatize(word))
    return lemmatized_words

if __name__ == '__main__':
    text = "This is a sample text with lemmatization."
    lemmatized_words = lemmatization(text)
    print(lemmatized_words)
Output
python lemmatization.py
['This', 'is', 'a', 'sample', 'text', 'with', 'lemmatization', '.'] (unchanged, because every token is already in its base form when looked up as a noun)
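The WordNetLemmatizer treats every token as a noun unless told otherwise, which is why the sample sentence comes back unchanged. Passing a part of speech makes the difference visible (requires the 'wordnet' and 'omw-1.4' downloads):

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize('running'))           # 'running' (looked up as a noun)
print(lemmatizer.lemmatize('running', pos='v'))  # 'run'
print(lemmatizer.lemmatize('better', pos='a'))   # 'good'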
d. POS tagging
import nltk
from nltk.tokenize import word_tokenize

def pos_tagging(text):
    # pos_tag expects a list of tokens, not a raw string
    tokens = word_tokenize(text)
    tagged_words = nltk.pos_tag(tokens)
    return tagged_words

if __name__ == '__main__':
    text = "This is a sample text with POS tagging."
    tagged_words = pos_tagging(text)
    print(tagged_words)
Output
python pos_tagging.py
[('This', 'DT'), ('is', 'VBZ'), ('a', 'DT'), ('sample', 'NN'), ('text', 'NN'), ('with', 'IN'), ('POS', 'NN'), ('tagging', 'VBG'), ('.', '.')] (exact tags can vary with the tagger model)
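The Penn Treebank tags above can be hard to read; NLTK can map them onto the coarser universal tagset (NOUN, VERB, DET, ...). A minimal sketch, which additionally requires nltk.download('universal_tagset'):

import nltk

tokens = nltk.word_tokenize("This is a sample text with POS tagging.")
# Same tagger, but the fine-grained tags are mapped to ~12 universal categories
print(nltk.pos_tag(tokens, tagset='universal'))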
e. Lexical analysis
import nltk

def lexical_analysis(text):
    # Lexical analysis here = breaking text into tokens, then tagging each one
    tokens = nltk.word_tokenize(text)
    tagged_tokens = nltk.pos_tag(tokens)
    return tagged_tokens

if __name__ == '__main__':
    text = "This is a sample text with lexical analysis."
    tagged_tokens = lexical_analysis(text)
    print(tagged_tokens)
Output
python lexical_analysis.py
[('This', 'DT'), ('is', 'VBZ'), ('a', 'DT'), ('sample', 'NN'), ('text', 'NN'), ('with', 'IN'), ('lexical', 'JJ'), ('analysis', 'NN'), ('.', '.')] (exact tags can vary with the tagger model)
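A common next step in lexical analysis is a frequency profile of the tokens. A minimal sketch using NLTK's FreqDist:

import nltk

text = "This is a sample text. This text is a sample."
tokens = nltk.word_tokenize(text.lower())
# Count every token and show the three most frequent ones
fdist = nltk.FreqDist(tokens)
print(fdist.most_common(3))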
2. Sentiment analysis on customer reviews of products
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

def sentiment_analysis(text):
    # VADER returns negative, neutral, positive and compound scores in one dict
    analyzer = SentimentIntensityAnalyzer()
    sentiment = analyzer.polarity_scores(text)
    return sentiment

if __name__ == '__main__':
    text = "This is a sample text with positive sentiment."
    sentiment = sentiment_analysis(text)
    print(sentiment)
Output
python sentiment_analysis.py
A score dictionary of the form {'neg': ..., 'neu': ..., 'pos': ..., 'compound': ...}; 'compound' lies in [-1, 1], and the exact values depend on the VADER lexicon version.
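For the experiment proper, the same analyzer can label a batch of customer reviews. A minimal sketch using the conventional VADER cutoffs (compound >= 0.05 positive, <= -0.05 negative); the review strings are made-up sample data:

from nltk.sentiment.vader import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()
reviews = [
    "The product quality is excellent and delivery was fast.",
    "Terrible battery life, very disappointed.",
    "It arrived on Tuesday.",
]
for review in reviews:
    compound = analyzer.polarity_scores(review)['compound']
    # Conventional thresholds for turning the compound score into a label
    label = ('positive' if compound >= 0.05
             else 'negative' if compound <= -0.05
             else 'neutral')
    print(f'{label:8s} {compound:+.3f}  {review}')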
3. Web analytics
a. Web usage data (web server log data, clickstream analysis)
import pandas as pd

def web_usage_analysis(log_file):
    # Assumes the log has been exported to CSV with columns named
    # 'ip', 'timestamp' and 'url' (adjust the names to match your file)
    log_data = pd.read_csv(log_file)
    # Analyze the data: overall traffic volume and unique visitors
    print('Total requests :', len(log_data))
    print('Unique visitors:', log_data['ip'].nunique())
    # Print the results of a simple clickstream summary: most requested pages
    print(log_data['url'].value_counts().head(10))

if __name__ == '__main__':
    log_file = 'web_log.csv'
    web_usage_analysis(log_file)
Output
python web_usage_analysis.py
The output depends on the data in the log file: the total request count, the number of distinct visitor IPs, and the ten most requested URLs.
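Raw web server logs are usually plain text rather than CSV. A hedged sketch that parses Apache Common Log Format lines into a DataFrame before running the same analysis (the file name 'access.log' is a placeholder):

import re
import pandas as pd

# Common Log Format: host ident authuser [timestamp] "request" status bytes
LOG_PATTERN = re.compile(
    r'(?P<ip>\S+) \S+ \S+ \[(?P<timestamp>[^\]]+)\] '
    r'"(?P<method>\S+) (?P<url>\S+) [^"]*" (?P<status>\d{3}) (?P<bytes>\S+)'
)

def parse_log(path):
    rows = []
    with open(path) as f:
        for line in f:
            match = LOG_PATTERN.match(line)
            if match:
                rows.append(match.groupdict())
    return pd.DataFrame(rows)

df = parse_log('access.log')
print(df['url'].value_counts().head(10))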
b. Hyperlink data
import requests
import bs4

def hyperlink_analysis(url):
    response = requests.get(url)
    soup = bs4.BeautifulSoup(response.content, 'html.parser')
    links = soup.find_all('a')
    # Analyze the links: count how often each target URL appears
    link_counts = {}
    for link in links:
        href = link.get('href')  # some anchors carry no href attribute
        if href is None:
            continue
        if href not in link_counts:
            link_counts[href] = 0
        link_counts[href] += 1
    # Print the results
    for href, count in link_counts.items():
        print(f'{href}: {count}')

if __name__ == '__main__':
    url = 'https://www.google.com/'
    hyperlink_analysis(url)
Output
The output depends on the page at the URL you specify: one line per distinct link target, followed by the number of anchors pointing at it.
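Hyperlink data is usually analyzed as a directed graph, the classic example being PageRank. A minimal power-iteration sketch over a hypothetical three-page link graph (damping factor 0.85, no dangling pages):

# Each page maps to the pages it links out to (hypothetical site)
graph = {
    'home':     ['products', 'about'],
    'products': ['home'],
    'about':    ['home', 'products'],
}
damping = 0.85
n = len(graph)
rank = {page: 1.0 / n for page in graph}

for _ in range(50):  # power iteration until practically converged
    new_rank = {page: (1 - damping) / n for page in graph}
    for page, outlinks in graph.items():
        share = damping * rank[page] / len(outlinks)
        for target in outlinks:
            new_rank[target] += share
    rank = new_rank

for page, score in sorted(rank.items(), key=lambda kv: -kv[1]):
    print(f'{page}: {score:.3f}')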
4. Search engine optimization: implement spamdexing
import nltk

def spamdexing(text):
    # Keyword stuffing: drop stopwords, then pad the page text with repeated keywords
    stopwords = nltk.corpus.stopwords.words('english')
    keywords = ['keyword1', 'keyword2', 'keyword3']
    filtered_text = [word for word in nltk.word_tokenize(text) if word not in stopwords]
    for keyword in keywords:
        # extend with ten separate copies; keyword * 10 would build one long string
        filtered_text.extend([keyword] * 10)
    return filtered_text

if __name__ == '__main__':
    text = "This is a sample text with stopwords."
    filtered_text = spamdexing(text)
    print(filtered_text)
Output
['This', 'sample', 'text', 'stopwords', '.',
'keyword1', 'keyword1', 'keyword1', 'keyword1', 'keyword1', 'keyword1', 'keyword1', 'keyword1', 'keyword1', 'keyword1',
'keyword2', 'keyword2', 'keyword2', 'keyword2', 'keyword2', 'keyword2', 'keyword2', 'keyword2', 'keyword2', 'keyword2',
'keyword3', 'keyword3', 'keyword3', 'keyword3', 'keyword3', 'keyword3', 'keyword3', 'keyword3', 'keyword3', 'keyword3']
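Search engines detect this kind of stuffing through, among other signals, keyword density. A minimal detector sketch (the 20% threshold is an arbitrary illustration, not a published cutoff):

from collections import Counter

def keyword_density(tokens):
    # Share of all tokens taken up by each distinct term
    counts = Counter(token.lower() for token in tokens)
    total = sum(counts.values())
    return {term: count / total for term, count in counts.items()}

tokens = ['This', 'sample', 'text'] + ['keyword1'] * 10
for term, share in sorted(keyword_density(tokens).items(), key=lambda kv: -kv[1]):
    flag = '  <-- suspiciously dense' if share > 0.20 else ''
    print(f'{term}: {share:.0%}{flag}')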
5. Use Google Analytics tools to implement the following
a. Conversion Statistics
import requests

def get_conversion_data(view_id, access_token):
    # Core Reporting API (v3) endpoint; every request needs an OAuth 2.0 token
    url = 'https://www.googleapis.com/analytics/v3/data/ga'
    params = {
        'ids': f'ga:{view_id}',              # f-string, so the view ID is substituted
        'start-date': '2023-01-01',
        'end-date': '2023-08-01',
        'metrics': 'ga:goalCompletionsAll',  # goal completions stand in for conversions in v3
        'dimensions': 'ga:date',
        'samplingLevel': 'HIGHER_PRECISION'
    }
    headers = {'Authorization': f'Bearer {access_token}'}
    response = requests.get(url, params=params, headers=headers)
    return response.json()

if __name__ == '__main__':
    view_id = '1234567890'
    access_token = '<OAuth 2.0 access token>'
    conversion_data = get_conversion_data(view_id, access_token)
    print(conversion_data)
Output
python conversion_tracking.py
The output depends on the data in the Analytics view: a JSON report whose rows pair each date in the range with its goal completions.
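To turn that report into a single statistic, the rows can be totaled client-side. A sketch assuming the v3 response layout, where each row is a list of strings ordered as dimensions followed by metrics:

def total_conversions(report):
    # Row layout assumed: [date, goal completions]
    total = 0
    for row in report.get('rows', []):
        total += int(row[1])
    return total

# Hand-made sample in the v3 row format, just to exercise the function
sample = {'rows': [['20230101', '4'], ['20230102', '7']]}
print(total_conversions(sample))  # 11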
b. Visitor Profiles
import requests

def get_visitor_profile(profile_id, access_token):
    # The v3 'ids' field accepts a single view (profile) ID per request,
    # so multiple profiles are fetched one at a time
    url = 'https://www.googleapis.com/analytics/v3/data/ga'
    params = {
        'ids': f'ga:{profile_id}',
        'start-date': '2023-01-01',
        'end-date': '2023-08-01',
        'metrics': 'ga:sessions,ga:bounceRate,ga:pageviews',
        'dimensions': 'ga:source,ga:medium,ga:deviceCategory',
        'samplingLevel': 'HIGHER_PRECISION'
    }
    headers = {'Authorization': f'Bearer {access_token}'}
    response = requests.get(url, params=params, headers=headers)
    return response.json()

if __name__ == '__main__':
    access_token = '<OAuth 2.0 access token>'
    for profile_id in ['1234567890', '1234567891']:
        visitor_profile = get_visitor_profile(profile_id, access_token)
        print(visitor_profile)
Output
python visitor_profiles.py
One JSON report per view: rows keyed by source, medium and device category, with sessions, bounce rate and pageviews for each combination.
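The raw v3 rows are bare lists, so pairing them with the report's columnHeaders makes them readable. A sketch over a hand-made sample response:

def print_report(report):
    # Label each row's values with the corresponding column names
    headers = [h['name'] for h in report.get('columnHeaders', [])]
    for row in report.get('rows', []):
        print(dict(zip(headers, row)))

sample = {
    'columnHeaders': [{'name': 'ga:source'}, {'name': 'ga:medium'},
                      {'name': 'ga:deviceCategory'}, {'name': 'ga:sessions'},
                      {'name': 'ga:bounceRate'}, {'name': 'ga:pageviews'}],
    'rows': [['google', 'organic', 'mobile', '120', '41.5', '300']],
}
print_report(sample)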
6. Use Google Analytics tools to analyze Traffic Sources
import requests

def get_traffic_sources(profile_id, access_token):
    # Sessions broken down by traffic source and medium
    url = 'https://www.googleapis.com/analytics/v3/data/ga'
    params = {
        'ids': f'ga:{profile_id}',
        'start-date': '2023-01-01',
        'end-date': '2023-08-01',
        'metrics': 'ga:sessions',
        'dimensions': 'ga:source,ga:medium',
        'samplingLevel': 'HIGHER_PRECISION'
    }
    headers = {'Authorization': f'Bearer {access_token}'}
    response = requests.get(url, params=params, headers=headers)
    return response.json()

if __name__ == '__main__':
    profile_id = '1234567890'
    access_token = '<OAuth 2.0 access token>'
    traffic_sources = get_traffic_sources(profile_id, access_token)
    print(traffic_sources)
Output
python traffic_sources.py
A report of the following shape (values illustrative, simplified from the raw API response):
{
  "rows": [
    {"ga:sessions": 100, "ga:source": "google", "ga:medium": "organic"},
    {"ga:sessions": 50, "ga:source": "facebook", "ga:medium": "social"},
    {"ga:sessions": 20, "ga:source": "twitter", "ga:medium": "social"},
    {"ga:sessions": 10, "ga:source": "direct", "ga:medium": "none"}
  ]
}
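A natural follow-up is to aggregate the report by medium. A sketch that works directly on the sample structure shown above:

report = {
    "rows": [
        {"ga:sessions": 100, "ga:source": "google", "ga:medium": "organic"},
        {"ga:sessions": 50, "ga:source": "facebook", "ga:medium": "social"},
        {"ga:sessions": 20, "ga:source": "twitter", "ga:medium": "social"},
        {"ga:sessions": 10, "ga:source": "direct", "ga:medium": "none"},
    ]
}

# Total sessions per medium: organic 100, social 70, none 10
totals = {}
for row in report['rows']:
    medium = row['ga:medium']
    totals[medium] = totals.get(medium, 0) + row['ga:sessions']

for medium, sessions in sorted(totals.items(), key=lambda kv: -kv[1]):
    print(f'{medium}: {sessions}')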