Change extraction method to use the API and prepare package for v2.1 release

clarencecastillo · Apr 8, 2020 · 5095b52 · 5095b52
1 parent a0c9f0f
commit 5095b52
Show file tree

Hide file tree

Showing 15 changed files with 368 additions and 83 deletions.
diff --git a/.gitignore b/.gitignore
@@ -3,5 +3,7 @@ __pycache__/
 *$py.class
 
 /src/lib
+/src/workflow
+/venv
 
 .python-version
diff --git a/.python-version b/.python-version
diff --git a/LICENSE b/LICENSE
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2019 Clarence Castillo
+Copyright (c) 2020 Clarence Castillo
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

diff --git a/Powerthesaurus-2.0.alfredworkflow b/Powerthesaurus-2.0.alfredworkflow
diff --git a/Powerthesaurus-2.1.0.alfredworkflow b/Powerthesaurus-2.1.0.alfredworkflow
diff --git a/README.md b/README.md
@@ -1,6 +1,6 @@
 # Powerthesaurus Search for Alfred #
 
-Search for synonyms and antonyms on [Powerthesaurus.org](https://www.powerthesaurus.org) from [Alfred 3 & 4](https://www.alfredapp.com/).
+Search for synonyms and antonyms on [Powerthesaurus.org](https://www.powerthesaurus.org) from [Alfred 4](https://www.alfredapp.com/).
 
 ![](demo.gif "")
 

diff --git a/init.sh b/init.sh
@@ -0,0 +1,5 @@
+#!/bin/sh
+# Installs the workflow's Python dependencies into the source tree.
+# Safe to run repeatedly.
+
+# Alfred-Workflow is installed directly into src/.
+pip install --target=src Alfred-Workflow --upgrade
+# Remove pip's *.dist-info metadata directories; -f keeps this step from
+# failing when the glob matches nothing.
+rm -rf src/*.dist-info
+# -p: do not fail when src/lib is already present from an earlier run.
+mkdir -p src/lib
+# Everything listed in requirements.txt is installed into src/lib.
+pip install -r src/requirements.txt --target=src/lib --upgrade
diff --git a/src/api.py b/src/api.py
@@ -1,38 +1,120 @@
-from bs4 import BeautifulSoup
 import requests
 import logging
 import json
+import os
 
 class PowerThesaurus:
 
-    def __init__(self, api_url, logger=logging):
+    USER_AGENT = "Alfred-Powerthesaurus/2.1.0"
+    GQL_THESAURUS_QUERY = "thesaurus_query"
+    GQL_SEARCH_QUERY = "search_query"
+
+    def __init__(self, api_url, web_url, gql_queries_dir="./gql_queries/", pos_file_path="./pos.json", logger=logging):
         self.api_url = api_url
+        self.web_url = web_url
         self.logger = logger
+        self.gql_queries = self.read_gql_queries(gql_queries_dir)
+        self.pos_mapping = self.read_pos_mapping(pos_file_path)
+        self.request_headers = self.build_request_headers()
+
+    def build_url(self, slug, query_type):
+        """Return ``<web_url>/<slug>/<query_type>`` for the given term slug."""
+        segments = (self.web_url, slug, query_type)
+        return '{}/{}/{}'.format(*segments)
 
-    def parse_term(self, term_data):
+    def build_request_headers(self):
         return {
-            'term': term_data['term'],
-            'topics': [t['topic'] for t in term_data['topics']],
-            'rating': int(term_data['rating']),
-            'parts': [p['short_name'] for p in term_data['parts']]
+            "user-agent": PowerThesaurus.USER_AGENT,
+            "content-type": "application/json"
         }
 
-    def extract_terms(self, page_text):
-        soup = BeautifulSoup(page_text, 'html.parser')
-        script = soup.find('script', src='').getText().strip().split('\n')[0]
-        data = json.loads(script[script.find('{'):-1])
+    def read_pos_mapping(self, file_path):
+        """Load the part-of-speech list and index it by id.
+
+        The file at ``file_path`` must hold a JSON array of objects that each
+        carry an ``id`` key. Returns a dict mapping every ``id`` to its full
+        object; when an id repeats, the last occurrence wins.
+        """
+        with open(file_path, 'r') as handle:
+            entries = json.load(handle)
+        return {entry['id']: entry for entry in entries}
 
-        if 'list' not in data:
-            return []
+    def read_gql_queries(self, dir):
+        """Load the GraphQL query files found directly inside ``dir``.
+
+        Returns a dict mapping each file name without its extension (for
+        example ``search_query`` for ``search_query.txt``) to the file's text.
+
+        Entries that are not regular files, and hidden files whose name starts
+        with a dot (such as ``.DS_Store``), are skipped: ``os.listdir`` returns
+        them alongside the query files, and reading them either raises or adds
+        junk keys.
+        """
+        gql_queries = {}
+        for filename in os.listdir(dir):
+            file_path = os.path.join(dir, filename)
+            if filename.startswith('.') or not os.path.isfile(file_path):
+                continue
+            # key is the file name without its extension
+            key = os.path.splitext(filename)[0]
+            with open(file_path, 'r') as file:
+                gql_queries[key] = file.read()
+        return gql_queries
 
-        return data['list']['pages'][0]['terms']
+    def build_search_query_params(self, query):
+        """Build the JSON body for the ``SEARCH_QUERY`` GraphQL operation.
+
+        ``query`` is passed through as the operation's ``query`` variable; the
+        query text itself is looked up in ``self.gql_queries``.
+        """
+        variables = {"query": query}
+        gql_text = self.gql_queries[PowerThesaurus.GQL_SEARCH_QUERY]
+        return dict(operationName="SEARCH_QUERY", variables=variables, query=gql_text)
 
-    def build_search_url(self, word, query_type):
-        return '{}/{}/{}'.format(self.api_url, word.replace(' ', '_'), query_type)
+    def build_thesaurus_query_params(self, term_id, query_type):
+        """Build the JSON body for the ``THESAURUSES_QUERY`` GraphQL operation.
+
+        ``term_id`` becomes the ``termID`` variable and ``query_type`` is
+        upper-cased into the ``list`` variable. Sorting is fixed to ``RATING``
+        descending, and ``limit`` and ``first`` are both fixed at 50.
+        """
+        sort_order = {"field": "RATING", "direction": "DESC"}
+        variables = {
+            "list": query_type.upper(),
+            "termID": term_id,
+            "sort": sort_order,
+            "limit": 50,
+            "syllables": None,
+            "query": None,
+            "posID": None,
+            "first": 50,
+            "after": "",
+        }
+        gql_text = self.gql_queries[PowerThesaurus.GQL_THESAURUS_QUERY]
+        return {
+            "operationName": "THESAURUSES_QUERY",
+            "variables": variables,
+            "query": gql_text,
+        }
 
-    def search(self, word, query_type):
-        r = requests.get(self.build_search_url(word, query_type), headers={'user-agent': 'alfred-powerthesaurus/2.0'})
-        self.logger.debug('response : {} {}'.format(r.status_code, r.url))
+    def parse_thesaurus_query_response(self, response):
+        """Flatten a ``THESAURUSES_QUERY`` response into a list of term dicts.
+
+        ``response`` is the decoded JSON body; its entries are read from
+        ``response['data']['thesauruses']['edges']``. Each returned dict has
+        the keys ``id``, ``word``, ``slug``, ``parts_of_speech``, ``tags``,
+        ``synonyms_count``, ``antonyms_count``, ``rating``, ``url_synonyms``
+        and ``url_antonyms``.
+
+        Part-of-speech ids are translated to their ``shorter`` label through
+        ``self.pos_mapping``; an id missing from the mapping raises
+        ``KeyError``.
+
+        Real lists are returned (not ``map`` objects) so the result can be
+        indexed, iterated more than once and serialised on Python 3 as well
+        as Python 2.
+        """
+        parsed = []
+        for edge in response['data']['thesauruses']['edges']:
+            node = edge['node']
+            target = node['targetTerm']
+            relations = node['relations']
+            counters = target['counters']
+            slug = target['slug']
+            parsed.append({
+                'id': target['id'],
+                'word': target['name'],
+                'slug': slug,
+                'parts_of_speech': [
+                    self.pos_mapping[pos_id]['shorter']
+                    for pos_id in relations['parts_of_speech']
+                ],
+                'tags': relations['tags'],
+                'synonyms_count': counters['synonyms'],
+                'antonyms_count': counters['antonyms'],
+                'rating': node['rating'],
+                'url_synonyms': self.build_url(slug, 'synonyms'),
+                'url_antonyms': self.build_url(slug, 'antonyms'),
+            })
+        return parsed
+
+    def thesaurus_query(self, term_id, query_type):
+        """POST the thesaurus query for ``term_id`` and return the parsed terms.
+
+        A falsy ``term_id`` short-circuits to an empty list without making a
+        request. ``raise_for_status`` is called on the response before it is
+        parsed, so an HTTP error status raises instead of returning.
+        """
+        if term_id:
+            payload = self.build_thesaurus_query_params(term_id, query_type)
+            response = requests.post(self.api_url, json=payload, headers=self.request_headers)
+            self.logger.debug('thesaurus_query: {} {}'.format(response.status_code, response.url))
+            response.raise_for_status()
+            return self.parse_thesaurus_query_response(response.json())
+        return []
+
+    def parse_search_query_response(self, response):
+        """Extract the matched terms from a ``SEARCH_QUERY`` response.
+
+        ``response`` is the decoded JSON body; terms are read from
+        ``response['data']['search']['terms']``. Returns a list of
+        ``{'id': ..., 'word': ...}`` dicts in response order.
+
+        A real list is returned (not a ``map`` object) so callers can test it
+        for emptiness and index it on Python 3 as well as Python 2.
+        """
+        return [
+            {'id': term['id'], 'word': term['name']}
+            for term in response['data']['search']['terms']
+        ]
+
+    def search_query(self, query):
+        params = self.build_search_query_params(query)
+        r = requests.post(self.api_url, json=params, headers=self.request_headers)
+        self.logger.debug('search_query: {} {}'.format(r.status_code, r.url))
         r.raise_for_status()
-        data = self.extract_terms(r.text)
-        return [self.parse_term(t) for t in data]
+        return self.parse_search_query_response(r.json())
+
+    def search_query_match(self, query):
+        """Return the first search result only if its word equals ``query``.
+
+        Returns the first term dict from ``search_query(query)`` when its
+        ``word`` is exactly ``query``; returns ``None`` when there are no
+        results or the first result is a different word.
+        """
+        # Materialise the result first: a lazy iterator is always truthy and
+        # cannot be indexed, which would break both checks below.
+        terms = list(self.search_query(query))
+        if not terms:
+            return None
+        first = terms[0]
+        return first if first['word'] == query else None
diff --git a/src/gql_queries/search_query.txt b/src/gql_queries/search_query.txt
@@ -0,0 +1,8 @@
+# Calls the `search` field with the required $query string and selects the
+# id and name of every term it returns.
+query SEARCH_QUERY($query: String!) {
+  search(query: $query) {
+    terms {
+      id
+      name
+    }
+  }
+}
diff --git a/src/gql_queries/thesaurus_query.txt b/src/gql_queries/thesaurus_query.txt
@@ -0,0 +1,39 @@
+# Calls the `thesauruses` field for the term $termID.
+# Required variables: $termID, $list and $sort. The cursor arguments
+# ($after/$first/$before/$last) and the filters ($tagID, $posID, $syllables)
+# are optional.
+# Selects totalCount, pageInfo and, for each edge, the node's target term
+# (id, name, slug, counters), relations, rating and vote data, plus the cursor.
+query THESAURUSES_QUERY($after: String, $first: Int, $before: String, $last: Int, $termID: ID!, $list: List!, $sort: ThesaurusSorting!, $tagID: Int, $posID: Int, $syllables: Int) {
+  thesauruses(termId: $termID, sort: $sort, list: $list, after: $after, first: $first, before: $before, last: $last, tagId: $tagID, partOfSpeechId: $posID, syllables: $syllables) {
+    totalCount
+    pageInfo {
+      hasNextPage
+      hasPreviousPage
+      startCursor
+      endCursor
+      __typename
+    }
+    edges {
+      node {
+        _type
+        id
+        isPinned
+        targetTerm {
+          id
+          name
+          slug
+          counters
+          isFavorite
+          __typename
+          }
+        relations
+        rating
+        vote {
+          voteType
+          id
+          __typename
+          }
+        votes
+        __typename
+      }
+      cursor
+      __typename
+      }
+    __typename
+  }
+}