001/**
002 * Copyright 2015 Tampere University of Technology, Pori Department
003 * 
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 * 
008 *   http://www.apache.org/licenses/LICENSE-2.0
009 * 
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016package service.tut.pori.fuzzyvisuals;
017
018import java.io.Closeable;
019import java.io.IOException;
020import java.io.InputStream;
021import java.nio.charset.Charset;
022import java.util.ArrayList;
023import java.util.HashSet;
024import java.util.List;
025import java.util.Set;
026
027import org.apache.commons.io.IOUtils;
028import org.apache.commons.lang3.ArrayUtils;
029import org.apache.commons.lang3.StringUtils;
030import org.apache.http.StatusLine;
031import org.apache.http.client.entity.UrlEncodedFormEntity;
032import org.apache.http.client.methods.CloseableHttpResponse;
033import org.apache.http.client.methods.HttpGet;
034import org.apache.http.client.methods.HttpPost;
035import org.apache.http.impl.client.CloseableHttpClient;
036import org.apache.http.impl.client.HttpClients;
037import org.apache.http.message.BasicNameValuePair;
038import org.apache.log4j.Logger;
039
040import core.tut.pori.context.ServiceInitializer;
041import core.tut.pori.utils.HTTPHeaderUtil;
042
043/**
044 * Fuzzy content analyzer.
045 * 
046 * Note that this uses Google's Translation API scraped from the web page (<a href="https://translate.google.com/">Google Translator</a>).
047 * Using the results for ANY official or commercial use is most likely against <a href="http://www.google.com/intl/en/policies/terms/">Google's Terms of Use</a>.
048 * 
049 * The code provided in this class is provided ONLY for testing purposes. See <a href="https://cloud.google.com/translate/docs">Translate API</a> for examples and documentation.
050 * 
051 * Note: this class is NOT thread-safe.
052 * 
053 */
054public class FuzzyAnalyzer implements Closeable {
055  private static final int CONTENT_BUFFER_SIZE = 1000;
056  private static final String GOOGLE_TRANSLATE_URL = "https://translate.google.com/translate_a/single?client=t&sl=zh-CN&tl=en&hl=en&dt=t&ie=UTF-8&oe=UTF-8"; // for real use one should provide an API key
057  private static final Logger LOGGER = Logger.getLogger(FuzzyAnalyzer.class);
058  private static final String PARAMETER_Q = "q";
059  private CloseableHttpClient _client = null;
060  private Charset _utf16 = null;
061  private Charset _utf8 = null;
062
063  /**
064   * 
065   */
066  public FuzzyAnalyzer(){
067    _client = HttpClients.createDefault();
068    _utf8 = Charset.forName("UTF-8");
069    _utf16 = Charset.forName("UTF-16");
070  }
071
072  /**
073   * 
074   * @param input
075   * @return set of words or null if none was extracted
076   * @throws IllegalArgumentException on bad data
077   */
078  public Set<String> analyze(InputStream input) throws IllegalArgumentException{
079    if(input == null){
080      throw new IllegalArgumentException("Invalid input : null");
081    }
082    
083    try{
084      byte[] array = new byte[CONTENT_BUFFER_SIZE];
085      if(IOUtils.read(input, array) < 0){ // discard the first 1000 characters to get different results for each content, this is generally magic bytes for the file type
086        throw new IllegalArgumentException("File is too small : less than "+(CONTENT_BUFFER_SIZE*2)+" bytes.");
087      }
088
089      IOUtils.read(input, array); // read the actual content
090
091      String data = new String(array, _utf16); // convert to UTF-18 to get Chinese characters of the bytes
092      LOGGER.debug("Converted to "+_utf16.name()+" : "+data);
093
094      HttpPost post = new HttpPost(GOOGLE_TRANSLATE_URL); 
095      List<BasicNameValuePair> parameters = new ArrayList<>(1);
096      parameters.add(new BasicNameValuePair(PARAMETER_Q, data));
097      post.setEntity(new UrlEncodedFormEntity(parameters, _utf8));
098      
099      LOGGER.debug("Calling "+GOOGLE_TRANSLATE_URL);
100      try(CloseableHttpResponse r = _client.execute(post)){
101        StatusLine l = r.getStatusLine();
102        int status = l.getStatusCode();
103        if(status < 200 || status >= 300){
104          throw new IllegalArgumentException("Translation server error : "+status+" "+l.getReasonPhrase());
105        }
106        
107        String[] words = StringUtils.split(IOUtils.toString(r.getEntity().getContent())); // the response is JSON, but we can simple split everything from whitespace
108        if(ArrayUtils.isEmpty(words)){
109          LOGGER.debug("No results.");
110          return null;
111        }
112        
113        HashSet<String> finalWords = new HashSet<>(words.length);
114        for(int i=0;i<words.length;++i){
115          if(words[i].length() > 3 && StringUtils.isAsciiPrintable(words[i]) && StringUtils.isAllLowerCase(words[i])){ // filter out everything not proper English words
116            finalWords.add(words[i]);
117          }
118        }
119        return finalWords;
120      }
121    } catch (IOException ex) {
122      LOGGER.error(ex, ex);
123    }
124    return null;
125  }
126
127  /**
128   * 
129   * @param url
130   * @return set of words or null if none was extracted
131   * @throws IllegalArgumentException on bad data
132   */
133  public Set<String> analyze(String url) throws IllegalArgumentException{
134    HttpGet get = new HttpGet(url);
135    FuzzyProperties fp = ServiceInitializer.getPropertyHandler().getSystemProperties(FuzzyProperties.class);
136    String username = fp.getAuthUsername();
137    if(username != null){ // checking either password or username is OK
138      LOGGER.debug("Using authentication...");
139      HTTPHeaderUtil.setHTTPBasicAuthHeader(get, username, fp.getAuthPassword());
140    }
141    
142    LOGGER.debug("Calling GET "+url);
143    try(CloseableHttpResponse r = _client.execute(get)){
144      StatusLine l = r.getStatusLine();
145      int status = l.getStatusCode();
146      if(status < 200 || status >= 300){
147        throw new IllegalArgumentException("Failed to retrieve file : "+status+" "+l.getReasonPhrase());
148      }
149      
150      return analyze(r.getEntity().getContent());
151    } catch (IOException ex) {
152      LOGGER.error(ex, ex);
153    }
154    return null;
155  }
156
157  @Override
158  public void close() {
159    try {
160      _client.close();
161    } catch (IOException ex) {
162      LOGGER.error(ex, ex);
163    }
164  }
165}